Example #1
 def download_summary_file(self, ftp_server, summary_file_source):
     # keep only the file name from the remote path
     index = summary_file_source.rfind('/')
     summary_file = summary_file_source[index + 1:]

     # download only if the local copy is missing or outdated
     ftp = FtpCli(ftp_server)
     if not ftp.is_up_to_date(summary_file_source, summary_file):
         ftp.get(summary_file_source, summary_file)
     ftp.close()

     return summary_file
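
Every example here imports FtpCli from classes.FtpCli, but that class itself is never shown. Below is a minimal sketch of what such a wrapper could look like, reconstructed only from the calls the examples make (a constructor taking a server name, is_up_to_date, get, sync, close, and the underlying ftplib.FTP object exposed as .ftp); the real implementation may differ, in particular in how is_up_to_date decides whether a local copy is current.

import os
from ftplib import FTP

class FtpCli:
    """Assumed thin wrapper around ftplib.FTP (not the original source)."""

    def __init__(self, server):
        # anonymous login; the raw ftplib object stays reachable as .ftp,
        # which Example #4 relies on for retrlines() and nlst()
        self.ftp = FTP(server)
        self.ftp.login()
        self.ftp.voidcmd('TYPE I')  # binary mode, needed for SIZE/RETR

    def is_up_to_date(self, remote_path, local_file):
        # assumption: compare sizes; the real class might compare timestamps
        if not os.path.exists(local_file):
            return False
        remote_size = self.ftp.size(remote_path)
        return remote_size is not None and os.path.getsize(local_file) >= remote_size

    def get(self, remote_path, local_file):
        # plain binary download
        with open(local_file, 'wb') as fh:
            self.ftp.retrbinary(f'RETR {remote_path}', fh.write)

    def sync(self, remote_path, local_file):
        # download only when the local copy is missing or outdated
        if not self.is_up_to_date(remote_path, local_file):
            self.get(remote_path, local_file)

    def close(self):
        self.ftp.quit()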
Example #2
 def sync(self, genomes_found, success_file):
     fp = open(genomes_found, 'r', encoding='UTF-8')
     result_fp = open(success_file, 'w')
     for line in fp:
         line = line.rstrip()
         fields = line.split('\t')
         id = fields[0]
         url = fields[20]
         # strip the scheme, then split the URL into server and path
         server = url.replace('ftp://', '')
         server = server.replace('https://', '')
         index = server.find('/')
         path = server[index:]
         server = server[0:index]
         index = path.rfind('/')
         name = path[index + 1:]
         # the URL either points at the .gz file itself or at the
         # assembly directory, in which case the protein FASTA name
         # is derived from the directory name
         if name.endswith('gz'):
             gz_file_name = name
             gz_file_path = path
         else:
             gz_file_name = name + '_protein.faa.gz'
             gz_file_path = f'{path}/{gz_file_name}'
         outfile = self.out_dir + '/' + gz_file_name
         # download only if the local copy is missing or outdated
         ftp = FtpCli(server)
         ftp.sync(gz_file_path, outfile)
         ftp.close()
         # record the id and the path of the uncompressed file
         print(id + '\t' + re.sub(r'\.gz$', '', outfile),
               file=result_fp,
               flush=True)
     fp.close()
     result_fp.close()
Example #3
 def __download_file(self, url, debug, file_obtained, id):
     # the semaphore caps the number of simultaneous FTP connections
     with self.semaphore:
         # strip the scheme, then split the URL into server and path
         server = url.replace('ftp://', '')
         index = server.find('/')
         path = server[index:]
         server = server[0:index]
         index = path.rfind('/')
         name = path[index + 1:]
         # the URL either points at the .gz file itself or at the
         # assembly directory, in which case the protein FASTA name
         # is derived from the directory name
         if name.endswith('gz'):
             gz_file_name = name
             gz_file_path = path
         else:
             gz_file_name = name + '_protein.faa.gz'
             gz_file_path = f'{path}/{gz_file_name}'
         outfile = self.output_folder + '/' + gz_file_name
         print(f'{id}\t{outfile}', flush=True)
         if not debug:
             # download only if the local copy is missing or outdated
             ftp = FtpCli(server)
             if not ftp.is_up_to_date(gz_file_path, outfile):
                 ftp.get(gz_file_path, outfile)
             ftp.close()
         # record the path of the uncompressed file for this id
         file_obtained[id] = re.sub(r'\.gz$', '', outfile)
Example #4
#!/usr/bin/env python3
import sys
import argparse
from classes.FtpCli import FtpCli

parser = argparse.ArgumentParser(description='submit FTP command')
parser.add_argument('path', help='file path on the server')
parser.add_argument('--list', action='store_true', help='LIST')
parser.add_argument('--ls', action='store_true', help='ls')
args = parser.parse_args()

# split ftp://server/path into server and path
path = args.path.replace('ftp://', '')
pos = path.find('/')
server = path[0:pos]
path = path[pos:]

print(f'server: {server}', file=sys.stderr)
print(f'path: {path}', file=sys.stderr)

cli = FtpCli(server)

if args.list:
    print(cli.ftp.retrlines(f'LIST {path}'), file=sys.stderr)
elif args.ls:
    print(cli.ftp.nlst(path))

cli.ftp.close()
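
The script above is a small command-line front end to FtpCli. Assuming it is saved as ftp_cli.py (the source does not give its file name), an invocation against the NCBI assembly reports directory used in Example #5 could look like:

./ftp_cli.py --ls ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/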
Example #5
import os
import re
import argparse
import subprocess
from classes.FtpCli import FtpCli
from functions.find_gcf_file import find_gcf

parser = argparse.ArgumentParser(description='Download genomes from NCBI, according to the organism list in tsv format.')
parser.add_argument('organism_list', help='Organism list in tsv format')
parser.add_argument('-o', '--outdir', default='data', help='Output directory')
args = parser.parse_args()

out_dir = f'{args.outdir}/genomes'
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

ftp = FtpCli('ftp.ncbi.nlm.nih.gov')
ftp_dir = '/genomes/ASSEMBLY_REPORTS/'
file_name = 'assembly_summary_refseq.txt'
summary_file = args.outdir + '/' + file_name
ftp.sync(ftp_dir + file_name, summary_file)
ftp.close()

found = find_gcf(args.organism_list, summary_file)
with open(f'{args.outdir}/genomes_found.tsv', 'w') as fp:
    for no in sorted(found.keys(), key=int):
        print(no, found[no], sep='\t', file=fp)

def parse_url(url):
    # return server, path, filename
    url = url.replace('ftp://', '').replace('https://', '')
    dir_begin = url.find('/')