def download_summary_file(self, ftp_server, summary_file_source):
    # Derive the local file name from the remote path.
    index = summary_file_source.rfind('/')
    summary_file = summary_file_source[index + 1:]
    ftp = FtpCli(ftp_server)
    # Download only if the local copy is missing or stale.
    if not ftp.is_up_to_date(summary_file_source, summary_file):
        ftp.get(summary_file_source, summary_file)
    ftp.close()
    return summary_file
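# A hypothetical call, using the RefSeq summary path that appears elsewhere
# in this repository (the method itself does not fix server or path):
#
#   summary = self.download_summary_file(
#       'ftp.ncbi.nlm.nih.gov',
#       '/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt')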
def sync(self, genomes_found, success_file):
    fp = open(genomes_found, 'r', encoding='UTF-8')
    result_fp = open(success_file, 'w')
    for line in fp:
        line = line.rstrip()
        fields = line.split('\t')
        id = fields[0]
        url = fields[20]
        # Strip the scheme, then split into server and path. (The original
        # reassigned server from url twice, discarding the ftp:// strip;
        # chaining the replace calls handles both schemes.)
        server = url.replace('ftp://', '').replace('https://', '')
        index = server.find('/')
        path = server[index:]
        server = server[0:index]
        index = path.rfind('/')
        name = path[index + 1:]
        if name.endswith('gz'):
            gz_file_name = name
            gz_file_path = path
        else:
            # The URL points at an assembly directory; derive the protein
            # FASTA archive name from the directory name.
            gz_file_name = name + '_protein.faa.gz'
            gz_file_path = f'{path}/{gz_file_name}'
        outfile = self.out_dir + '/' + gz_file_name
        ftp = FtpCli(server)
        ftp.sync(gz_file_path, outfile)
        ftp.close()
        # Record the id and the decompressed file name; anchor the regex so
        # only the trailing .gz extension is stripped.
        print(id + '\t' + re.sub(r'\.gz$', '', outfile), file=result_fp, flush=True)
    fp.close()
    result_fp.close()
def __download_file(self, url, debug, file_obtained, id):
    with self.semaphore:
        server = url.replace('ftp://', '')
        index = server.find('/')
        path = server[index:]
        server = server[0:index]
        index = path.rfind('/')
        name = path[index + 1:]
        if name.endswith('gz'):
            gz_file_name = name
            gz_file_path = path
        else:
            gz_file_name = name + '_protein.faa.gz'
            gz_file_path = f'{path}/{gz_file_name}'
        outfile = self.output_folder + '/' + gz_file_name
        print(f'{id}\t{outfile}', flush=True)
        if not debug:
            ftp = FtpCli(server)
            # Fetch only when the local copy is missing or stale.
            if not ftp.is_up_to_date(gz_file_path, outfile):
                ftp.get(gz_file_path, outfile)
            ftp.close()
        # Record the decompressed file name; the anchored regex strips only
        # the trailing .gz extension.
        file_obtained[id] = re.sub(r'\.gz$', '', outfile)
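# __download_file acquires self.semaphore, so several transfers can run in
# parallel without exceeding the connection limit. A minimal sketch of a
# driver method under that assumption; the name download_all, the
# {id: url} dict argument, and the use of threading are illustrative, not
# part of the original class:
def download_all(self, urls, debug=False):
    import threading  # local import for the sketch only
    file_obtained = {}
    threads = [
        threading.Thread(target=self.__download_file,
                         args=(url, debug, file_obtained, id))
        for id, url in urls.items()
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return file_obtained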
#!/usr/bin/env python3
import sys
import argparse
from classes.FtpCli import FtpCli

parser = argparse.ArgumentParser(description='submit FTP command')
parser.add_argument('path', help='file path on the server')
parser.add_argument('--list', action='store_true', help='LIST')
parser.add_argument('--ls', action='store_true', help='ls')
args = parser.parse_args()

path = args.path.replace('ftp://', '')
pos = path.find('/')
server = path[0:pos]
path = path[pos:]
print(f'server: {server}', file=sys.stderr)
print(f'path: {path}', file=sys.stderr)

cli = FtpCli(server)
if args.list:
    print(cli.ftp.retrlines(f'LIST {path}'), file=sys.stderr)
elif args.ls:
    print(cli.ftp.nlst(path))
cli.ftp.close()
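# Example invocations (the script filename ftp_cmd.py is an assumption;
# substitute the actual name from the repository). The server and path are
# the NCBI locations used elsewhere in these scripts:
#
#   ./ftp_cmd.py --list ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/
#   ./ftp_cmd.py --ls   ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/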
import os
import re
import argparse
import subprocess
from classes.FtpCli import FtpCli
from functions.find_gcf_file import find_gcf

parser = argparse.ArgumentParser(
    description='Download genomes from NCBI, according to the organism list in tsv format.')
parser.add_argument('organism_list', help='Organism list in tsv format')
parser.add_argument('-o', '--outdir', default='data', help='Output directory')
args = parser.parse_args()

out_dir = f'{args.outdir}/genomes'
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Mirror the RefSeq assembly summary before looking up organisms.
ftp = FtpCli('ftp.ncbi.nlm.nih.gov')
ftp_dir = '/genomes/ASSEMBLY_REPORTS/'
file_name = 'assembly_summary_refseq.txt'
summary_file = args.outdir + '/' + file_name
ftp.sync(ftp_dir + file_name, summary_file)
ftp.close()

found = find_gcf(args.organism_list, summary_file)
with open(f'{args.outdir}/genomes_found.tsv', 'w') as fp:
    for no in sorted(found.keys(), key=int):
        print(no, found[no], sep='\t', file=fp)


def parse_url(url):
    # Return (server, path, filename) parsed from an FTP/HTTPS URL, using
    # the same split-on-first-slash convention as the scripts above.
    url = url.replace('ftp://', '').replace('https://', '')
    dir_begin = url.find('/')
    server = url[0:dir_begin]
    path = url[dir_begin:]
    filename = path[path.rfind('/') + 1:]
    return server, path, filename
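# A quick sanity check for parse_url; the URL below is illustrative (any
# ftp:// or https:// URL with the same shape parses identically):
#
#   >>> parse_url('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF_000005845.2_ASM584v2')
#   ('ftp.ncbi.nlm.nih.gov', '/genomes/all/GCF_000005845.2_ASM584v2', 'GCF_000005845.2_ASM584v2')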