def _get_from_vfdb_common(self, outprefix, filename, info_text):
    """Download a zip file from the VFDB server and extract sequences + metadata.

    Writes outprefix + '.fa' and outprefix + '.tsv' via vfdb_parser.VfdbParser.

    Parameters:
        outprefix: prefix for the output files (made absolute here).
        filename: name of the zip file on the VFDB download server.
        info_text: unused in this body; kept for interface compatibility
            with callers — TODO confirm whether callers rely on it.

    Raises:
        Error: if the temporary download directory cannot be created.
    """
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.tmp.download'

    try:
        os.mkdir(tmpdir)
    except OSError as e:
        # Catch only mkdir failures; a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide the original cause.
        raise Error('Error mkdir ' + tmpdir) from e

    # 'zip_path' instead of 'zipfile' to avoid shadowing the stdlib module.
    zip_path = os.path.join(tmpdir, filename)
    common.download_file('http://www.mgc.ac.cn/VFs/Down/' + filename, zip_path, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)

    print('Extracting files ... ', end='', flush=True)
    vparser = vfdb_parser.VfdbParser(zip_path, outprefix)
    vparser.run()
    if not self.debug:
        # Keep the temporary directory around in debug mode for inspection.
        common.rmtree(tmpdir)
    print('done')

    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'

    print('Extracted core DNA sequence dataset and metadata. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
def _get_card_versions(self, tmp_file):
    """Scrape the CARD download page for available database versions.

    Downloads the page to tmp_file, collects every tarball link matching
    the expected version pattern, deletes tmp_file, and returns a dict
    mapping (major, minor, patch) int tuples to absolute download URLs.

    Raises:
        Error: if no version links were found on the page.
    """
    print('Getting available CARD versions')
    common.download_file('https://card.mcmaster.ca/download', tmp_file, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
    link_regex = re.compile(r'''href="(/download/.*?broad.*?v([0-9]+\.[0-9]+\.[0-9]+)\.tar\.(gz|bz2))"''')
    versions = {}

    with open(tmp_file) as f:
        for line in f:
            # findall yields one (path, version, extension) tuple per link.
            for path, version_string, _ext in link_regex.findall(line):
                version_key = tuple(int(part) for part in version_string.split('.'))
                versions[version_key] = 'https://card.mcmaster.ca' + path

    if not versions:
        raise Error('Error getting CARD versions. Cannot continue')

    print('Found versions:')
    for version_key, url in sorted(versions.items()):
        print('.'.join(str(part) for part in version_key), url, sep='\t')

    os.unlink(tmp_file)
    return versions
def _get_from_vfdb_common(self, outprefix, filename, info_text):
    """Download a zip file from the VFDB server and extract sequences + metadata.

    Writes outprefix + '.fa' and outprefix + '.tsv' via vfdb_parser.VfdbParser.

    Parameters:
        outprefix: prefix for the output files (made absolute here).
        filename: name of the zip file on the VFDB download server.
        info_text: unused in this body; kept for interface compatibility
            with callers — TODO confirm whether callers rely on it.

    Raises:
        Error: if the temporary download directory cannot be created.
    """
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.tmp.download'

    try:
        os.mkdir(tmpdir)
    except OSError as e:
        # Catch only mkdir failures; a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide the original cause.
        raise Error('Error mkdir ' + tmpdir) from e

    # 'zip_path' instead of 'zipfile' to avoid shadowing the stdlib module.
    zip_path = os.path.join(tmpdir, filename)
    common.download_file('http://www.mgc.ac.cn/VFs/Down/' + filename, zip_path, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)

    print('Extracting files ... ', end='', flush=True)
    vparser = vfdb_parser.VfdbParser(zip_path, outprefix)
    vparser.run()
    if not self.debug:
        # Keep the temporary directory around in debug mode for inspection.
        shutil.rmtree(tmpdir)
    print('done')

    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'

    print('Extracted core DNA sequence dataset and metadata. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
def _get_from_argannot(self, outprefix):
    """Download and reformat the ARG-ANNOT database for use with ARIBA.

    Writes outprefix + '.fa' (sequences, with parenthesised name parts
    rewritten) and outprefix + '.tsv' (metadata including original names).

    Raises:
        Error: if the temporary download directory cannot be created or
        entered.
    """
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError as e:
        # Catch only OS failures; a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide the original cause.
        raise Error('Error mkdir/chdir ' + tmpdir) from e

    zip_name = 'arg-annot-database_doc.zip'
    try:
        common.download_file(
            'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip',
            zip_name, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
        common.syscall('unzip ' + zip_name)
    finally:
        # Restore the working directory even if download/unzip fails, so a
        # raised exception does not leave the process inside tmpdir.
        os.chdir(current_dir)
    print('Extracted files.')

    genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'

    seq_reader = pyfastaq.sequences.file_reader(genes_file)
    f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
    f_out_fa = pyfastaq.utils.open_file_write(final_fasta)

    for seq in seq_reader:
        original_id = seq.id
        # e.g. '(Bla)OXA-1' style names: move the bracketed part in front
        # of a dot so the id is usable downstream; keep only the first
        # whitespace-separated token.
        seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
        print(seq, file=f_out_fa)
        print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_tsv)

    pyfastaq.utils.close(f_out_tsv)
    pyfastaq.utils.close(f_out_fa)

    if not self.debug:
        # Keep the temporary directory around in debug mode for inspection.
        common.rmtree(tmpdir)

    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print(argannot_ref)
def run(self):
    """Download the MEGARes zip, parse its members, and write output files.

    Side effects: creates and removes a temporary extraction directory
    next to the zip file, and deletes the downloaded zip when finished.
    """
    common.download_file(self.zip_url, self.zip_file, verbose=True)
    extract_dir = self.zip_file + '.tmp.extract'
    members = MegaresZipParser._extract_files(self.zip_file, extract_dir)

    def member_path(key):
        # Full path of an extracted member, looked up by its role.
        return os.path.join(extract_dir, members[key])

    annotations = MegaresZipParser._load_annotations_file(member_path('annotations'))
    headers = MegaresZipParser._load_header_mappings_file(member_path('header_mappings'))
    seqs = {}
    pyfastaq.tasks.file_to_dict(member_path('fasta'), seqs)
    MegaresZipParser._write_files(self.outprefix, seqs, annotations, headers)

    # Clean up both the extraction directory and the downloaded archive.
    common.rmtree(extract_dir)
    os.unlink(self.zip_file)
def _get_from_argannot(self, outprefix):
    """Download and reformat the ARG-ANNOT database for use with ARIBA.

    Writes outprefix + '.fa' (sequences, with parenthesised name parts
    rewritten) and outprefix + '.tsv' (metadata including original names).

    Raises:
        Error: if the temporary download directory cannot be created or
        entered.
    """
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError as e:
        # Catch only OS failures; a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide the original cause.
        raise Error('Error mkdir/chdir ' + tmpdir) from e

    zip_name = 'arg-annot-database_doc.zip'
    try:
        common.download_file(
            'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip',
            zip_name, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
        common.syscall('unzip ' + zip_name)
    finally:
        # Restore the working directory even if download/unzip fails, so a
        # raised exception does not leave the process inside tmpdir.
        os.chdir(current_dir)
    print('Extracted files.')

    genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'

    seq_reader = pyfastaq.sequences.file_reader(genes_file)
    f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
    f_out_fa = pyfastaq.utils.open_file_write(final_fasta)

    for seq in seq_reader:
        original_id = seq.id
        # e.g. '(Bla)OXA-1' style names: move the bracketed part in front
        # of a dot so the id is usable downstream; keep only the first
        # whitespace-separated token.
        seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
        print(seq, file=f_out_fa)
        print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_tsv)

    pyfastaq.utils.close(f_out_tsv)
    pyfastaq.utils.close(f_out_fa)

    if not self.debug:
        # Keep the temporary directory around in debug mode for inspection.
        shutil.rmtree(tmpdir)

    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print(argannot_ref)
def _get_card_versions(self, tmp_file):
    """Scrape the CARD download page for available database versions.

    Downloads the page to tmp_file, collects every .tar.bz2 link matching
    the expected version pattern, deletes tmp_file, and returns a dict
    mapping (major, minor, patch) int tuples to absolute download URLs.

    Raises:
        Error: if no version links were found on the page.
    """
    print('Getting available CARD versions')
    common.download_file('https://card.mcmaster.ca/download', tmp_file, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
    link_regex = re.compile(r'''href="(/download/.*?broad.*?v([0-9]+\.[0-9]+\.[0-9]+)\.tar\.bz2)"''')
    versions = {}

    with open(tmp_file) as f:
        for line in f:
            # findall yields one (path, version) tuple per matching link.
            for path, version_string in link_regex.findall(line):
                version_key = tuple(int(part) for part in version_string.split('.'))
                versions[version_key] = 'https://card.mcmaster.ca' + path

    if not versions:
        raise Error('Error getting CARD versions. Cannot continue')

    print('Found versions:')
    for version_key, url in sorted(versions.items()):
        print('.'.join(str(part) for part in version_key), url, sep='\t')

    os.unlink(tmp_file)
    return versions