def _get_from_card(self, outprefix):
    """Download a CARD release and convert it into ARIBA input files.

    Works inside a temporary directory ``<outprefix>.download``: fetches
    the CARD tarball with ``wget``, extracts ``./card.json`` and writes

    * ``<outprefix>.fa``  - one FASTA record per usable DNA sequence,
    * ``<outprefix>.tsv`` - metadata rows for each FASTA record,
    * ``<outprefix>.log`` - sequences that were skipped, and why.

    If ``self.version`` is None the highest available version is used and
    ``self.version`` is set to it. The temporary directory is removed on
    success unless ``self.debug`` is true; it is left in place on failure.
    The original working directory is always restored and the output
    files are always closed, even when an error is raised part-way.

    Args:
        outprefix: prefix (made absolute) of all files that get written.

    Raises:
        Error: if the temporary directory cannot be created/entered, no
            versions are available, the requested version is not
            available, or the download is not a valid tar archive.
    """
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError as err:
        raise Error('Error mkdir/chdir ' + tmpdir) from err

    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'
    log_file = outprefix + '.log'
    out_handles = []

    try:
        versions = self._get_card_versions('download.html')
        if self.version is not None:
            key = tuple(int(x) for x in self.version.split('.'))
            if key not in versions:
                raise Error('Error! Did not find requested version ' + self.version)
        else:
            if not versions:
                raise Error('Error! Did not find any versions to download')
            # Version keys are tuples of ints, so max() is the newest one.
            key = max(versions)
            self.version = '.'.join(str(x) for x in key)

        print('Getting version', self.version)
        card_tarball_url = versions[key]
        card_tarball = 'card.tar.bz2'
        print('Working in temporary directory', tmpdir)
        print('Downloading data from card:', card_tarball_url, flush=True)
        # NOTE(review): the URL is interpolated into a command string; if
        # common.syscall runs it through a shell, consider quoting it.
        common.syscall('wget -O ' + card_tarball + ' ' + card_tarball_url, verbose=True)
        print('...finished downloading', flush=True)

        if not tarfile.is_tarfile(card_tarball):
            raise Error(
                'File ' + card_tarball + ' downloaded from ' + card_tarball_url
                + ' does not look like a valid tar archive. Cannot continue')

        json_file = './card.json'
        with tarfile.open(card_tarball, 'r') as tfile:
            tfile.extract(json_file)

        print('Extracted json data file ', json_file, '. Reading its contents...', sep='')

        # Register each handle as soon as it is open so the finally clause
        # closes whatever was opened, even if a later open fails.
        for path in (final_fasta, final_tsv, log_file):
            out_handles.append(pyfastaq.utils.open_file_write(path))
        f_out_fa, f_out_tsv, f_out_log = out_handles

        with open(json_file) as f:
            json_data = json.load(f)

        # Keys starting with '_' are skipped; the rest are numeric record ids.
        json_data = {int(x): json_data[x] for x in json_data if not x.startswith('_')}
        print('Found', len(json_data), 'records in the json file. Analysing...', flush=True)

        for gene_key, gene_dict in sorted(json_data.items()):
            crecord = card_record.CardRecord(gene_dict)
            data = crecord.get_data()
            # NOTE(review): this turns the description into bytes, so it is
            # written to the tsv in its b'...' repr form and the != ''
            # checks below are always true. Kept as-is so existing output
            # files do not change - confirm whether this is intended.
            data['ARO_description'] = data['ARO_description'].encode('utf-8')
            description = data['ARO_description']
            has_snps = len(data['snps']) != 0
            fasta_name_prefix = '.'.join([
                card_record.CardRecord._ARO_name_to_fasta_name(data['ARO_name']),
                data['ARO_accession'],
            ])

            for card_key, gi, genbank_id, start, end, dna_seq, protein_seq in data['dna_seqs_and_ids']:
                if dna_seq == '':
                    print('Empty dna sequence', gene_key, data['ARO_id'],
                          data['ARO_accession'], sep='\t', file=f_out_log)
                    continue

                fasta_id = '.'.join([fasta_name_prefix, genbank_id, start + '-' + end, card_key])
                fasta = pyfastaq.sequences.Fasta(fasta_id, dna_seq)

                if gi != 'NA':
                    # Only keep the sequence if its translation matches the
                    # protein sequence given in the record.
                    gene_tuple = fasta.make_into_gene()
                    if gene_tuple is None:
                        print('Could not make gene from sequence', fasta.id,
                              sep='\t', file=f_out_log)
                        continue

                    translated = gene_tuple[0].translate()
                    if gene_tuple[0][:3] in pyfastaq.genetic_codes.starts[self.genetic_code]:
                        translated.seq = 'M' + translated.seq[1:]

                    if translated.seq[:-1] != protein_seq:
                        print('Translation of inferred gene dna sequence does not match protein sequence',
                              fasta.id, sep='\t', file=f_out_log)
                        continue

                print(fasta, file=f_out_fa)

                if gi == 'NA':
                    gene_or_not, variant_only = '0', '0'
                elif not has_snps:
                    gene_or_not, variant_only = '1', '0'
                else:
                    gene_or_not, variant_only = '1', '1'

                print(fasta.id, gene_or_not, variant_only, '.', '.', data['ARO_name'],
                      sep='\t', file=f_out_tsv)

                # One description row per SNP, or a single row with '.' in
                # the variant column when the record has no SNPs.
                if description != '':
                    for variant in (data['snps'] if has_snps else ['.']):
                        print(fasta.id, gene_or_not, variant_only, variant, '.', description,
                              sep='\t', file=f_out_tsv)
    finally:
        try:
            for handle in out_handles:
                pyfastaq.utils.close(handle)
        finally:
            # Never leave the process inside the temporary directory.
            os.chdir(current_dir)

    if not self.debug:
        common.rmtree(tmpdir)

    print('Extracted data and written ARIBA input files\n')
    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print(
        '"CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database", Alcock et al 2020, PMID: 31665441'
    )
    print('and in your methods say that version', self.version, 'of the database was used')
def test_get_data(self):
    # Checks CardRecord.get_data() on a minimal record: first without any
    # SNP parameters (empty 'snps' set), then with two SNPs added.
    protein = {'sequence': 'MCDE*', 'GI': '229597524'}
    dna = {
        'sequence': 'ATGTGCGATGAATAA',
        'strand': '+',
        'fmax': '1194',
        'fmin': '0',
        'accession': 'XX0000001',
    }
    taxonomy = {
        'NCBI_taxonomy_cvterm_id': '234567',
        'NCBI_taxonomy_id': '42',
        'NCBI_taxonomy_name': 'Genus1 species1',
    }
    inactivation_category = {
        'category_aro_description': 'Enzyme that catalyzes the inactivation of an antibiotic resulting in resistance. Inactivation includes chemical modification, destruction, etc.',
        'category_aro_cvterm_id': '36696',
        'category_aro_accession': '3000557',
        'category_aro_name': 'antibiotic inactivation enzyme',
    }
    beta_lactam_category = {
        'category_aro_description': 'Genes conferring resistance to beta-lactams.',
        'category_aro_cvterm_id': '36268',
        'category_aro_accession': '3000129',
        'category_aro_name': 'beta-lactam resistance gene',
    }

    card_json = {
        'ARO_id': '123',
        'ARO_accession': '1234567',
        'ARO_name': 'ARO_name1',
        'ARO_description': 'ARO description that we want.',
        'model_id': '1',
        'model_name': 'Model_name1',
        'model_type': 'protein homolog model',
        'model_type_id': '12345',
        'model_description': 'Models to detect proteins conferring antibiotic resistance, which include a reference protein sequence and a curated BLASTP cut-off.',
        'model_sequences': {
            'sequence': {
                '1234': {
                    'protein_sequence': protein,
                    'dna_sequence': dna,
                    'NCBI_taxonomy': taxonomy,
                },
            },
        },
        # blastp_evalue is ignored by the code under test, so it stays empty
        # to keep the fixture short.
        'model_param': {'blastp_evalue': {}},
        'ARO_category': {
            '36696': inactivation_category,
            '36268': beta_lactam_category,
        },
    }

    want = {
        'ARO_id': '123',
        'ARO_accession': '1234567',
        'ARO_name': 'ARO_name1',
        'ARO_description': 'ARO description that we want.',
        'dna_seqs_and_ids': [
            ('1234', '229597524', 'XX0000001', '0', '1194', 'ATGTGCGATGAATAA', 'MCDE*'),
        ],
        'snps': set(),
    }
    self.assertEqual(want, card_record.CardRecord(card_json).get_data())

    # Same record again, now carrying SNP parameters.
    card_json['model_param'] = {
        'snp': {
            'param_value': {
                '1': 'I42L',
                '2': 'S100T',
            },
        },
    }
    want['snps'] = {'I42L', 'S100T'}
    self.assertEqual(want, card_record.CardRecord(card_json).get_data())