Beispiel #1
0
    def _get_from_card(self, outprefix):
        """Download a CARD release and convert it into ARIBA input files.

        The work is done inside a temporary directory ``<outprefix>.download``:
        the CARD tarball is fetched with ``wget``, ``card.json`` is extracted
        from it, and every usable DNA sequence is written out.

        Files written:
            ``<outprefix>.fa``   FASTA of the accepted sequences.
            ``<outprefix>.tsv``  Tab-separated metadata, one or more lines per
                                 sequence.
            ``<outprefix>.log``  Reasons for each skipped sequence.

        If ``self.version`` is None, the highest version listed on the
        download page is used and ``self.version`` is set to it. The temporary
        directory is removed on success unless ``self.debug`` is true.

        Args:
            outprefix: Prefix (relative or absolute) of the output file names.

        Raises:
            Error: If the temporary directory cannot be created or entered,
                the requested version is not available, or the downloaded
                file is not a tar archive.
        """
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except OSError as err:
            # Only filesystem failures are translated; KeyboardInterrupt and
            # friends must propagate untouched.
            raise Error('Error mkdir/chdir ' + tmpdir) from err

        # From here on the process lives in tmpdir, so always restore the
        # caller's working directory, even when something below fails.
        try:
            versions = self._get_card_versions('download.html')
            if self.version is not None:
                key = tuple(int(x) for x in self.version.split('.'))
                if key not in versions:
                    raise Error('Error! Did not find requested version ' +
                                self.version)
            else:
                # Version keys are tuples of ints, so the last one after
                # sorting is the newest release.
                key = sorted(versions)[-1]
                self.version = '.'.join([str(x) for x in key])

            print('Getting version', self.version)
            card_tarball_url = versions[key]
            card_tarball = 'card.tar.bz2'
            print('Working in temporary directory', tmpdir)
            print('Downloading data from card:', card_tarball_url, flush=True)
            # NOTE(review): the URL is concatenated into a command string. If
            # common.syscall runs it through a shell, the URL may need
            # quoting - confirm against common.syscall.
            common.syscall('wget -O ' + card_tarball + ' ' + card_tarball_url,
                           verbose=True)
            print('...finished downloading', flush=True)
            if not tarfile.is_tarfile(card_tarball):
                raise Error(
                    'File ' + card_tarball + ' downloaded from ' +
                    card_tarball_url +
                    ' does not look like a valid tar archive. Cannot continue')

            json_file = './card.json'
            with tarfile.open(card_tarball, 'r') as tfile:
                tfile.extract(json_file)

            print('Extracted json data file ',
                  json_file,
                  '. Reading its contents...',
                  sep='')

            final_fasta = outprefix + '.fa'
            final_tsv = outprefix + '.tsv'
            log_file = outprefix + '.log'

            # Handles start as None so the finally clause closes exactly the
            # ones that were opened, whatever fails in between.
            f_out_fa = None
            f_out_tsv = None
            f_out_log = None
            try:
                f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
                f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
                f_out_log = pyfastaq.utils.open_file_write(log_file)

                with open(json_file) as f:
                    json_data = json.load(f)

                # Keys starting with '_' are skipped; the remaining keys are
                # converted to int so records are processed in numeric order.
                json_data = {
                    int(x): json_data[x]
                    for x in json_data if not x.startswith('_')
                }
                print('Found',
                      len(json_data),
                      'records in the json file. Analysing...',
                      flush=True)

                for gene_key, gene_dict in sorted(json_data.items()):
                    crecord = card_record.CardRecord(gene_dict)
                    data = crecord.get_data()
                    # NOTE(review): this turns the description into bytes, so
                    # it is written to the tsv as b'...' and the != ''
                    # checks below are always true. Kept as-is because
                    # changing it would alter the output files - confirm
                    # whether that is intended.
                    data['ARO_description'] = data['ARO_description'].encode('utf-8')
                    fasta_name_prefix = '.'.join([
                        card_record.CardRecord._ARO_name_to_fasta_name(
                            data['ARO_name']),
                        data['ARO_accession'],
                    ])

                    for card_key, gi, genbank_id, start, end, dna_seq, protein_seq in data[
                            'dna_seqs_and_ids']:
                        if dna_seq == '':
                            print('Empty dna sequence',
                                  gene_key,
                                  data['ARO_id'],
                                  data['ARO_accession'],
                                  sep='\t',
                                  file=f_out_log)
                            continue

                        fasta_id = '.'.join([
                            fasta_name_prefix, genbank_id, start + '-' + end,
                            card_key
                        ])
                        fasta = pyfastaq.sequences.Fasta(fasta_id, dna_seq)
                        is_gene = gi != 'NA'

                        if is_gene:
                            # Sequences with a protein GI must yield a gene
                            # whose translation matches the protein sequence
                            # given in the record.
                            gene_tuple = fasta.make_into_gene()
                            if gene_tuple is None:
                                print('Could not make gene from sequence',
                                      fasta.id,
                                      sep='\t',
                                      file=f_out_log)
                                continue

                            translated = gene_tuple[0].translate()
                            # A recognised start codon is reported as M,
                            # whatever amino acid it would otherwise encode.
                            if gene_tuple[0][:3] in pyfastaq.genetic_codes.starts[
                                    self.genetic_code]:
                                translated.seq = 'M' + translated.seq[1:]

                            # The final translated character is dropped
                            # before comparing with the record's protein.
                            if translated.seq[:-1] != protein_seq:
                                print(
                                    'Translation of inferred gene dna sequence does not match protein sequence',
                                    fasta.id,
                                    sep='\t',
                                    file=f_out_log)
                                continue

                        print(fasta, file=f_out_fa)

                        gene_or_not = '1' if is_gene else '0'
                        variant_only = '1' if is_gene and data['snps'] else '0'

                        print(fasta.id,
                              gene_or_not,
                              variant_only,
                              '.',
                              '.',
                              data['ARO_name'],
                              sep='\t',
                              file=f_out_tsv)

                        if not data['snps'] and data['ARO_description'] != '':
                            print(fasta.id,
                                  gene_or_not,
                                  variant_only,
                                  '.',
                                  '.',
                                  data['ARO_description'],
                                  sep='\t',
                                  file=f_out_tsv)
                        else:
                            for snp in data['snps']:
                                if data['ARO_description'] != '':
                                    print(fasta.id,
                                          gene_or_not,
                                          variant_only,
                                          snp,
                                          '.',
                                          data['ARO_description'],
                                          sep='\t',
                                          file=f_out_tsv)
            finally:
                for handle in (f_out_fa, f_out_tsv, f_out_log):
                    if handle is not None:
                        pyfastaq.utils.close(handle)
        finally:
            os.chdir(current_dir)

        # Only reached on success: a failed run leaves tmpdir in place.
        if not self.debug:
            common.rmtree(tmpdir)

        print('Extracted data and written ARIBA input files\n')
        print('Finished. Final files are:',
              final_fasta,
              final_tsv,
              sep='\n\t',
              end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv,
              'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print(
            '"CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database", Alcock et al 2020, PMID: 31665441'
        )
        print('and in your methods say that version', self.version,
              'of the database was used')
Beispiel #2
0
    def test_get_data(self):
        """CardRecord.get_data() returns the expected summary of a CARD record.

        Checked twice on the same record: first with no SNP parameters
        (expecting an empty ``snps`` set), then after replacing
        ``model_param`` with a ``snp`` entry (expecting its values as a set).
        """
        protein_entry = {
            'sequence': 'MCDE*',
            'GI': '229597524'
        }
        dna_entry = {
            'sequence': 'ATGTGCGATGAATAA',
            'strand': '+',
            'fmax': '1194',
            'fmin': '0',
            'accession': 'XX0000001'
        }
        taxonomy_entry = {
            'NCBI_taxonomy_cvterm_id': '234567',
            'NCBI_taxonomy_id': '42',
            'NCBI_taxonomy_name': 'Genus1 species1'
        }
        categories = {
            '36696': {
                'category_aro_description': 'Enzyme that catalyzes the inactivation of an antibiotic resulting in resistance.  Inactivation includes chemical modification, destruction, etc.',
                'category_aro_cvterm_id': '36696',
                'category_aro_accession': '3000557',
                'category_aro_name': 'antibiotic inactivation enzyme'
            },
            '36268': {
                'category_aro_description': 'Genes conferring resistance to beta-lactams.',
                'category_aro_cvterm_id': '36268',
                'category_aro_accession': '3000129',
                'category_aro_name': 'beta-lactam resistance gene'
            }
        }

        card_json = {
            'ARO_id': '123',
            'ARO_accession': '1234567',
            'ARO_name': 'ARO_name1',
            'ARO_description': 'ARO description that we want.',
            'model_id': '1',
            'model_name': 'Model_name1',
            'model_type': 'protein homolog model',
            'model_type_id': '12345',
            'model_description': 'Models to detect proteins conferring antibiotic resistance, which include a reference protein sequence and a curated BLASTP cut-off.',
            'model_sequences': {
                'sequence': {
                    '1234': {
                        'protein_sequence': protein_entry,
                        'dna_sequence': dna_entry,
                        'NCBI_taxonomy': taxonomy_entry
                    }
                }
            },
            'model_param': {
                # Ignored by this test, so left empty to keep the fixture short.
                'blastp_evalue': {}
            },
            'ARO_category': categories,
        }

        # One tuple per sequence entry of the input record.
        sequence_row = (
            '1234',
            '229597524',
            'XX0000001',
            '0',
            '1194',
            'ATGTGCGATGAATAA',
            'MCDE*'
        )
        want = {
            'ARO_id': '123',
            'ARO_accession': '1234567',
            'ARO_name': 'ARO_name1',
            'ARO_description': 'ARO description that we want.',
            'dna_seqs_and_ids': [sequence_row],
            'snps': set(),
        }

        # Case 1: no 'snp' model parameter.
        observed = card_record.CardRecord(card_json).get_data()
        self.assertEqual(want, observed)

        # Case 2: same record, now carrying two SNPs.
        snp_values = {
            '1': 'I42L',
            '2': 'S100T',
        }
        card_json['model_param'] = {'snp': {'param_value': snp_values}}
        want['snps'] = {'I42L', 'S100T'}

        observed = card_record.CardRecord(card_json).get_data()
        self.assertEqual(want, observed)