Beispiel #1
0
    def find_match(self, df):
        """Pick the taxId that best summarizes the top hits of a hit table.

        Parameters
        ----------
        df : pd.DataFrame
            hit table; the 'score' and 'taxid' columns are read

        Returns
        -------
        str
            taxId of match, or '0' if the hit table has no rows

        Notes
        -----
        The first row of the table is taken as the best hit. Every hit whose
        score is at least `match_th` times the score of that row counts as a
        "top hit", and the match is the LCA of the taxIds of all top hits.
        This behavior is similar to DIAMOND's taxonomic classification
        function.
        """
        try:
            # an empty table has no first row, which surfaces as IndexError
            best_score = df.iloc[0]['score']
            cutoff = best_score * self.match_th
        except IndexError:
            return '0'

        # keep only hits scoring at or above the cutoff
        top_hits = df[df['score'] >= cutoff]
        return find_lca(top_hits['taxid'], self.taxdump)
Beispiel #2
0
    def infer_close_group(self):
        """Automatically determine the close group.

        Notes
        -----
        Starting from the LCA of the self group, the search climbs the
        taxonomy one parent at a time until the candidate group (the current
        taxon plus its descendants, minus the self group) is non-empty and,
        when `close_size` is set, has at least that many members. The climb
        also stops when the current taxon is its own parent or its parent is
        '0'.

        On return:

        1. `close_tax` holds the top-level taxId of the close group.
        2. `groups['close']` holds all taxIds under the close group.
        """
        # begin at the LCA of the self group
        node = find_lca(self.self_tax, self.taxdump)
        while True:

            # candidate members: this taxon and everything below it,
            # with the self group removed
            clade = [node] + get_descendants(node, self.taxdump)
            members = set(clade).difference(self.groups['self'])

            # accept a non-empty group once no size limit applies or the
            # limit has been met
            if members:
                if not self.close_size or len(members) >= self.close_size:
                    break

            # otherwise climb to the parent, unless there is nowhere to go
            parent = self.taxdump[node]['parent']
            if parent in (node, '0'):
                break
            node = parent

        self.close_tax = [node]
        self.groups['close'] = members
Beispiel #3
0
    def assign_taxonomy(self):
        """Assign a taxId to every input genome and find their common LCA.

        Notes
        -----
        1. If `input_tax` is set, it is parsed into a sample-to-taxId
           mapping. When parsing fails and there is at most one sample, the
           raw value is used as the taxId of that sample.
        2. Samples without a user-specified taxId get one inferred via
           `infer_genome_tax`.
        3. The taxonomy database is refined in place and child links are
           added.
        4. `lca` is set to the LCA of all assigned taxIds.

        Progress is reported through `print`.

        Raises
        ------
        ValueError
            If the user-specified taxonomy cannot be parsed for a
            multi-sample analysis, if a specified taxId is absent from the
            taxonomy database, or if a taxId cannot be inferred for a
            sample.
        """
        if not self.input_tax:
            # nothing supplied by the user; everything will be inferred
            self.input_tax = {}
        else:
            # parse user-defined taxIds of input genomes
            try:
                self.input_tax = dict_from_param(self.input_tax)
            except ValueError:
                if len(self.data) > 1:
                    raise ValueError('Invalid input taxonomy format.')
                # single-sample analysis: a bare taxId is acceptable
                self.input_tax = {max(self.data.keys()): self.input_tax}

            print('User-specified TaxIDs of input genomes:')
            for sample, taxid in sorted(self.input_tax.items()):
                if taxid not in self.taxdump:
                    # TODO: read from both temp and master taxdump
                    raise ValueError(
                        'TaxID {} is not present in taxonomy '
                        'database.'.format(taxid))
                taxon_name = self.taxdump[taxid]['name']
                print('  {}: {} ({}).'.format(sample, taxid, taxon_name))

        # samples that still lack a taxId are inferred from search results
        pending = sorted(x for x in self.data if x not in self.input_tax)
        if pending:
            print('Auto-inferring plausible taxIds for input genomes based on '
                  'taxonomy of search results...')
        for sample in pending:
            try:
                taxid, coverage = self.infer_genome_tax(
                    self.data[sample], self.taxdump, self.input_cov)
                self.input_tax[sample] = taxid
            except ValueError:
                raise ValueError('Cannot auto-infer taxonomy for {}. '
                                 'Please specify manually.'.format(sample))
            taxon_name = self.taxdump[taxid]['name']
            print('  {}: {} ({}) (covering {:2g}% best hits).'.format(
                sample, taxid, taxon_name, coverage))

        # refine the taxonomy database, then add child links
        print('Refining taxonomy database...')
        refine_taxdump(self.sum_taxids(), self.taxdump)
        add_children(self.taxdump)
        retained = len(self.taxdump)
        print('Done. Retained {} taxa.'.format(retained))

        # lowest common ancestor (LCA) of all input genomes
        self.lca = find_lca(self.input_tax.values(), self.taxdump)
        lca_description = describe_taxon(self.lca, self.taxdump)
        print('All input genomes belong to {} ({}).'.format(
            self.lca, lca_description))
Beispiel #4
0
    def build_taxonmap(self):
        """Build the protein-to-taxonomy map and write it to disk.

        Notes
        -----
        Each protein in `p2tids` is mapped to a single taxId: its only taxId
        when it has exactly one, otherwise the LCA of all its taxIds. The
        result is stored in `taxonmap` and written, sorted, as tab-separated
        lines to the gzip file 'taxon.map.gz' under the `output` directory.
        """
        mapping = {}
        for protein, tids in self.p2tids.items():
            if len(tids) == 1:
                # sole member of the collection
                mapping[protein] = max(tids)
            else:
                # shared protein: assign to lowest common ancestor (LCA)
                mapping[protein] = find_lca(tids, self.taxdump)
        self.taxonmap = mapping

        # write taxonomy map, one "protein<TAB>taxId" line per entry
        fname = 'taxon.map.gz'
        path = join(self.output, fname)
        with gzip.open(path, 'wb') as handle:
            for protein, tid in sorted(self.taxonmap.items()):
                line = f'{protein}\t{tid}\n'
                handle.write(line.encode())
        print(f'Protein-to-taxonomy map written to {fname}.')
    def test_find_lca(self):
        """Check `find_lca` on several taxId sets and on a failure case."""
        taxdump = taxdump_from_text(taxdump_archaea)

        # (input taxIds, expected LCA), checked in order
        cases = [
            (['131567'], '131567'),
            (['1935183', '1783276'], '2157'),
            (['1935183', '1783276', '1655434'], '2157'),
            (['1935183', '1783276', '2157'], '2157'),
            (['1935183', '2'], '131567'),
            (['1', '2', '1'], '1'),
        ]
        for taxids, expected in cases:
            self.assertEqual(find_lca(taxids, taxdump), expected)

        # add a taxon that is its own parent; combining it with '2' is
        # expected to raise
        taxdump['x'] = {'name': 'x', 'parent': 'x'}
        with self.assertRaises(ValueError) as ctx:
            find_lca(['2', 'x'], taxdump)
        expected_msg = 'Cannot find LCA of taxIds in database.'
        self.assertEqual(str(ctx.exception), expected_msg)