Esempio n. 1
0
def test_build_tree_4():
    """Passing an existing tree to build_tree adds the new lineage to it."""
    parent = LineagePair('rank1', 'name1')
    child_a = LineagePair('rank2', 'name2a')
    child_b = LineagePair('rank2', 'name2b')

    # build from the first lineage, then extend that tree with the second
    initial = build_tree([[parent, child_a]])
    merged = build_tree([[parent, child_b]], initial)

    expected = {parent: {child_a: {}, child_b: {}}}
    assert merged == expected
Esempio n. 2
0
def test_find_lca_3():
    """When one lineage is a prefix of the other, the longer one is the LCA."""
    longer = lca_utils.make_lineage('a;b;c')
    shorter = lca_utils.make_lineage('a;b')

    lca, _reason = find_lca(build_tree([longer, shorter]))

    # the most specific leaf node is what gets reported
    assert lca == longer
Esempio n. 3
0
def test_find_lca_2():
    """Two lineages that differ only at rank2 share rank1 as their LCA."""
    shared = LineagePair('rank1', 'name1')
    lineages = [
        [shared, LineagePair('rank2', 'name2a')],
        [shared, LineagePair('rank2', 'name2b')],
    ]

    result = find_lca(build_tree(lineages))

    # second element is presumably the number of children at the LCA node
    # -- TODO confirm against find_lca
    expected_lca = (shared,)
    assert result == (expected_lca, 2)
def main():
    """
    Report, per node, the taxonomic rank of the LCA of its gather matches.

    Command-line arguments:
      node_mh_pickle -- path to a pickled mapping of node id -> minhash
      lca_db         -- path to an LCA database to search against

    For every node, gather is run against the LCA database; matches are kept
    in gather order until one falls below 10% of the query.  The LCA of the
    kept matches' lineages is computed and its rank printed, together with
    the percentage of the total unique fraction the kept matches account for.
    Finally prints the count of nodes whose LCA is at strain, genus or
    species rank with a kept fraction of at least 95%, and the number of
    nodes that had at least one kept match.
    """
    p = argparse.ArgumentParser()
    p.add_argument('node_mh_pickle')
    p.add_argument('lca_db')
    args = p.parse_args()

    # NOTE(security): pickle.load can execute arbitrary code; only run this
    # on pickle files from a trusted source.
    with open(args.node_mh_pickle, 'rb') as pickle_fp:
        node_mhs = pickle.load(pickle_fp)

    lca_obj = LCA_Database()
    lca_obj.load(args.lca_db)
    databases = ((lca_obj, args.lca_db, 'LCA'),)

    n_pure95 = 0
    total = 0

    for k, v in node_mhs.items():
        ss = sourmash.SourmashSignature(v)

        results = [x[0] for x in gather_databases(ss, databases, 0, True)]
        sum_f_uniq = sum(result.f_unique_to_query for result in results)

        # keep the leading run of matches that each cover >= 10% of the query
        keep_results = []
        for result in results:
            if result.f_unique_to_query < 0.10:
                break
            keep_results.append(result)

        if not keep_results:
            print('** no match for {}'.format(k))
            continue

        # match name -> identifier (first word, version suffix stripped)
        # -> idx -> lineage id -> lineage
        idents = [result.name.split()[0].split('.')[0]
                  for result in keep_results]
        idxlist = [lca_obj.ident_to_idx[ident] for ident in idents]
        lidlist = [lca_obj.idx_to_lid[idx] for idx in idxlist]
        lineages = [lca_obj.lid_to_lineage[lid] for lid in lidlist]

        tree = lca_utils.build_tree(lineages)
        lca, reason = lca_utils.find_lca(tree)

        level = '*none*'
        if lca:
            level = lca[-1].rank

        # NOTE(review): this string is never used below; the call is kept in
        # case zip_lineage validates the lineage -- confirm and drop.
        lineage = ";".join(lca_utils.zip_lineage(lca, truncate_empty=True))

        this_f_uniq = sum(result.f_unique_to_query for result in keep_results)

        # fraction of all unique matches that the kept matches explain;
        # sum_f_uniq >= 0.10 here because keep_results is non-empty
        purity = this_f_uniq / sum_f_uniq

        print('node {} matches {} @ {:.1f}'.format(k, level, purity * 100))

        if level in ('strain', 'genus', 'species') and purity >= 0.95:
            n_pure95 += 1
        total += 1

    print('XXX', n_pure95, total)
Esempio n. 5
0
def make_lca_counts(dblist,
                    lowest_rank='phylum',
                    min_num=0,
                    min_hashes=5,
                    prefix='oddities'):
    """
    Collect counts of all the LCAs in the list of databases.

    Also groups hash values by the exact set of lineages they are assigned
    to, and for each sufficiently large group writes candidate genome pairs
    to a CSV file and prints a summary.

    Parameters:
      dblist      -- list containing exactly one LCA database.
      lowest_rank -- ranks from 'root' down to and including this one are
                     kept; hash values whose LCA is at a kept rank are
                     recorded as "confused".
      min_num     -- if non-zero, hash values assigned to fewer than this
                     many idxs are ignored.
      min_hashes  -- minimum number of hash values a lineage combination must
                     share to be reported, and minimum hash intersection for
                     a genome pair to be written out.
      prefix      -- output CSV is written to ``prefix + '.csv'``.

    Returns:
      (counts, confused_hashvals) -- counts maps each LCA to the number of
      hash values with that LCA; confused_hashvals is the set of hash values
      belonging to the reported clusters.
    """
    assert len(dblist) == 1
    db = dblist[0]

    keep_ranks = ['root']
    for rank in lca_utils.taxlist():
        keep_ranks.append(rank)
        if rank == lowest_rank:
            break
    print('keeping hashvals at following ranks:', keep_ranks)
    print('min number of lineages:', min_num)
    print('min number of shared hashes:', min_hashes)

    print('---')

    # gather all hashvalue assignments from across all the databases
    assignments = defaultdict(set)
    for lca_db in dblist:
        for hashval, idx_list in lca_db.hashval_to_idx.items():
            if min_num and len(idx_list) < min_num:
                continue

            for idx in idx_list:
                lid = lca_db.idx_to_lid.get(idx)
                if lid is not None:
                    assignments[hashval].add(lca_db.lid_to_lineage[lid])

    # now convert to trees -> do LCA & counts
    counts = defaultdict(int)
    mixdict = defaultdict(set)
    for hashval, lineages in assignments.items():

        # for each list of tuple_info [(rank, name), ...] build
        # a tree that lets us discover lowest-common-ancestor.
        debug("{}", lineages)
        tree = lca_utils.build_tree(lineages)

        # now find either a leaf or the first node with multiple
        # children; that's our lowest-common-ancestor node.
        lca, reason = lca_utils.find_lca(tree)

        # find hashes whose LCA sits at one of the kept (high) ranks, and
        # record the combinations of lineages that have them.
        rank = lca[-1].rank if lca else 'root'

        if rank in keep_ranks:
            combo = tuple(tuple(lineage) for lineage in lineages)
            mixdict[combo].add(hashval)

        counts[lca] += 1

    # sort on number of confused hash vals by combination of lineages.
    mixdict_items = sorted(mixdict.items(), key=lambda x: -len(x[1]))

    confused_hashvals = set()

    # newline='' is what the csv module asks for; the context manager makes
    # sure the file is flushed and closed even if an error occurs below.
    with open(prefix + '.csv', 'wt', newline='') as fp:
        w = csv.writer(fp)
        w.writerow([
            'cluster', 'num_lineages', 'shared_kmers', 'ksize', 'rank', 'lca',
            'ident1', 'lineage1', 'ident2', 'lineage2'
        ])

        #
        # find candidate lineages, then evaluate pairwise intersections.
        #

        for cluster_n, (lineages, hashvals) in enumerate(mixdict_items):
            # insist on more than N hash vals
            if len(hashvals) < min_hashes:
                continue

            # display summary:
            print('cluster {} has {} assignments for {} hashvals / {} bp'.format(
                cluster_n, len(lineages), len(hashvals),
                db.scaled * len(hashvals)))
            confused_hashvals.update(hashvals)

            tree = lca_utils.build_tree(lineages)
            lca, reason = lca_utils.find_lca(tree)
            rank = lca[-1].rank if lca else 'root'
            print('  rank & lca:', rank, lca_utils.display_lineage(lca))

            # now, identify all members of these lineages by their index:
            all_idxs = []
            for lineage in lineages:
                for lid in db.lineage_to_lids[lineage]:
                    all_idxs.extend(db.lid_to_idx[lid])

            # run through and look at all pairs of genomes in these lineages;
            # filter so that we're comparing across lineages with the right
            # LCA, and with significant intersection.
            pair_n = 0
            candidates = []
            for i, idx1 in enumerate(all_idxs):
                lin1 = db.lid_to_lineage[db.idx_to_lid[idx1]]
                ident1 = db.idx_to_ident[idx1]

                # pair idx1 with every earlier idx, in original order
                for idx2 in all_idxs[:i]:
                    lin2 = db.lid_to_lineage[db.idx_to_lid[idx2]]
                    ident2 = db.idx_to_ident[idx2]

                    debug("{} x {}", ident1, ident2)

                    this_tree = lca_utils.build_tree([lin1, lin2])
                    this_lca, this_reason = lca_utils.find_lca(this_tree)

                    # weed out pairs that don't have the desired intersection
                    if lca != this_lca:
                        continue

                    mh1 = db._signatures[idx1]
                    mh2 = db._signatures[idx2]

                    mins1 = set(mh1.get_mins())
                    mins2 = set(mh2.get_mins())
                    intersect_size = len(mins1 & mins2)

                    # weed out pairs that don't have enough k-mer intersection
                    if intersect_size < min_hashes:
                        continue

                    candidates.append(
                        (pair_n, ident1, lin1, ident2, lin2, intersect_size))

                    # write summary to CSV for find-oddities-examine.py to use
                    w.writerow([
                        'cluster{}.{}'.format(cluster_n, pair_n),
                        len(lineages), intersect_size * db.scaled,
                        db.ksize, rank,
                        lca_utils.display_lineage(lca), ident1,
                        lca_utils.display_lineage(lin1), ident2,
                        lca_utils.display_lineage(lin2)
                    ])

                    pair_n += 1

            print('  Candidate genome pairs for these lineages:')
            for cand_n, ident1, lin1, ident2, lin2, shared in candidates:
                print('    cluster.pair {}.{} share {} bases'.format(
                    cluster_n, cand_n, shared * db.scaled))
                print('    - {} ({})'.format(ident1,
                                             lca_utils.display_lineage(lin1)))
                print('    - {} ({})'.format(ident2,
                                             lca_utils.display_lineage(lin2)))
                print('')

            print('')

    return counts, confused_hashvals
Esempio n. 6
0
def test_build_tree_3():
    """A lineage entry whose name is empty does not appear in the tree."""
    named = LineagePair('rank1', 'name1')
    unnamed = LineagePair('rank2', '')        # empty 'rank2' name

    tree = build_tree([[named, unnamed]])

    assert tree == {named: {}}
Esempio n. 7
0
def test_build_tree():
    """A single two-rank lineage becomes a two-level nested dict."""
    top = LineagePair('rank1', 'name1')
    bottom = LineagePair('rank2', 'name2')

    expected = {top: {bottom: {}}}
    assert build_tree([[top, bottom]]) == expected
Esempio n. 8
0
def test_find_lca():
    """For a tree built from one lineage, find_lca returns that full lineage."""
    lineage = (LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2'))

    result = find_lca(build_tree([list(lineage)]))

    # find_lca returns a pair; the second element is 0 for this single path
    assert result == (lineage, 0)
Esempio n. 9
0
def test_build_tree_5():
    """build_tree raises ValueError when given an empty list of lineages."""
    no_lineages = []
    with pytest.raises(ValueError):
        build_tree(no_lineages)