Example #1
def gather_guess_tax_at_each_rank(
        gather_results,
        num_hashes,
        taxlist=None,
        minimum_matches=3,
        lowest_rank="genus"):
    # calling taxlist() in the signature would bind a single generator that is
    # exhausted after the first call; build the default fresh each invocation.
    if taxlist is None:
        taxlist = lca_utils.taxlist(include_strain=False)
    rank_results = []
    prev_lineage = ""
    top_lineage = ""
    for rank in taxlist:
        top_lineage, f_ident, f_major = gather_guess_tax_at_rank(
            gather_results, num_hashes, rank, minimum_matches=minimum_matches)

        # summarizing at a rank below the lineage's resolution yields the
        # same result as the previous rank; stop here.
        if not top_lineage or top_lineage == prev_lineage:
            break
        rank_results.append(
            RankSumGatherResult(lineage=top_lineage,
                                f_ident=f_ident,
                                f_major=f_major))
        prev_lineage = top_lineage
        if rank == lowest_rank:
            break

    return rank_results
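For reference, the RankSumGatherResult records returned here are, per the inline comment in Example #11 below, a namedtuple:

from collections import namedtuple

RankSumGatherResult = namedtuple('RankSumGatherResult',
                                 'lineage, f_ident, f_major')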
Example #2
def add_hashes_at_ranks(lineage_hashD, hashes_to_add, lineage, match_rank):
    "Add hashes to the full lineage and to each ancestor lineage down to match_rank."
    # first add full lineage
    lineage_hashD[lineage].add_many(hashes_to_add)
    for rank in lca_utils.taxlist(include_strain=False):
        # TODO: add check to pop ONLY if needed (no need to pop at genus if lineage only has superk, phyl)
        lin_at_rank = pop_to_rank(lineage, rank)
        lineage_hashD[lin_at_rank].add_many(hashes_to_add)
        if rank == match_rank:
            break
    return lineage_hashD
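A minimal, self-contained sketch of how add_hashes_at_ranks can be exercised, assuming pop_to_rank (Example #7) and lca_utils.make_lineage are available; _HashSet is a hypothetical stand-in for a minhash-like object exposing add_many(), and lineage_hashD is assumed to be a defaultdict so missing lineages are created on first use:

from collections import defaultdict

class _HashSet:
    # hypothetical stand-in for a minhash-like object with add_many()
    def __init__(self):
        self.hashes = set()
    def add_many(self, hashvals):
        self.hashes.update(hashvals)

lineage_hashD = defaultdict(_HashSet)
lin = lca_utils.make_lineage('a;b;c')   # superkingdom;phylum;class
add_hashes_at_ranks(lineage_hashD, [12345678], lin, 'class')
# the hash is now recorded at 'a', 'a;b', and the full 'a;b;c' lineage
assert len(lineage_hashD) == 3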
Example #3
def test_searchfiles_contigs_just_gather(location):
    prefix = os.path.join(location, "pref")
    filelist = [
        f"{prefix}.contigs.rankgather.csv", f"{prefix}.contigs.unmatched.fq"
    ]

    sf = SearchFiles(prefix, search=True, gather=True)

    # two minhashes, share ranks at phylum level
    hashval = 12345678
    ident1 = 'first'
    mh1, sig1, lin1 = make_sig_and_lin([hashval], ident1, 'a;b;c')
    hashval2 = 87654321
    ident2 = 'second'
    mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'a;b;f')
    # create lca_db w sigs
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(sig1, ident=ident1)
    lca_db.insert(sig2, ident=ident2)
    # make lin_db
    lin_db = LineageDB()
    lin_db.insert(ident1, lin1)
    lin_db.insert(ident2, lin2)
    num_hashes = 2
    # search with combined hashvals
    search_mh = make_mh([hashval, hashval2])
    gather_results = list(gather_at_rank(search_mh, lca_db, lin_db, "class"))

    gather_rank_results = gather_guess_tax_at_each_rank(
        gather_results, num_hashes, minimum_matches=1,
        lowest_rank="class",
        taxlist=lca_utils.taxlist(include_strain=False))

    # write gather results
    name = 'name'
    seq_len = 6
    for gres in gather_rank_results:
        sf.write_result(gres, name, seq_len, result_type="rankgather")

    sf.close()

    # check results are in files
    for f in filelist:
        assert os.path.exists(f)

    with open(f"{prefix}.contigs.rankgather.csv", "r") as gatherres:
        this_gather_csvset = get_csv_set(gatherres)
    with open(utils.get_testfile("test-data/test.contigs.rankgather.csv"),
              "r") as savedres:
        saved_gather_csvset = get_csv_set(savedres)
    assert saved_gather_csvset == this_gather_csvset
Example #4
def sort_by_rank_and_containment(summarized_results, match_rank):
    sorted_results = []
    # iterate superkingdom --> match_rank
    for rank in lca_utils.taxlist(include_strain=False):
        rank_res = summarized_results[rank]
        rank_res.sort(key=itemgetter(1), reverse=True)  # sort by containment
        for (lin, containment, intersect_bp, match_sig) in rank_res:
            sorted_results.append(
                RankSumSearchResult(lineage=lin,
                                    contained_at_rank=containment,
                                    contained_bp=intersect_bp,
                                    match=match_sig))
        if rank == match_rank:
            break
    return sorted_results
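The shape of summarized_results can be read off the tuple unpacking above: a mapping from rank to a list of (lineage, containment, intersect_bp, match_sig) tuples. A hypothetical, self-contained sketch; the RankSumSearchResult field names are assumptions inferred from the keyword arguments used above:

from collections import defaultdict, namedtuple

# assumed field names, mirroring the keyword arguments in the function above
RankSumSearchResult = namedtuple(
    'RankSumSearchResult', 'lineage, contained_at_rank, contained_bp, match')

summarized_results = defaultdict(list)
summarized_results['superkingdom'].append(('a', 0.2, 200, 'sig2'))
summarized_results['superkingdom'].append(('d', 0.8, 1000, 'sig1'))

res = sort_by_rank_and_containment(summarized_results, 'superkingdom')
assert res[0].contained_at_rank == 0.8   # highest containment first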
Example #5
def test_gather_guess_tax_at_each_rank_1():
    # two minhashes, fully shared ranks

    # first sig
    hashval = 12345678
    ident1 = 'first'
    mh1, sig1, lin1 = make_sig_and_lin([hashval], ident1, 'a;b;c')

    # second sig
    hashval2 = 87654321
    ident2 = 'second'
    mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'a;b;c')

    # create lca_db w sigs
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(sig1, ident=ident1)
    lca_db.insert(sig2, ident=ident2)

    # make lin_db
    lin_db = LineageDB()
    lin_db.insert(ident1, lin1)
    lin_db.insert(ident2, lin2)

    num_hashes = 2
    superk_lin = lca_utils.make_lineage('a')
    phylum_lin = lca_utils.make_lineage('a;b')

    # search with combined hashvals
    search_mh = make_mh([hashval, hashval2])
    gather_results = list(gather_at_rank(search_mh, lca_db, lin_db, "class"))
    rank_results = gather_guess_tax_at_each_rank(
        gather_results, num_hashes, minimum_matches=1,
        lowest_rank="class",
        taxlist=lca_utils.taxlist(include_strain=False))

    assert len(rank_results) == 3

    assert rank_results[0] == RankSumGatherResult(lineage=superk_lin,
                                                  f_ident=1.0,
                                                  f_major=1.0)
    assert rank_results[1] == RankSumGatherResult(lineage=phylum_lin,
                                                  f_ident=1.0,
                                                  f_major=1.0)
    assert rank_results[2] == RankSumGatherResult(lineage=lin1,
                                                  f_ident=1.0,
                                                  f_major=1.0)
Example #6
def test_gather_guess_tax_at_each_rank_3():
    # two minhashes, totally distinct ranks
    # first sig
    hashval1 = 12345678
    ident1 = 'first'
    mh1, sig1, lin1 = make_sig_and_lin([hashval1], ident1, 'a;b;c')

    # second sig
    hashval2 = 87654321
    ident2 = 'second'
    mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'd;e;f')

    # create lca_db w/ both sigs
    lca_db = LCA_Database(scaled=1, ksize=3)
    lca_db.insert(sig1, ident=ident1)
    lca_db.insert(sig2, ident=ident2)

    # next, make lin_db
    lin_db = LineageDB()
    lin_db.insert(ident1, lin1)
    lin_db.insert(ident2, lin2)

    num_hashes = 2
    # the winner appears to be the 'd;e;f' lineage; will this remain true always?
    superk_lin = lca_utils.make_lineage('d')
    phylum_lin = lca_utils.make_lineage('d;e')

    # search with combined hashvals
    search_mh = make_mh([hashval1, hashval2])
    gather_results = list(gather_at_rank(search_mh, lca_db, lin_db, "class"))
    rank_results = gather_guess_tax_at_each_rank(
        gather_results, num_hashes, minimum_matches=1,
        lowest_rank="class",
        taxlist=lca_utils.taxlist(include_strain=False))
    assert len(rank_results) == 3

    assert rank_results[0] == RankSumGatherResult(lineage=superk_lin,
                                                  f_ident=1.0,
                                                  f_major=0.5)
    assert rank_results[1] == RankSumGatherResult(lineage=phylum_lin,
                                                  f_ident=1.0,
                                                  f_major=0.5)
    assert rank_results[2] == RankSumGatherResult(lineage=lin2,
                                                  f_ident=1.0,
                                                  f_major=0.5)
Example #7
def pop_to_rank(lin, rank):
    "Remove lineage tuples from given lineage `lin` until `rank` is reached."
    lin = list(lin)

    txl = lca_utils.taxlist()
    before_rank = []
    for txl_rank in txl:
        if txl_rank != rank:
            before_rank.append(txl_rank)
        else:
            break

    # are we already above rank?
    if lin and lin[-1].rank in before_rank:
        return tuple(lin)

    while lin and lin[-1].rank != rank:
        lin.pop()

    return tuple(lin)
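pop_to_rank in action, using lca_utils.make_lineage as in the tests above:

lin = lca_utils.make_lineage('a;b;c')   # superkingdom;phylum;class

# truncate the lineage to phylum:
assert pop_to_rank(lin, 'phylum') == lca_utils.make_lineage('a;b')
# asking for a rank below the lineage's resolution returns it unchanged:
assert pop_to_rank(lin, 'genus') == lin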
Example #8
def make_lca_counts(dblist,
                    lowest_rank='phylum',
                    min_num=0,
                    min_hashes=5,
                    prefix='oddities'):
    """
    Collect counts of all the LCAs in the list of databases.
    """
    assert len(dblist) == 1   # the code below currently assumes a single database

    keep_ranks = ['root']
    for rank in lca_utils.taxlist():
        keep_ranks.append(rank)
        if rank == lowest_rank:
            break
    print('keeping hashvals at following ranks:', keep_ranks)
    print('min number of lineages:', min_num)
    print('min number of shared hashes:', min_hashes)

    print('---')

    # gather all hashvalue assignments from across all the databases
    assignments = defaultdict(set)
    for lca_db in dblist:
        for hashval, idx_list in lca_db.hashval_to_idx.items():
            if min_num and len(idx_list) < min_num:
                continue

            for idx in idx_list:
                lid = lca_db.idx_to_lid.get(idx)
                if lid is not None:
                    lineage = lca_db.lid_to_lineage[lid]
                    assignments[hashval].add(lineage)

    # now convert to trees -> do LCA & counts
    counts = defaultdict(int)
    mixdict = defaultdict(set)
    for hashval, lineages in assignments.items():

        # for each list of tuple_info [(rank, name), ...] build
        # a tree that lets us discover lowest-common-ancestor.
        debug("{}", lineages)
        tree = lca_utils.build_tree(lineages)

        # now find either a leaf or the first node with multiple
        # children; that's our lowest-common-ancestor node.
        lca, reason = lca_utils.find_lca(tree)

        # find cross-superkingdom hashes, and record combinations of lineages
        # that have them.
        rank = 'root'
        if lca:
            rank = lca[-1].rank

        if rank in keep_ranks:
            xx = []
            for lineage in lineages:
                xx.append(tuple(lineage))
            xx = tuple(xx)

            mixdict[xx].add(hashval)

        counts[lca] += 1

    # sort on number of confused hash vals by combination of lineages.
    mixdict_items = list(mixdict.items())
    mixdict_items.sort(key=lambda x: -len(x[1]))

    confused_hashvals = set()

    fp = open(prefix + '.csv', 'wt')
    w = csv.writer(fp)
    w.writerow([
        'cluster', 'num_lineages', 'shared_kmers', 'ksize', 'rank', 'lca',
        'ident1', 'lineage1', 'ident2', 'lineage2'
    ])

    #
    # find candidate lineages, then evaluate pairwise intersections.
    #

    for cluster_n, (lineages, hashvals) in enumerate(mixdict_items):
        # insist on more than N hash vals
        if len(hashvals) < min_hashes:
            continue

        # display summary:
        print('cluster {} has {} assignments for {} hashvals / {} bp'.format(
            cluster_n, len(lineages), len(hashvals),
            dblist[0].scaled * len(hashvals)))
        confused_hashvals.update(hashvals)

        tree = lca_utils.build_tree(lineages)
        lca, reason = lca_utils.find_lca(tree)
        if lca:
            rank = lca[-1].rank
        else:
            rank = 'root'
        print('  rank & lca:', rank, lca_utils.display_lineage(lca))

        #        for lineage_n, lineage in enumerate(lineages):
        #            print('* ', lca_utils.display_lineage(lineage))

        # now, identify all members of these lineages by their index:
        all_idxs = []
        for lineage_n, lineage in enumerate(lineages):
            lids = dblist[0].lineage_to_lids[lineage]
            for lid in lids:
                idxs = dblist[0].lid_to_idx[lid]
                all_idxs.extend(idxs)

        # run through and look at all pairs of genomes in these lineages;
        # filter so that we're comparing across lineages with the right
        # LCA, and with significant intersection.
        pair_n = 0
        candidates = []
        for i in range(len(all_idxs)):
            idx1 = all_idxs[i]
            lid1 = dblist[0].idx_to_lid[idx1]
            lin1 = dblist[0].lid_to_lineage[lid1]
            for j in range(i):
                idx2 = all_idxs[j]
                lid2 = dblist[0].idx_to_lid[idx2]
                lin2 = dblist[0].lid_to_lineage[lid2]

                ident1 = dblist[0].idx_to_ident[idx1]
                ident2 = dblist[0].idx_to_ident[idx2]

                debug("{} x {}", ident1, ident2)

                this_tree = lca_utils.build_tree([lin1, lin2])
                this_lca, this_reason = lca_utils.find_lca(this_tree)

                # weed out pairs that don't have the desired intersection
                if lca != this_lca:
                    continue

                mh1 = dblist[0]._signatures[idx1]
                mh2 = dblist[0]._signatures[idx2]

                mins1 = set(mh1.get_mins())
                mins2 = set(mh2.get_mins())
                intersect_size = len(mins1.intersection(mins2))

                # weed out pairs that don't have enough k-mer intersection
                if intersect_size < min_hashes:
                    continue

                candidates.append(
                    (pair_n, ident1, lin1, ident2, lin2, intersect_size))

                # write summary to CSV for find-oddities-examine.py to use
                w.writerow([
                    'cluster{}.{}'.format(cluster_n, pair_n),
                    len(lineages), intersect_size * dblist[0].scaled,
                    dblist[0].ksize, rank,
                    lca_utils.display_lineage(lca), ident1,
                    lca_utils.display_lineage(lin1), ident2,
                    lca_utils.display_lineage(lin2)
                ])

                pair_n += 1

        print('  Candidate genome pairs for these lineages:')
        for pair_n, ident1, lin1, ident2, lin2, intersection_size in candidates:
            print('    cluster.pair {}.{} share {} bases'.format(
                cluster_n, pair_n, intersection_size * dblist[0].scaled))
            print('    - {} ({})'.format(ident1,
                                         lca_utils.display_lineage(lin1)))
            print('    - {} ({})'.format(ident2,
                                         lca_utils.display_lineage(lin2)))
            print('')

        print('')

    fp.close()

    return counts, confused_hashvals
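A hypothetical invocation of make_lca_counts; `db` stands in for an already-loaded sourmash LCA_Database, and the keyword values are illustrative only:

# hypothetical call; `db` is an LCA_Database loaded elsewhere
counts, confused_hashvals = make_lca_counts(
    [db], lowest_rank='superkingdom', min_num=2, min_hashes=10,
    prefix='oddities')
# writes candidate genome pairs to 'oddities.csv' and returns per-LCA counts
# plus the set of hash values shared across distinct lineages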
Example #9
def test_taxlist_2():
    assert list(taxlist(include_strain=False)) == ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
Example #10
def test_taxlist_1():
    assert list(taxlist()) == ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']
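Taken together, these two tests pin down taxlist's behavior; a sketch consistent with them (the real implementation lives in sourmash's lca_utils):

def taxlist(include_strain=True):
    # yield ranks from most general to most specific
    for rank in ['superkingdom', 'phylum', 'class', 'order',
                 'family', 'genus', 'species']:
        yield rank
    if include_strain:
        yield 'strain'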
Example #11
def main(args):
    "Main entry point for scripting. Use cmdline for command line entry."
    genomebase = os.path.basename(args.genome)
    match_rank = 'genus'

    # load taxonomy CSV
    tax_assign, _ = load_taxonomy_assignments(args.lineages_csv,
                                              start_column=2)
    print(f'loaded {len(tax_assign)} tax assignments.')

    # load the genome signature
    genome_sig = sourmash.load_one_signature(args.genome_sig,
                                             select_moltype=args.alphabet,
                                             ksize=args.ksize)

    # load all of the matches from search --containment in the database
    with open(args.matches_sig, 'rt') as fp:
        try:
            siglist = list(
                sourmash.load_signatures(fp, do_raise=True, quiet=False))
        except sourmash.exceptions.SourmashError:
            siglist = []
    print(f"loaded {len(siglist)} matches from '{args.matches_sig}'")

    # Hack for examining members of our search database: remove exact matches.
    new_siglist = []
    for ss in siglist:
        if genome_sig.similarity(ss) == 1.0:
            print(f'removing an identical match: {ss.name()}')
        else:
            new_siglist.append(ss)
    siglist = new_siglist

    if not siglist:
        # write empty files so snakemake workflows don't complain; exit.
        print('no non-identical matches for this genome, exiting.')
        if not args.no_search_contigs:
            sf = SearchFiles(args.output_prefix,
                             not args.no_search,
                             args.gather,
                             contigs=True)
            sf.close()
        if args.search_genome:
            gf = SearchFiles(args.output_prefix,
                             not args.no_search,
                             args.gather,
                             contigs=False)
            gf.close()
        return 0

    # construct a template minhash object that we can use to create new ones
    empty_mh = siglist[0].minhash.copy_and_clear()
    ksize = empty_mh.ksize
    scaled = empty_mh.scaled
    moltype = empty_mh.moltype

    # create empty LCA database to populate...
    lca_db = LCA_Database(ksize=ksize, scaled=scaled, moltype=moltype)
    lin_db = LineageDB()

    # ...with specific matches.
    for ss in siglist:
        ident = get_ident(ss)
        lineage = tax_assign[ident]

        lca_db.insert(ss, ident=ident)
        lin_db.insert(ident, lineage)

    print(f'loaded {len(siglist)} signatures & created LCA Database')
    print('')
    print(f'reading contigs from {genomebase}')

    screed_iter = screed.open(args.genome)
    genome_len = 0

    if not args.no_search_contigs:
        sf = SearchFiles(args.output_prefix,
                         not args.no_search,
                         args.gather,
                         contigs=True)

        for n, record in enumerate(screed_iter):
            # look at each contig individually
            mh = empty_mh.copy_and_clear()
            mh.add_sequence(record.sequence, force=True)
            # search, optionally aggregate matched hashes to get containment at rank

            seq_len = len(record.sequence)
            genome_len += seq_len
            num_hashes = len(mh.hashes)

            if not args.no_search:
                search_results, search_rank_results = search_containment_at_rank(
                    mh, lca_db, lin_db, match_rank)

                if not search_results:
                    # write to unclassified
                    sf.unmatched.write(">" + record.name + "\n" +
                                       record.sequence + "\n")
                    continue  # if no search results, don't bother with gather
                else:
                    # first, print normal search --containment results
                    for sr in search_results:
                        sf.write_result(sr,
                                        record.name,
                                        seq_len,
                                        result_type="search")
                    # now, print containment at rank results
                    for sr in search_rank_results:
                        sf.write_result(sr,
                                        record.name,
                                        seq_len,
                                        result_type="ranksearch")

            if args.gather:
                # first, gather at match rank (default genus)
                gather_results = list(
                    gather_at_rank(mh, lca_db, lin_db, match_rank))
                # write standard gather_results?

                if not gather_results:
                    # write to unclassified. should only get here if no search OR gather results
                    sf.unmatched.write(">" + record.name + "\n" +
                                       record.sequence + "\n")
                else:
                    # next, summarize at higher ranks
                    gather_taxonomy_per_rank = gather_guess_tax_at_each_rank(
                        gather_results, num_hashes,
                        minimum_matches=args.gather_min_matches,
                        lowest_rank=match_rank,
                        taxlist=lca_utils.taxlist(include_strain=False))
                    # results: list of RankSumGatherResult = namedtuple('RankSumGatherResult', 'lineage, f_ident, f_major')

                    # write taxonomy out
                    for gr in gather_taxonomy_per_rank:
                        sf.write_result(gr,
                                        record.name,
                                        seq_len,
                                        result_type="rankgather")

        print(f"Processed {n+1} contigs.")
        # close contig files
        sf.close()

    if args.search_genome:
        gf = SearchFiles(args.output_prefix,
                         not args.no_search,
                         args.gather,
                         contigs=False)
        # MAG workflow
        entire_mh = genome_sig.minhash
        genome_name = genome_sig.name()
        num_hashes = len(entire_mh.hashes)
        if not genome_len:
            for record in screed_iter:
                genome_len += len(record.sequence)
        if not args.no_search:
            # results are guaranteed; otherwise we would have exited before searching
            search_results, search_rank_results = search_containment_at_rank(
                entire_mh, lca_db, lin_db, match_rank)
            for sr in search_results:
                gf.write_result(sr,
                                genome_name,
                                genome_len,
                                result_type="search")
            for sr in search_rank_results:
                gf.write_result(sr,
                                genome_name,
                                genome_len,
                                result_type="ranksearch")
        if args.gather:
            gather_results = list(
                gather_at_rank(entire_mh, lca_db, lin_db, match_rank))
            # next, summarize at higher ranks
            gather_taxonomy_per_rank = gather_guess_tax_at_each_rank(
                gather_results, num_hashes,
                minimum_matches=args.gather_min_matches,
                lowest_rank=match_rank,
                taxlist=lca_utils.taxlist(include_strain=False))
            for gather_res in gather_taxonomy_per_rank:
                gf.write_result(gather_res,
                                genome_name,
                                genome_len,
                                result_type="rankgather")
        # close genome files
        gf.close()

    return 0