Beispiel #1
0
def main():
    """Classify fixed-size fragments of genome contigs against an LCA database.

    Command-line arguments (parsed from ``sys.argv``):

    * ``lca_db`` -- database passed to ``lca_utils.load_single_database``.
    * ``genome`` -- one or more sequence files opened with ``screed.open``.
    * ``output`` -- path of the CSV file to write.
    * ``--fragment`` -- fragment length in bases (default 100000); must be
      at least 1.

    Every contig is cut into consecutive ``--fragment``-sized pieces.  Each
    piece is sketched with a MinHash built from the database's ksize/scaled
    values and handed to ``summarize``; one CSV row is written per lineage
    key that ``summarize`` returns.  Pieces whose sketch is empty are
    skipped.

    Returns:
        0 on completion.
    """
    p = argparse.ArgumentParser()
    p.add_argument('lca_db')
    p.add_argument('genome', nargs='+')
    p.add_argument('output')
    p.add_argument('--fragment', default=100000, type=int)
    args = p.parse_args()

    # A zero step makes range() raise and a negative one silently yields no
    # fragments at all, so reject both up front with a usage error.
    if args.fragment < 1:
        p.error('--fragment must be a positive integer')

    db, ksize, scaled = lca_utils.load_single_database(args.lca_db)
    mh_factory = sourmash.MinHash(n=0, ksize=ksize, scaled=scaled)
    print('**', ksize, scaled)

    # newline='' is what the csv module expects of the file it writes to;
    # the context manager guarantees the file is flushed and closed.
    with open(args.output, 'wt', newline='') as outfp:
        w = csv.writer(outfp)
        w.writerow(['filename', 'contig', 'begin', 'end', 'lca', 'lca_rank'])

        #
        # iterate over all contigs in genome file
        #
        for genome in args.genome:
            for record in screed.open(genome):
                sequence = record.sequence

                # fragment longer contigs into smaller regions
                for start in range(0, len(sequence), args.fragment):
                    seq = sequence[start:start + args.fragment]

                    mh = mh_factory.copy_and_clear()
                    mh.add_sequence(seq, force=True)
                    if not mh:
                        # nothing was sketched for this fragment
                        continue

                    lineage_counts = summarize(mh.get_mins(), [db], 1)

                    for k in lineage_counts:
                        lca = lca_utils.display_lineage(k,
                                                        truncate_empty=False)
                        try:
                            lca_rank = k[-1].rank
                        except IndexError:
                            # empty lineage: there is no last rank to report
                            lca_rank = "none"

                        # NOTE(review): 'end' is start + fragment even when
                        # the final fragment is shorter, so it can exceed the
                        # contig length; kept as-is for output compatibility.
                        w.writerow((genome, record.name, start,
                                    start + args.fragment, lca, lca_rank))

    return 0
Beispiel #2
0
    def write_result(self, result, name, length, result_type="search"):
        """Write one result row to the CSV writer selected by ``result_type``.

        The row is ``result._asdict()`` extended with ``name``, ``length``
        and the displayed ``lineage``.  For "search" and "ranksearch" the
        ``match`` entry is moved out of the row into the corresponding
        signature list before writing.  Nothing is written when
        ``result_type`` does not correspond to an enabled output.
        """
        row = dict(result._asdict())
        row.update(name=name, length=length)
        row["lineage"] = lca_utils.display_lineage(result.lineage)

        if self.search and result_type == "search":
            # take the match out of the row and keep it with the search sigs
            self.search_sigs.append(row.pop('match'))
            self.search_w.writerow(row)
            return

        if self.search and result_type == "ranksearch":
            self.ranksearch_sigs.append(row.pop('match'))
            row["match_rank"] = result.lineage[-1].rank
            self.rank_w.writerow(row)
            return

        if self.gather and result_type == "rankgather":
            # the row is written as-is here, plus the rank of the last
            # lineage entry
            row["match_rank"] = result.lineage[-1].rank
            self.gather_rank_w.writerow(row)
def main(args):
    """Write a CSV of shared k-mer counts per lineage across LCA databases.

    Args:
        args: parsed arguments providing ``db``, ``ksize`` and ``scaled``
            (all passed to ``lca_utils.load_databases``) and ``csv``, the
            path of the output file.

    The output has one row per lineage returned by ``count_shared_kmers``
    with the columns ``rank``, ``lineage``, ``num_shared_kmers`` and
    ``percent_shared_kmers``.
    """
    # load all the databases; the returned ksize/scaled are not needed here
    dblist, _ksize, _scaled = lca_utils.load_databases(args.db, args.ksize,
                                                       args.scaled)

    # count all the shared kmers across these databases
    counts, total_kmer_count = count_shared_kmers(dblist)

    # write out; newline='' is what the csv module expects of its file
    with open(args.csv, 'wt', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(
            ['rank', 'lineage', 'num_shared_kmers', 'percent_shared_kmers'])

        for lineage, shared_kmer_count in counts.items():
            rank = lineage[-1].rank
            # despite the column name this is a fraction of the total, not
            # a value multiplied by 100
            percent_shared_kmers = float(shared_kmer_count) / total_kmer_count
            writer.writerow([
                rank,
                lca_utils.display_lineage(lineage),
                str(shared_kmer_count),
                str(round(percent_shared_kmers, 2)),
            ])
Beispiel #4
0
def make_lca_counts(dblist,
                    lowest_rank='phylum',
                    min_num=0,
                    min_hashes=5,
                    prefix='oddities'):
    """
    Collect counts of all the LCAs in the list of databases.

    Hash values are grouped into "clusters" by the combination of lineages
    they are assigned to.  For every cluster with enough hash values, all
    pairs of member genomes are compared; pairs whose LCA equals the
    cluster's LCA and which share enough hashes are printed and written to
    ``prefix + '.csv'``.

    Args:
        dblist: list holding exactly one LCA database.
        lowest_rank: hash values are clustered only when the rank of their
            LCA is 'root' or one of the ranks from ``lca_utils.taxlist()``
            up to and including this one.
        min_num: when non-zero, hash values present in fewer than this many
            database entries are ignored.
        min_hashes: minimum number of hash values for a cluster to be
            reported, and minimum number of shared hashes for a genome pair
            to be reported.
        prefix: the CSV summary is written to ``prefix + '.csv'``.

    Returns:
        A tuple ``(counts, confused_hashvals)``: ``counts`` maps each LCA
        to the number of hash values that have it, and ``confused_hashvals``
        is the set of hash values belonging to the reported clusters.
    """
    assert len(dblist) == 1
    db = dblist[0]

    keep_ranks = ['root']
    for rank in lca_utils.taxlist():
        keep_ranks.append(rank)
        if rank == lowest_rank:
            break
    print('keeping hashvals at following ranks:', keep_ranks)
    print('min number of lineages:', min_num)
    print('min number of shared hashes:', min_hashes)

    print('---')

    # gather all hashvalue assignments from across all the databases
    assignments = defaultdict(set)
    for lca_db in dblist:
        for hashval, idx_list in lca_db.hashval_to_idx.items():
            if min_num and len(idx_list) < min_num:
                continue

            for idx in idx_list:
                lid = lca_db.idx_to_lid.get(idx)
                if lid is not None:
                    lineage = lca_db.lid_to_lineage[lid]
                    assignments[hashval].add(lineage)

    # now convert to trees -> do LCA & counts
    counts = defaultdict(int)
    mixdict = defaultdict(set)
    for hashval, lineages in assignments.items():

        # for each list of tuple_info [(rank, name), ...] build
        # a tree that lets us discover lowest-common-ancestor.
        debug("{}", lineages)
        tree = lca_utils.build_tree(lineages)

        # now find either a leaf or the first node with multiple
        # children; that's our lowest-common-ancestor node.
        lca, _reason = lca_utils.find_lca(tree)

        # an empty LCA is treated as 'root'
        rank = lca[-1].rank if lca else 'root'

        # record the combination of lineages for hashes whose LCA sits at
        # one of the kept ranks.
        # NOTE(review): the key follows the set's iteration order, so two
        # equal sets that iterate differently would land in separate
        # clusters -- confirm whether that matters in practice.
        if rank in keep_ranks:
            lineage_combo = tuple(tuple(lineage) for lineage in lineages)
            mixdict[lineage_combo].add(hashval)

        counts[lca] += 1

    # sort on number of confused hash vals by combination of lineages.
    mixdict_items = sorted(mixdict.items(), key=lambda item: -len(item[1]))

    confused_hashvals = set()

    # newline='' is what the csv module expects of its file; the context
    # manager makes sure the summary is flushed and closed on every path.
    with open(prefix + '.csv', 'wt', newline='') as fp:
        w = csv.writer(fp)
        w.writerow([
            'cluster', 'num_lineages', 'shared_kmers', 'ksize', 'rank', 'lca',
            'ident1', 'lineage1', 'ident2', 'lineage2'
        ])

        #
        # find candidate lineages, then evaluate pairwise intersections.
        #

        for cluster_n, (lineages, hashvals) in enumerate(mixdict_items):
            # insist on more than N hash vals
            if len(hashvals) < min_hashes:
                continue

            # display summary:
            print('cluster {} has {} assignments for {} hashvals / {} bp'.format(
                cluster_n, len(lineages), len(hashvals),
                db.scaled * len(hashvals)))
            confused_hashvals.update(hashvals)

            tree = lca_utils.build_tree(lineages)
            lca, _reason = lca_utils.find_lca(tree)
            rank = lca[-1].rank if lca else 'root'
            print('  rank & lca:', rank, lca_utils.display_lineage(lca))

            # now, identify all members of these lineages by their index:
            all_idxs = []
            for lineage in lineages:
                for lid in db.lineage_to_lids[lineage]:
                    all_idxs.extend(db.lid_to_idx[lid])

            # run through and look at all pairs of genomes in these lineages;
            # filter so that we're comparing across lineages with the right
            # LCA, and with significant intersection.
            pair_n = 0
            candidates = []
            for i, idx1 in enumerate(all_idxs):
                lid1 = db.idx_to_lid[idx1]
                lin1 = db.lid_to_lineage[lid1]

                # pair each genome with every genome listed before it
                for idx2 in all_idxs[:i]:
                    lid2 = db.idx_to_lid[idx2]
                    lin2 = db.lid_to_lineage[lid2]

                    ident1 = db.idx_to_ident[idx1]
                    ident2 = db.idx_to_ident[idx2]

                    debug("{} x {}", ident1, ident2)

                    this_tree = lca_utils.build_tree([lin1, lin2])
                    this_lca, _this_reason = lca_utils.find_lca(this_tree)

                    # weed out pairs that don't have the desired intersection
                    if lca != this_lca:
                        continue

                    mh1 = db._signatures[idx1]
                    mh2 = db._signatures[idx2]

                    mins1 = set(mh1.get_mins())
                    mins2 = set(mh2.get_mins())
                    intersect_size = len(mins1.intersection(mins2))

                    # weed out pairs that don't have enough k-mer
                    # intersection
                    if intersect_size < min_hashes:
                        continue

                    candidates.append(
                        (pair_n, ident1, lin1, ident2, lin2, intersect_size))

                    # write summary to CSV for find-oddities-examine.py to use
                    w.writerow([
                        'cluster{}.{}'.format(cluster_n, pair_n),
                        len(lineages), intersect_size * db.scaled,
                        db.ksize, rank,
                        lca_utils.display_lineage(lca), ident1,
                        lca_utils.display_lineage(lin1), ident2,
                        lca_utils.display_lineage(lin2)
                    ])

                    pair_n += 1

            print('  Candidate genome pairs for these lineages:')
            for (cand_n, ident1, lin1, ident2, lin2,
                 intersection_size) in candidates:
                print('    cluster.pair {}.{} share {} bases'.format(
                    cluster_n, cand_n, intersection_size * db.scaled))
                print('    - {} ({})'.format(ident1,
                                             lca_utils.display_lineage(lin1)))
                print('    - {} ({})'.format(ident2,
                                             lca_utils.display_lineage(lin2)))
                print('')

            print('')

    return counts, confused_hashvals
Beispiel #5
0
def test_display_lineage_2():
    """A lineage with an empty middle entry renders as an empty field."""
    lineage = [
        LineagePair('superkingdom', 'a'),
        LineagePair(None, ''),
        LineagePair('class', 'c'),
    ]
    rendered = display_lineage(lineage)
    assert rendered == "a;;c", rendered
Beispiel #6
0
def test_display_lineage_1():
    """A two-entry lineage renders as its names joined by ';'."""
    lineage = [
        LineagePair('superkingdom', 'a'),
        LineagePair('phylum', 'b'),
    ]
    rendered = display_lineage(lineage)
    assert rendered == "a;b", rendered