def main():
    """Load signatures and an LCA database, then summarize taxonomic purity.

    Command-line interface: one or more signature files followed by an
    LCA database path.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('sigs', nargs='+')
    parser.add_argument('lca_db')
    args = parser.parse_args()

    # collect the minhash from each input signature
    minhashes = []
    for filename in args.sigs:
        ss = sourmash.load_one_signature(filename)
        minhashes.append(ss.minhash)

    # load the LCA database
    dblist, ksize, scaled = lca_utils.load_databases([args.lca_db], None)
    db = dblist[0]

    # double check scaled requirements: only warn when the database is
    # coarser than the minhashes (equal scaled values are compatible, so
    # use a strict '>' comparison; '>=' would be a false positive).
    some_mh = minhashes[0]
    mh_scaled = some_mh.scaled
    if scaled > mh_scaled:
        print(
            '** warning: many minhashes will go unclassified because LCA database scaled is {}'
            .format(scaled),
            file=sys.stderr)
        print('** warning: the minhash scaled is {}'.format(mh_scaled),
              file=sys.stderr)

    summarize_taxonomic_purity(minhashes,
                               db,
                               verbose=True,
                               filenames=args.sigs)
Exemple #2
0
def test_databases():
    """Loading two LCA databases returns both, with the expected ksize/scaled."""
    paths = [
        utils.get_test_data('lca/delmont-1.lca.json'),
        utils.get_test_data('lca/delmont-2.lca.json'),
    ]
    dblist, ksize, scaled = lca_utils.load_databases(paths)

    print(dblist)

    assert len(dblist) == 2
    assert ksize == 31
    assert scaled == 10000
Exemple #3
0
def main(args):
    """Count k-mers shared across lineages in an LCA database and report
    the most 'confused' hash values.

    Parameters: ``args`` is a list of command-line argument strings
    (passed straight to ``argparse``). Writes confused hash values to
    ``confused_hashvals.txt`` in the current directory.
    """
    p = argparse.ArgumentParser()
    p.add_argument('db', nargs='+')
    # float so users can write scientific notation (e.g. 1e5);
    # converted to int below before use.
    p.add_argument('--scaled', type=float)
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   help='suppress non-error output')
    p.add_argument('-d',
                   '--debug',
                   action='store_true',
                   help='output debugging output')
    p.add_argument(
        '--minimum-num',
        type=int,
        default=0,
        help=
        'Minimum number of different lineages a k-mer must be in to be counted'
    )
    p.add_argument(
        '--minimum-hashes',
        type=int,
        default=5,
        help='Minimum number of hashes lineages must share to be reported')
    p.add_argument('--lowest-rank', default='phylum')
    p.add_argument('--prefix', default=None, help='prefix for output files')
    args = p.parse_args(args)

    # defensive check; 'db' is a required positional, so argparse normally
    # errors out before we get here. (Message fixed: there is no --db option.)
    if not args.db:
        error('Error! must specify at least one LCA database')
        sys.exit(-1)

    set_quiet(args.quiet, args.debug)

    if args.scaled:
        args.scaled = int(args.scaled)

    # load all the databases
    print('loading databases:', args.db)
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
    assert len(dblist) == 1

    # count all the LCAs across these databases
    counts, confused_hashvals = make_lca_counts(dblist,
                                                lowest_rank=args.lowest_rank,
                                                min_num=args.minimum_num,
                                                min_hashes=args.minimum_hashes,
                                                prefix=args.prefix)

    # record hash values that were shared across lineages, one per line
    with open('confused_hashvals.txt', 'wt') as fp:
        fp.write("\n".join([str(i) for i in confused_hashvals]))
Exemple #4
0
def main():
    """Load pickled minhashes and an LCA database, then summarize
    taxonomic purity of each group.

    Command-line interface: a base filename (the pickle is read from
    ``<filename>.node_mh``) followed by an LCA database path.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    parser.add_argument('lca_db')
    args = parser.parse_args()

    # load the minhashes calculated by search.characterize_catlas_regions;
    # use a context manager so the file handle is closed promptly.
    with open(args.filename + '.node_mh', 'rb') as fp:
        group_ident = pickle.load(fp)

    # load the LCA database
    dblist, ksize, scaled = lca_utils.load_databases([args.lca_db], None)
    db = dblist[0]

    # double check scaled requirements
    some_mh = next(iter(group_ident.values()))
    mh_scaled = some_mh.scaled
    if scaled > mh_scaled:
        print('** warning: many minhashes will go unclassified because LCA database scaled is {}'.format(scaled), file=sys.stderr)
        print('** warning: the minhash scaled is {}'.format(mh_scaled), file=sys.stderr)

    summarize_taxonomic_purity(group_ident.values(), db, verbose=True)
def main(args):
    """Count k-mers shared across the given LCA databases and write
    per-lineage counts (and percentages) to the CSV file at ``args.csv``."""
    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.ksize,
                                                     args.scaled)

    # count all the shared kmers across these databases
    counts, total_kmer_count = count_shared_kmers(dblist)

    # write one CSV row per lineage
    with open(args.csv, 'wt') as outfp:
        writer = csv.writer(outfp)
        writer.writerow(
            ['rank', 'lineage', 'num_shared_kmers', 'percent_shared_kmers'])

        for lineage, shared in counts.items():
            frac = float(shared) / total_kmer_count
            row = [
                lineage[-1].rank,
                lca_utils.display_lineage(lineage),
                str(shared),
                str(round(frac, 2)),
            ]
            writer.writerow(row)