Esempio n. 1
0
def main():
    """Summarize taxonomic purity for a pickled collection of minhashes.

    Command-line arguments:
        filename: path prefix; the pickle is read from ``filename + '.node_mh'``.
        lca_db: path to one LCA database.

    The pickle is expected to hold a mapping whose values are minhash objects
    with a ``scaled`` attribute. A warning is written to stderr when the LCA
    database's scaled value is larger than that of the minhashes, and the
    minhashes are then passed to ``summarize_taxonomic_purity``.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    parser.add_argument('lca_db')
    args = parser.parse_args()

    # load the minhashes calculated by search.characterize_catlas_regions.
    # NOTE: unpickling can execute arbitrary code; only use trusted files.
    # The context manager makes sure the file handle is closed after loading.
    with open(args.filename + '.node_mh', 'rb') as fp:
        group_ident = pickle.load(fp)

    # load the LCA database
    dblist, ksize, scaled = lca_utils.load_databases([args.lca_db], None)
    db = dblist[0]

    # double check scaled requirements, using an arbitrary minhash as the
    # representative for the whole collection.
    some_mh = next(iter(group_ident.values()))
    mh_scaled = some_mh.scaled
    if scaled > mh_scaled:
        print(
            '** warning: many minhashes will go unclassified because LCA database scaled is {}'
            .format(scaled),
            file=sys.stderr)
        print('** warning: the minhash scaled is {}'.format(mh_scaled),
              file=sys.stderr)

    summarize_taxonomic_purity(group_ident.values(), db, verbose=True)
Esempio n. 2
0
def test_databases():
    """Load two test LCA databases together and check count, ksize and scaled."""
    db_paths = [
        utils.get_test_data(relpath)
        for relpath in ('lca/delmont-1.lca.json', 'lca/delmont-2.lca.json')
    ]
    loaded = lca_utils.load_databases(db_paths)
    dblist, ksize, scaled = loaded

    # shown in the captured output to help diagnose a failure
    print(dblist)

    assert len(dblist) == 2
    assert ksize == 31
    assert scaled == 10000
def main():
    """Summarize taxonomic purity for a pickled collection of minhashes.

    Command-line arguments:
        filename: path prefix; the pickle is read from ``filename + '.node_mh'``.
        lca_db: path to one LCA database.

    The pickle is expected to hold a mapping whose values are minhash objects
    with a ``scaled`` attribute. A warning is written to stderr when the LCA
    database's scaled value is larger than that of the minhashes, and the
    minhashes are then passed to ``summarize_taxonomic_purity``.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    parser.add_argument('lca_db')
    args = parser.parse_args()

    # load the minhashes calculated by search.characterize_catlas_regions.
    # NOTE: unpickling can execute arbitrary code; only use trusted files.
    # The context manager makes sure the file handle is closed after loading.
    with open(args.filename + '.node_mh', 'rb') as fp:
        group_ident = pickle.load(fp)

    # load the LCA database
    dblist, ksize, scaled = lca_utils.load_databases([args.lca_db], None)
    db = dblist[0]

    # double check scaled requirements, using an arbitrary minhash as the
    # representative for the whole collection.
    some_mh = next(iter(group_ident.values()))
    mh_scaled = some_mh.scaled
    if scaled > mh_scaled:
        print('** warning: many minhashes will go unclassified because LCA database scaled is {}'.format(scaled), file=sys.stderr)
        print('** warning: the minhash scaled is {}'.format(mh_scaled), file=sys.stderr)

    summarize_taxonomic_purity(group_ident.values(), db, verbose=True)
Esempio n. 4
0
def rankinfo_main(args):
    """
    Print, for each taxonomic rank, how many LCA counts fall at that rank.

    `args` is the list of command-line arguments: one or more LCA database
    paths, plus optional `--scaled` and `-d/--debug`.

    Each non-empty lineage returned by `make_lca_counts` contributes its count
    to the rank of its last element. One line per rank in
    `lca_utils.taxlist()` is printed to stdout as `rank: count (pct%)`.
    """
    p = argparse.ArgumentParser(prog="sourmash lca rankinfo")
    p.add_argument('db', nargs='+')
    p.add_argument('--scaled', type=float)
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    # argparse already rejects an empty 'db' list (nargs='+'); this check is
    # kept as a defensive guard.
    if not args.db:
        error('Error! must specify at least one LCA database with --db')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # --scaled is parsed as float so that values like 1e4 are accepted
    if args.scaled:
        args.scaled = int(args.scaled)

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)

    # count all the LCAs across these databases
    counts = make_lca_counts(dblist)

    # collect counts across all ranks
    counts_by_rank = defaultdict(int)
    for lineage, count in counts.items():
        if lineage:
            lineage_tup = lineage[-1]
            counts_by_rank[lineage_tup.rank] += count

    # output!
    total = float(sum(counts_by_rank.values()))
    for rank in lca_utils.taxlist():
        count = counts_by_rank.get(rank, 0)
        # with no counted lineages at all, report 0% instead of dividing by 0
        pct = count / total * 100. if total else 0.
        print('{}: {} ({:.1f}%)'.format(rank, count, pct))
Esempio n. 5
0
def main():
    """Summarize taxonomic purity for a set of signature files.

    Command-line arguments:
        sigs: one or more signature files; one signature is loaded from each.
        lca_db: path to one LCA database.

    A warning is written to stderr when the LCA database's scaled value is
    larger than that of the first loaded minhash; the minhashes are then
    passed to ``summarize_taxonomic_purity`` along with their filenames.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('sigs', nargs='+')
    parser.add_argument('lca_db')
    args = parser.parse_args()

    minhashes = [
        sourmash_lib.load_one_signature(filename).minhash
        for filename in args.sigs
    ]

    # load the LCA database
    dblist, ksize, scaled = lca_utils.load_databases([args.lca_db], None)
    db = dblist[0]

    # double check scaled requirements, using the first minhash as the
    # representative. Only a strictly larger database scaled is a problem;
    # equal values do not warrant a warning.
    some_mh = minhashes[0]
    mh_scaled = some_mh.scaled
    if scaled > mh_scaled:
        print('** warning: many minhashes will go unclassified because LCA database scaled is {}'.format(scaled), file=sys.stderr)
        print('** warning: the minhash scaled is {}'.format(mh_scaled), file=sys.stderr)

    summarize_taxonomic_purity(minhashes, db, verbose=True, filenames=args.sigs)
Esempio n. 6
0
def gather_main(args):
    """
    Do a greedy search for the hash components of a query against an LCA db.

    Here we don't actually do a least-common-ancestor search of any kind; we
    do essentially the same kind of search as we do in `sourmash gather`, with
    the main difference that we are implicitly combining different genomes of
    identical lineages.

    This takes advantage of the structure of the LCA db, where we store the
    full lineage information for each known hash, as opposed to storing only
    the least-common-ancestor information for it.

    `args` is the list of command-line arguments. Matches are printed as a
    table; `-o/--output` additionally writes them as CSV, and
    `--output-unassigned` saves the unmatched hashes as a signature. When
    nothing matches, the function reports that and exits with status 0
    without writing any CSV rows or signature.
    """
    p = argparse.ArgumentParser(prog="sourmash lca gather")
    p.add_argument('query')
    p.add_argument('db', nargs='+')
    p.add_argument('-d', '--debug', action='store_true')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   help='output CSV containing matches to this file')
    p.add_argument('--output-unassigned', type=argparse.FileType('wt'),
                   help='output unassigned portions of the query as a signature to this file')
    p.add_argument('--ignore-abundance',  action='store_true',
                   help='do NOT use k-mer abundances if present')
    args = p.parse_args(args)

    if args.debug:
        set_debug(args.debug)

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, None)

    # for each query, gather all the matches across databases
    query_sig = sourmash_args.load_query_signature(args.query, ksize, 'DNA')
    debug('classifying', query_sig.name())

    # make sure we're looking at the same scaled value as database
    query_sig.minhash = query_sig.minhash.downsample_scaled(scaled)

    # do the classification, output results
    found = []
    for result, f_unassigned, est_bp, remaining_mins in gather_signature(query_sig, dblist, args.ignore_abundance):
        # is this our first time through the loop? print headers, if so.
        if not found:
            print_results("")
            print_results("overlap     p_query p_match ")
            print_results("---------   ------- --------")

        # output!
        pct_query = '{:.1f}%'.format(result.f_unique_to_query*100)
        pct_match = '{:.1f}%'.format(result.f_match*100)
        str_bp = format_bp(result.intersect_bp)
        name = format_lineage(result.lineage)

        equal_match_str = ""
        if result.n_equal_matches:
            equal_match_str = " (** {} equal matches)".format(result.n_equal_matches)

        print_results('{:9}   {:>6}  {:>6}      {}{}', str_bp, pct_query,
                      pct_match, name, equal_match_str)

        found.append(result)

    if found:
        # f_unassigned / est_bp hold the values from the last loop iteration
        print_results('')
        if f_unassigned:
            print_results('{:.1f}% ({}) of hashes have no assignment.', f_unassigned*100,
                          format_bp(est_bp))
        else:
            print_results('Query is completely assigned.')
            print_results('')
    # nothing found.
    else:
        est_bp = len(query_sig.minhash.get_mins()) * query_sig.minhash.scaled
        print_results('')
        print_results('No assignment for est {} of sequence.',
                      format_bp(est_bp))
        print_results('')

    if not found:
        # Tell the user why no unassigned signature is written; this has to
        # happen before exiting or the message can never be shown.
        if args.output_unassigned:
            notify('nothing found - entire query signature unassigned.')
        sys.exit(0)

    if args.output:
        fieldnames = ['intersect_bp', 'f_match', 'f_unique_to_query', 'f_unique_weighted',
                      'average_abund', 'name', 'n_equal_matches'] + list(lca_utils.taxlist())

        w = csv.DictWriter(args.output, fieldnames=fieldnames)
        w.writeheader()
        for result in found:
            lineage = result.lineage
            d = dict(result._asdict())
            del d['lineage']

            # spread the lineage out into one column per rank
            for (rank, value) in lineage:
                d[rank] = value

            w.writerow(d)

    if args.output_unassigned:
        # remaining_mins holds the value from the last loop iteration
        if not remaining_mins:
            notify('no unassigned hashes! not saving.')
        else:
            outname = args.output_unassigned.name
            notify('saving unassigned hashes to "{}"', outname)

            e = query_sig.minhash.copy_and_clear()
            e.add_many(remaining_mins)

            sourmash_lib.save_signatures([ sourmash_lib.SourmashSignature(e) ],
                                         args.output_unassigned)
Esempio n. 7
0
def classify(args):
    """
    main single-genome classification function.

    `args` is the list of command-line arguments. Every signature loaded from
    the `--query` files is downsampled to the databases' scaled value,
    classified against the `--db` LCA databases with `classify_signature`,
    and written as one CSV row (ID, status, then one column per rank) to
    `--output` or stdout. Exits with status -1 if `--db` or `--query` is
    missing.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--db', nargs='+', action='append')
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--threshold', type=int, default=DEFAULT_THRESHOLD)
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   help='output CSV to this file instead of stdout')
    p.add_argument('--scaled', type=float)
    p.add_argument('--traverse-directory',
                   action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    if not args.db:
        error('Error! must specify at least one LCA database with --db')
        sys.exit(-1)

    if not args.query:
        error('Error! must specify at least one query signature with --query')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # --scaled is parsed as float so that values like 1e4 are accepted;
    # convert to int before use, as the other LCA subcommands do.
    if args.scaled:
        args.scaled = int(args.scaled)

    # flatten --db and --query
    args.db = [item for sublist in args.db for item in sublist]
    args.query = [item for sublist in args.query for item in sublist]

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
    notify('ksize={} scaled={}', ksize, scaled)

    # find all the queries
    notify('finding query signatures...')
    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.query))
    else:
        inp_files = list(args.query)

    # set up output
    csvfp = csv.writer(sys.stdout)
    if args.output:
        notify("outputting classifications to '{}'", args.output.name)
        csvfp = csv.writer(args.output)
    else:
        notify("outputting classifications to stdout")
    csvfp.writerow(['ID', 'status'] + list(lca_utils.taxlist()))

    # for each query, gather all the matches across databases
    total_count = 0
    n = 0
    total_n = len(inp_files)
    for query_filename in inp_files:
        n += 1
        for query_sig in sourmash_lib.load_signatures(query_filename,
                                                      ksize=ksize):
            # '\r\033[K' returns to line start and clears the progress line
            notify(u'\r\033[K', end=u'')
            notify('... classifying {} (file {} of {})',
                   query_sig.name(),
                   n,
                   total_n,
                   end='\r')
            debug('classifying', query_sig.name())
            total_count += 1

            # make sure we're looking at the same scaled value as database
            query_sig.minhash = query_sig.minhash.downsample_scaled(scaled)

            # do the classification
            lineage, status = classify_signature(query_sig, dblist,
                                                 args.threshold)
            debug(lineage)

            # output each classification to the spreadsheet
            row = [query_sig.name(), status]
            row += lca_utils.zip_lineage(lineage)

            # when outputting to stdout, make output intelligible
            if not args.output:
                notify(u'\r\033[K', end=u'')
            csvfp.writerow(row)

    notify(u'\r\033[K', end=u'')
    notify('classified {} signatures total', total_count)
Esempio n. 8
0
def main():
    """Report LCA lineages for hash values that occur in many samples.

    Command-line arguments:
        revindex: path handed to ``revindex_utils.HashvalRevindex``.
        db: one or more LCA databases.
        --sample-threshold: minimum number of samples a hash must occur in.
        --abundance-threshold: minimum abundance for a sample to be counted.
        -k/--ksize: parsed but not used by this function.

    For every hash value at or above the sample threshold, the lineage found
    via ``lca_utils.find_lca`` is printed to stderr. Hashes with no lineage in
    any database are tallied by sample count, and that distribution is printed
    to stdout as CSV (``commonality,n,sum_n``).
    """
    p = argparse.ArgumentParser()
    p.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    p.add_argument('--sample-threshold',
                   default=DEFAULT_SAMPLE_THRESHOLD,
                   type=int)
    p.add_argument('--abundance-threshold',
                   default=DEFAULT_ABUND_THRESHOLD,
                   type=int)
    p.add_argument('revindex')
    p.add_argument('db', nargs='+')
    args = p.parse_args()

    idx = revindex_utils.HashvalRevindex(args.revindex)

    lca_db_list, _ksize, _scaled = lca_utils.load_databases(args.db, SCALED)

    # for each hash value, count the samples where it is abundant enough
    cnt = collections.Counter()
    for k, v in idx.hashval_to_abunds.items():
        cnt[k] += sum(1 for abund in v if abund >= args.abundance_threshold)

    total = 0
    found = 0
    unknown = collections.defaultdict(int)
    for hashval, count in cnt.most_common():
        # most_common() yields counts in descending order, so we can stop at
        # the first hash seen in fewer than --sample-threshold samples.
        if count < args.sample_threshold:
            break
        total += 1
        lca_set = set()

        for lca_db in lca_db_list:
            lineages = lca_db.get_lineage_assignments(hashval)
            lca_set.update(lineages)

        if not lca_set:
            unknown[count] += 1
            continue

        # for each list of tuple_info [(rank, name), ...] build
        # a tree that lets us discover lowest-common-ancestor.
        tree = lca_utils.build_tree(lca_set)

        # now find either a leaf or the first node with multiple
        # children; that's our lowest-common-ancestor node.
        lca, _reason = lca_utils.find_lca(tree)

        print('hash {}, in {} samples; lineage: {}'.format(
            hashval, count, ";".join(lca_utils.zip_lineage(lca))),
              file=sys.stderr)
        found += 1

    # if no hash reached the threshold, report 0% instead of dividing by zero
    pct_found = found / total * 100 if total else 0.0
    print('found {} of {} ({:.2f}%)'.format(found, total, pct_found),
          file=sys.stderr)
    print('outputting distribution of unknowns', file=sys.stderr)
    print('commonality,n,sum_n')

    # running total of unknown hashes, in order of increasing sample count
    sofar = 0
    for commonality, n_unknown in sorted(unknown.items()):
        sofar += n_unknown
        print('{},{},{}'.format(commonality, n_unknown, sofar))
Esempio n. 9
0
def main():
    """Subsample pooled hash values and track how many are distinct / known.

    Signatures are loaded from ``sigs`` (optionally by traversing
    directories), downsampled to ``--scaled``, and their hash values pooled
    into one list. For each subsample size from 0 up to the pool size in
    increments of ``--step``, ``--repeat`` random subsamples are drawn, and
    the average number of distinct hash values and of hash values present in
    the ``--db`` LCA databases is recorded.

    Results go to ``--output`` as CSV (columns ``n``, ``k``, ``known``) and/or
    to ``--plot`` as a plot of distinct count against subsample size.
    """
    p = argparse.ArgumentParser()
    p.add_argument('sigs', nargs='+')
    p.add_argument('--traverse-directory',
                   action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('-k', '--ksize', default=31, type=int)
    p.add_argument('-d', '--debug', action='store_true')
    p.add_argument('-f', '--force', action='store_true')
    p.add_argument('--scaled', type=float, default=10000)
    p.add_argument('--plot', default=None)
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   help='CSV output')
    p.add_argument('--step', type=int, default=1000)
    p.add_argument('--repeat', type=int, default=5)
    p.add_argument('--db', nargs='+', action='append')
    args = p.parse_args()

    if args.debug:
        set_debug(args.debug)

    # --scaled is parsed as float so that values like 1e4 are accepted
    args.scaled = int(args.scaled)

    dblist = []
    known_hashes = set()
    if args.db:
        # flatten --db
        args.db = [item for sublist in args.db for item in sublist]
        dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
        assert ksize == args.ksize
        notify('loaded {} LCA databases', len(dblist))

        for db in dblist:
            known_hashes.update(db.hashval_to_lineage_id.keys())
        notify('got {} known hashes!', len(known_hashes))

    notify('finding signatures...')
    if args.traverse_directory:
        # without --force, only pick up *.sig files?
        yield_all_files = bool(args.force)
        inp_files = list(
            sourmash_args.traverse_find_sigs(args.sigs,
                                             yield_all_files=yield_all_files))
    else:
        inp_files = list(args.sigs)

    n = 0
    total_n = len(inp_files)
    sigs = []
    total_hashvals = []
    for filename in inp_files:
        n += 1
        for sig in sourmash_lib.load_signatures(filename, ksize=args.ksize):
            # '\r\033[K' returns to line start and clears the progress line
            notify(u'\r\033[K', end=u'')
            notify('... loading signature {} (file {} of {})',
                   sig.name()[:30],
                   n,
                   total_n,
                   end='\r')
            debug(filename, sig.name())

            sig.minhash = sig.minhash.downsample_scaled(args.scaled)

            total_hashvals.extend(sig.minhash.get_mins())
            sigs.append(sig)

    notify(u'\r\033[K', end=u'')
    notify('...found {} signatures total in {} files.', len(sigs), total_n)

    distinct_hashvals = set(total_hashvals)
    notify('{} distinct out of {} total hashvals.', (len(distinct_hashvals)),
           len(total_hashvals))
    if known_hashes:
        n_known = len(known_hashes.intersection(distinct_hashvals))
        # no hashvals loaded at all: report 0% instead of dividing by zero
        if distinct_hashvals:
            pct_known = n_known / float(len(distinct_hashvals)) * 100
        else:
            pct_known = 0.0
        notify('{} of them known, or {:.1f}%', n_known, pct_known)

    x = []  # subsample sizes
    y = []  # average number of distinct hashvals per subsample
    z = []  # average number of known hashvals per subsample
    notify('subsampling...')
    for sample_size in range(0, len(total_hashvals), args.step):
        notify(u'\r\033[K', end=u'')
        notify('... {} of {}', sample_size, len(total_hashvals), end='\r')
        avg = 0
        known = 0
        for _ in range(args.repeat):
            # build the set once and reuse it for both tallies
            distinct_sample = set(random.sample(total_hashvals, sample_size))
            if known_hashes:
                known += len(distinct_sample.intersection(known_hashes))
            avg += len(distinct_sample)

        x.append(sample_size)
        y.append(avg / args.repeat)
        z.append(known / args.repeat)

    notify('\n...done!')

    if args.output:
        w = csv.writer(args.output)
        w.writerow(['n', 'k', 'known'])
        for a, b, c in zip(x, y, z):
            w.writerow([a, b, c])

    if args.plot:
        # imported lazily so matplotlib is only needed when plotting
        from matplotlib import pyplot
        pyplot.plot(x, y)
        pyplot.savefig(args.plot)
Esempio n. 10
0
def summarize_main(args):
    """
    main summarization function.

    `args` is the list of command-line arguments. The hash values of every
    signature loaded from the `--query` files are downsampled to the
    databases' scaled value and pooled (counting occurrences), then passed to
    `summarize` with the `--db` LCA databases and `--threshold`. Each
    resulting lineage is printed with its count and its percentage of the
    distinct hash values; `-o/--output` additionally writes the counts as CSV.
    Exits with status -1 if `--db` or `--query` is missing.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--db', nargs='+', action='append')
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--threshold', type=int, default=DEFAULT_THRESHOLD)
    p.add_argument('--traverse-directory',
                   action='store_true',
                   help='load all signatures underneath directories.')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   help='CSV output')
    p.add_argument('--scaled', type=float)
    p.add_argument('-d', '--debug', action='store_true')
    args = p.parse_args(args)

    if not args.db:
        error('Error! must specify at least one LCA database with --db')
        sys.exit(-1)

    if not args.query:
        error('Error! must specify at least one query signature with --query')
        sys.exit(-1)

    if args.debug:
        set_debug(args.debug)

    # --scaled is parsed as float so that values like 1e4 are accepted
    if args.scaled:
        args.scaled = int(args.scaled)

    # flatten --db and --query
    args.db = [item for sublist in args.db for item in sublist]
    args.query = [item for sublist in args.query for item in sublist]

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled)
    notify('ksize={} scaled={}', ksize, scaled)

    # find all the queries
    notify('finding query signatures...')
    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.query))
    else:
        inp_files = list(args.query)

    # for each query, gather all the hashvals across databases
    total_count = 0
    n = 0
    total_n = len(inp_files)
    hashvals = defaultdict(int)
    for query_filename in inp_files:
        n += 1
        for query_sig in sourmash_lib.load_signatures(query_filename,
                                                      ksize=ksize):
            # '\r\033[K' returns to line start and clears the progress line
            notify(u'\r\033[K', end=u'')
            notify('... loading {} (file {} of {})',
                   query_sig.name(),
                   n,
                   total_n,
                   end='\r')
            total_count += 1

            mh = query_sig.minhash.downsample_scaled(scaled)
            for hashval in mh.get_mins():
                hashvals[hashval] += 1

    notify(u'\r\033[K', end=u'')
    notify('loaded {} signatures from {} files total.', total_count, n)

    # get the full counted list of lineage counts in this signature
    lineage_counts = summarize(hashvals, dblist, args.threshold)

    # output!
    total = float(len(hashvals))
    for (lineage, count) in lineage_counts.items():
        if lineage:
            lineage = lca_utils.zip_lineage(lineage, truncate_empty=True)
            lineage = ';'.join(lineage)
        else:
            lineage = '(root)'

        pct = '{:.1f}%'.format(count / total * 100.)

        # Pass the values as arguments rather than pre-formatting, so braces
        # in a lineage name cannot be re-interpreted as format fields.
        print_results('{:5} {:>5}   {}', pct, count, lineage)

    # CSV:
    if args.output:
        w = csv.writer(args.output)
        headers = ['count'] + list(lca_utils.taxlist())
        w.writerow(headers)

        for (lineage, count) in lineage_counts.items():
            debug('lineage:', lineage)
            row = [count] + lca_utils.zip_lineage(lineage)
            w.writerow(row)