Example #1
0
def make_all_matches(sigdict, tree, threshold):
    """
    Look up every signature of `sigdict` in the SBT `tree`.

    Each leaf yielded by `tree.find` is re-scored with
    `leaf.data.similarity(query)` and accepted only when that score is at
    or above `threshold`.  If several leaves are accepted for one query,
    the one yielded last is the one that ends up recorded.

    Returns a dictionary of d[query name] -> (match name, similarity).
    Queries without an accepted leaf are reported on stdout and are not
    present in the returned dictionary.
    """
    def fresh_search():
        # a new SearchMinHashesFindBest instance is built for every query.
        return SearchMinHashesFindBest().search

    matches = {}

    for query in sigdict.values():
        matching_sig = None
        candidates = tree.find(fresh_search(), query, threshold)

        for leaf in candidates:
            # deal with bug? in this search function; thresholds are not
            # always met, so the score is checked again here.
            score = leaf.data.similarity(query)
            if score >= threshold:
                matching_sig = leaf.data
                query_name, match_name = query.name(), matching_sig.name()
                print('match:', query_name, match_name, score)
                matches[query_name] = (match_name, score)

        if not matching_sig:
            print('no match found:', query.name())

    return matches
Example #2
0
    def do_search():
        """
        Search the enclosing scope's `tree` for `streamsig`.

        Uses a new SearchMinHashesFindBest search at `args.threshold` and
        returns a list of (similarity, signature) tuples, one per leaf
        yielded by `tree.find`, in the order they were yielded.
        """
        best_first = SearchMinHashesFindBest().search
        hits = tree.find(best_first, streamsig, args.threshold)

        return [(streamsig.similarity(hit.data), hit.data) for hit in hits]
Example #3
0
def sbt_search(args):
    """
    Command-line entry point: search an SBT with a single query signature.

    `args` is the list of command-line arguments handed to argparse.
    Matches are printed to stdout as '<similarity> <name>', highest
    similarity first; with --save-matches the matching signatures are also
    written to the given file.  With --best-only a SearchMinHashesFindBest
    search is used instead of `search_minhashes`.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to load')
    parser.add_argument('query', help='signature to query')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--save-matches', type=argparse.FileType('wt'))
    parser.add_argument('--best-only', action='store_true')

    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    if args.best_only:
        search_fn = SearchMinHashesFindBest().search
    else:
        search_fn = search_minhashes

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.estimator.ksize
    notify('loaded query: {}... (k={}, {})',
           query.name()[:30], query_ksize, query_moltype)

    # score every leaf the search yields against the query.
    hits = [(query.similarity(leaf.data), leaf.data)
            for leaf in tree.find(search_fn, query, args.threshold)]

    # highest similarity first; equal scores keep their search order.
    hits.sort(key=lambda hit: hit[0], reverse=True)
    for score, match in hits:
        print('{:.2f} {}'.format(score, match.name()))

    if args.save_matches:
        notify('saving all matches to "{}"', args.save_matches.name)
        sig.save_signatures([match for _, match in hits], args.save_matches)
Example #4
0
    def sbt_gather(self, args):
        """
        Command-line entry point: greedily decompose a query signature into
        the SBT leaves that best match it.

        `args` is the list of command-line arguments handed to argparse.

        The loop repeatedly searches the tree (SearchMinHashesFindBest, with
        a very low threshold), takes the leaf with the highest similarity to
        the current query, records it, removes that leaf's hashes from the
        query and searches again with what is left.  It stops when the
        search yields nothing, when no query hashes remain, or when the very
        first best match has a similarity to the original query below
        --threshold.

        The resulting composition is printed to stdout, and optionally
        written as text (-o/--output) and as CSV (--csv).

        Exits with status -1 unless exactly one query signature matches the
        requested ksize/molecule type, and with status 0 when nothing was
        found.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('query')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.05, type=float)
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('--csv', type=argparse.FileType('wt'))

        sourmash_args.add_moltype_args(parser)

        args = parser.parse_args(args)

        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False

        moltype = None
        if args.protein:
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'

        # the tree is loaded exactly once and reused for every search below.
        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
        sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                                 select_moltype=moltype)
        sl = list(sl)
        if len(sl) != 1:
            # both lines of the error message go to stderr.
            print('When loading query from "{}",'.format(args.query),
                  file=sys.stderr)
            print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
                  file=sys.stderr)
            sys.exit(-1)

        query = sl[0]

        query_moltype = 'UNKNOWN'
        if query.estimator.is_molecule_type('dna'):
            query_moltype = 'DNA'
        elif query.estimator.is_molecule_type('protein'):
            query_moltype = 'protein'
        query_ksize = query.estimator.ksize
        print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                      query_ksize,
                                                      query_moltype))

        orig_query = query

        sum_found = 0.
        found = []
        while True:
            search_fn = SearchMinHashesFindBest().search

            # use super low threshold for this part of the search
            results = [(query.similarity(leaf.data), leaf.data)
                       for leaf in tree.find(search_fn, query, 0.00001)]

            if not results:          # no matches at all!
                break

            # take the best result; the first one wins among equal scores.
            best_sim, best_ss = max(results, key=lambda hit: hit[0])
            sim = best_ss.similarity(orig_query)

            # adjust by size of leaf (kmer cardinality of original genome).
            # This needs cardinality estimates for BOTH the leaf and the
            # original query, and a non-zero query cardinality.
            f_of_total = 0
            leaf_hll = best_ss.estimator.hll
            query_hll = orig_query.estimator.hll
            if leaf_hll and query_hll:
                leaf_kmers = leaf_hll.estimate_cardinality()
                query_kmers = query_hll.estimate_cardinality()
                if query_kmers:
                    f_of_total = leaf_kmers / query_kmers * sim

            if not found and sim < args.threshold:
                print('best match: {}'.format(best_ss.name()))
                print('similarity is {:.5f} of db signature;'.format(sim))
                print('this is below specified threshold => exiting.')
                break

            # subtract found hashes from search hashes, construct new search
            new_mins = set(query.estimator.mh.get_mins())
            found_mins = best_ss.estimator.mh.get_mins()

            # print interim & save (the count shown is the number of query
            # hashes before this match is subtracted).
            print('found: {:.2f} {} {}'.format(f_of_total,
                                               len(new_mins),
                                               best_ss.name()))
            found.append((f_of_total, best_ss, sim))
            sum_found += f_of_total

            new_mins -= set(found_mins)
            if not new_mins:
                # every query hash has been accounted for; there is nothing
                # left to build a follow-up query from.
                break

            e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
            for m in new_mins:
                e.mh.add_hash(m)
            query = sig.SourmashSignature('foo', e)

        print('found {}, total fraction {:.3f}'.format(len(found), sum_found))
        print('')

        if not found:
            sys.exit(0)

        # largest fraction first.  Sort on the fraction only: comparing the
        # whole tuple would fall through to comparing signature objects
        # whenever two fractions are equal.
        found.sort(key=lambda item: item[0], reverse=True)

        print('Composition:')
        for (frac, leaf_sketch, sim) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()))

        if args.output:
            print('Composition:', file=args.output)
            for (frac, leaf_sketch, sim) in found:
                print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                      file=args.output)

        if args.csv:
            fieldnames = ['fraction', 'name', 'similarity', 'sketch_kmers']
            w = csv.DictWriter(args.csv, fieldnames=fieldnames)

            w.writeheader()
            for (frac, leaf_sketch, sim) in found:
                # the cardinality estimator is optional (see the search loop
                # above); None is written by the csv module as an empty
                # field.
                hll = leaf_sketch.estimator.hll
                cardinality = hll.estimate_cardinality() if hll else None
                w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                                similarity=sim,
                                sketch_kmers=cardinality))
Example #5
0
    def categorize(self, args):
        """
        Command-line entry point: report the best SBT match for each query
        signature file.

        `args` is the list of command-line arguments handed to argparse.

        With --traverse-directory the `queries` arguments are walked as
        directories and every '*.sig' file is queried, except those whose
        path appears in the first column of the --load-csv file.  Without
        it, `queries` is used as the list of files as given.

        For each query loaded, the leaf with the highest similarity (other
        than the query itself, identified by md5sum) is printed, and, with
        --csv, a row of (query file, best match name, similarity) is
        appended; the name is empty and the similarity 0.0 when nothing
        matched.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('queries', nargs='+')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.08, type=float)
        parser.add_argument('--traverse-directory', action="store_true")

        sourmash_args.add_moltype_args(parser)

        parser.add_argument('--csv', type=argparse.FileType('at'))
        parser.add_argument('--load-csv', default=None)

        args = parser.parse_args(args)

        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False

        moltype = None
        if args.protein:
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'

        already_names = set()
        if args.load_csv:
            with open(args.load_csv, 'rt') as fp:
                for row in csv.reader(fp):
                    # csv.reader yields [] for blank lines; skip them
                    # instead of failing on row[0].
                    if row:
                        already_names.add(row[0])

        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

        if args.traverse_directory:
            inp_files = []
            for dirname in args.queries:
                for root, _dirs, files in os.walk(dirname):
                    for name in files:
                        if name.endswith('.sig'):
                            fullname = os.path.join(root, name)
                            if fullname not in already_names:
                                inp_files.append(fullname)
        else:
            inp_files = args.queries

        print('found {} files to query'.format(len(inp_files)))

        # one writer for the whole run, rather than a new one per query.
        csv_writer = csv.writer(args.csv) if args.csv else None

        loader = sourmash_args.LoadSingleSignatures(inp_files,
                                                    args.ksize, moltype)
        for queryfile, query, query_moltype, query_ksize in loader:
            print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                          query_ksize,
                                                          query_moltype))

            search_fn = SearchMinHashesFindBest().search

            # ignore self: leaves with the query's own md5sum are skipped.
            query_md5 = query.md5sum()
            results = [(query.similarity(leaf.data), leaf.data)
                       for leaf in tree.find(search_fn, query, args.threshold)
                       if leaf.data.md5sum() != query_md5]

            best_hit_sim = 0.0
            best_hit_query_name = ""
            if results:
                # highest similarity; the first one wins among equal scores.
                best_hit_sim, best_hit_query = max(results,
                                                   key=lambda hit: hit[0])
                best_hit_query_name = best_hit_query.name()
                print('for {}, found: {:.2f} {}'.format(query.name(),
                                                        best_hit_sim,
                                                        best_hit_query_name))
            else:
                print('for {}, no match found'.format(query.name()))

            if csv_writer is not None:
                csv_writer.writerow([queryfile, best_hit_query_name,
                                     best_hit_sim])

        if loader.skipped_ignore:
            print('skipped/ignore: {}'.format(loader.skipped_ignore))
        if loader.skipped_nosig:
            print('skipped/nosig: {}'.format(loader.skipped_nosig))
Example #6
0
    def sbt_search(self, args):
        """
        Command-line entry point: search an SBT with a single query
        signature.

        `args` is the list of command-line arguments handed to argparse.
        Matches are printed to stdout as '<similarity> <name>', highest
        similarity first; with --save-matches the matching signatures are
        also written to the given file.  With --best-only a
        SearchMinHashesFindBest search is used instead of
        `search_minhashes`.

        Exits with status -1 unless exactly one query signature matches the
        requested ksize/molecule type.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('query')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.08, type=float)
        parser.add_argument('--save-matches', type=argparse.FileType('wt'))
        parser.add_argument('--best-only', action='store_true')

        sourmash_args.add_moltype_args(parser)
        args = parser.parse_args(args)

        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False

        moltype = None
        if args.protein:
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'

        search_fn = search_minhashes
        if args.best_only:
            search_fn = SearchMinHashesFindBest().search

        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
        sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                                 select_moltype=moltype)
        sl = list(sl)
        if len(sl) != 1:
            # both lines of the error message go to stderr.
            print('When loading query from "{}",'.format(args.query),
                  file=sys.stderr)
            print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
                  file=sys.stderr)
            sys.exit(-1)

        query = sl[0]

        query_moltype = 'UNKNOWN'
        if query.estimator.is_molecule_type('dna'):
            query_moltype = 'DNA'
        elif query.estimator.is_molecule_type('protein'):
            query_moltype = 'protein'
        query_ksize = query.estimator.ksize
        print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                      query_ksize,
                                                      query_moltype))

        results = [(query.similarity(leaf.data), leaf.data)
                   for leaf in tree.find(search_fn, query, args.threshold)]

        # highest similarity first; equal scores keep their search order.
        results.sort(key=lambda hit: hit[0], reverse=True)

        # use a separate name for the matches so the query signature is not
        # shadowed by the loop variable.
        for (similarity, match) in results:
            print('{:.2f} {}'.format(similarity, match.name()))

        if args.save_matches:
            outname = args.save_matches.name
            print('saving all matches to "{}"'.format(outname))
            sig.save_signatures([match for (_, match) in results],
                                args.save_matches)
Example #7
0
def categorize(args):
    """
    Command-line entry point: report the best SBT match for each query
    signature file.

    `args` is the list of command-line arguments handed to argparse.

    The files to query are either the `queries` arguments themselves or,
    with --traverse-directory, whatever
    `sourmash_args.traverse_find_sigs` returns for them.  Names found in
    the first column of the --load-csv file are removed from that set.

    For each query loaded, the leaf with the highest similarity (other than
    the query itself, identified by md5sum) is reported via `notify`, and,
    with --csv, a row of (query file, best match name, similarity) is
    appended; the name is empty and the similarity 0.0 when nothing
    matched.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to load')
    parser.add_argument('queries',
                        nargs='+',
                        help='list of signatures to categorize')
    parser.add_argument('-k', '--ksize', type=int, default=None)
    parser.add_argument('--threshold', default=0.08, type=float)
    parser.add_argument('--traverse-directory', action="store_true")

    sourmash_args.add_moltype_args(parser)

    parser.add_argument('--csv', type=argparse.FileType('at'))
    parser.add_argument('--load-csv', default=None)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    already_names = set()
    if args.load_csv:
        with open(args.load_csv, 'rt') as fp:
            for row in csv.reader(fp):
                # csv.reader yields [] for blank lines; skip them instead
                # of failing on row[0].
                if row:
                    already_names.add(row[0])

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    if args.traverse_directory:
        candidates = sourmash_args.traverse_find_sigs(args.queries)
    else:
        candidates = args.queries

    # drop everything already listed in the --load-csv file (done once,
    # for both ways of collecting candidates).
    inp_files = set(candidates) - already_names

    notify('found {} files to query', len(inp_files))

    # one writer for the whole run, rather than a new one per query.
    csv_writer = csv.writer(args.csv) if args.csv else None

    loader = sourmash_args.LoadSingleSignatures(inp_files, args.ksize, moltype)

    for queryfile, query, query_moltype, query_ksize in loader:
        notify('loaded query: {}... (k={}, {})',
               query.name()[:30], query_ksize, query_moltype)

        search_fn = SearchMinHashesFindBest().search

        # ignore self: leaves with the query's own md5sum are skipped.
        query_md5 = query.md5sum()
        results = [(query.similarity(leaf.data), leaf.data)
                   for leaf in tree.find(search_fn, query, args.threshold)
                   if leaf.data.md5sum() != query_md5]

        best_hit_sim = 0.0
        best_hit_query_name = ""
        if results:
            # highest similarity; the first one wins among equal scores.
            best_hit_sim, best_hit_query = max(results,
                                               key=lambda hit: hit[0])
            best_hit_query_name = best_hit_query.name()
            notify('for {}, found: {:.2f} {}', query.name(), best_hit_sim,
                   best_hit_query_name)
        else:
            notify('for {}, no match found', query.name())

        if csv_writer is not None:
            csv_writer.writerow([queryfile, best_hit_query_name,
                                 best_hit_sim])

    if loader.skipped_ignore:
        notify('skipped/ignore: {}', loader.skipped_ignore)
    if loader.skipped_nosig:
        notify('skipped/nosig: {}', loader.skipped_nosig)
Example #8
0
def search(args):
    """
    Command-line entry point: search one query signature against any mix
    of SBTs and signature collections.

    `args` is the list of command-line arguments handed to argparse.

    Every candidate is scored with `query.similarity` (or
    `query.contained_by` with --containment), both called with
    downsample=True.  A candidate is reported when its score is at or
    above --threshold and its md5sum has not already been reported from an
    earlier candidate or database.  Results are shown highest score first,
    limited to --num-results lines; -o/--output writes all of them as CSV
    and --save-matches writes all matching signatures.

    Exits with status -1 when --scaled is given for a query whose
    `max_hash` is 0, or when no database could be loaded.
    """
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('query', help='query signature')
    parser.add_argument('databases',
                        help='signatures/SBTs to search',
                        nargs='+')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='suppress non-error output')
    parser.add_argument('--threshold',
                        default=0.08,
                        type=float,
                        help='minimum threshold for reporting matches')
    parser.add_argument('--save-matches',
                        type=argparse.FileType('wt'),
                        help='output matching signatures to this file.')
    parser.add_argument(
        '--best-only',
        action='store_true',
        help='report only the best match (with greater speed).')
    parser.add_argument('-n',
                        '--num-results',
                        default=3,
                        type=int,
                        help='number of results to report')
    parser.add_argument('--containment',
                        action='store_true',
                        help='evaluate containment rather than similarity')
    parser.add_argument(
        '--scaled',
        type=float,
        help='downsample query to this scaled factor (yields greater speed)')
    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('wt'),
                        help='output CSV containing matches to this file')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    # set up the query.
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.minhash.ksize
    notify('loaded query: {}... (k={}, {})',
           query.name()[:30], query_ksize, query_moltype)

    # downsample if requested
    if args.scaled:
        if query.minhash.max_hash == 0:
            error('cannot downsample a signature not created with --scaled')
            sys.exit(-1)

        notify('downsampling query from scaled={} to {}', query.minhash.scaled,
               int(args.scaled))
        query.minhash = query.minhash.downsample_scaled(args.scaled)

    # similarity vs containment
    if args.containment:
        def query_similarity(other):
            return query.contained_by(other, downsample=True)
    else:
        def query_similarity(other):
            return query.similarity(other, downsample=True)

    # set up the search databases
    databases = sourmash_args.load_sbts_and_sigs(args.databases, query_ksize,
                                                 query_moltype)

    if len(databases) == 0:
        error('Nothing found to search!')
        sys.exit(-1)

    # collect results across all the databases
    SearchResult = namedtuple('SearchResult',
                              'similarity, match_sig, md5, filename, name')
    results = []
    found_md5 = set()
    search_fn = search_minhashes
    for (sbt_or_siglist, filename, is_sbt) in databases:
        if is_sbt:
            # with --best-only, every tree gets its own new
            # SearchMinHashesFindBest instance.
            if args.best_only:
                search_fn = SearchMinHashesFindBest().search

            notify('Searching SBT {}', filename)
            candidates = (leaf.data for leaf in
                          sbt_or_siglist.find(search_fn, query,
                                              args.threshold))
        else:  # list of signatures
            candidates = sbt_or_siglist

        # the same acceptance rule applies to tree leaves and to plain
        # signatures: score at/above threshold, md5 not seen before.
        for match in candidates:
            similarity = query_similarity(match)
            if similarity >= args.threshold:
                md5 = match.md5sum()
                if md5 not in found_md5:
                    found_md5.add(md5)
                    results.append(SearchResult(similarity=similarity,
                                                match_sig=match,
                                                md5=md5,
                                                filename=filename,
                                                name=match.name()))

    # sort results on similarity (reverse); equal scores keep their order.
    results.sort(key=lambda sr: sr.similarity, reverse=True)

    if args.best_only:
        notify(
            "(truncated search because of --best-only; only trust top result)")

    n_matches = len(results)
    if n_matches <= args.num_results:
        print_results('{} matches:', len(results))
    else:
        print_results('{} matches; showing first {}:', len(results),
                      args.num_results)
        n_matches = args.num_results

    # output!
    print_results("similarity   match")
    print_results("----------   -----")
    for sr in results[:n_matches]:
        pct = '{:.1f}%'.format(sr.similarity * 100)
        name = sr.match_sig._display_name(60)
        print_results('{:>6}       {}', pct, name)

    if args.output:
        fieldnames = ['similarity', 'name', 'filename', 'md5']
        w = csv.DictWriter(args.output, fieldnames=fieldnames)

        w.writeheader()
        for sr in results:
            # the signature object itself is not a CSV column.
            d = dict(sr._asdict())
            del d['match_sig']
            w.writerow(d)

    # save matching signatures upon request
    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matched signatures to "{}"', outname)
        sig.save_signatures([sr.match_sig for sr in results],
                            args.save_matches)