Example #1
    def _find_signatures(self,
                         minhash,
                         threshold,
                         containment=False,
                         ignore_scaled=False):
        """
        Do a Jaccard similarity or containment search, yield results.

        This is essentially a fast implementation of 'find' that collects all
        the signatures with overlapping hash values. Note that results from
        similarity searches (containment=False) are not returned in sorted order.
        """
        # make sure we're looking at the same scaled value as database
        if self.scaled > minhash.scaled:
            minhash = minhash.downsample_scaled(self.scaled)
        elif self.scaled < minhash.scaled and not ignore_scaled:
            # note that containment can be calculated w/o matching scaled.
            raise ValueError(
                "lca db scaled is {} vs query {}; must downsample".format(
                    self.scaled, minhash.scaled))

        query_mins = set(minhash.get_mins())

        # collect matching hashes for the query:
        c = Counter()
        for hashval in query_mins:
            idx_list = self.hashval_to_idx.get(hashval, [])
            for idx in idx_list:
                c[idx] += 1

        debug('number of matching signatures for hashes: {}', len(c))

        # for each match, in order of largest overlap,
        for idx, count in c.most_common():
            # pull in the hashes. This reconstructs & caches all input
            # minhashes, which is kinda memory intensive...!
            # NOTE: one future low-mem optimization could be to support doing
            # this piecemeal by iterating across all the hashes, instead.
            match_sig = self._signatures[idx]
            match_mh = match_sig.minhash
            match_size = len(match_mh)

            # calculate the containment or similarity
            if containment:
                score = count / len(query_mins)
            else:
                # len(query_mins) is the size of the query signature,
                # match_size is the size of the match signature,
                # and count is the overlap between them.
                score = count / (len(query_mins) + match_size - count)

            # ...and return.
            if score >= threshold:
                yield score, match_sig, self.filename
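
For reference, both scores reduce to simple set arithmetic over the hash values. A minimal standalone sketch of the formulas above (the hash sets are made up for illustration; this is not sourmash code):

# Illustrative only: made-up hash sets standing in for MinHash contents.
query_mins = {1, 2, 3, 4, 5}
match_mins = {4, 5, 6, 7}

count = len(query_mins & match_mins)      # the overlap, as counted above

containment = count / len(query_mins)     # fraction of the query covered
jaccard = count / (len(query_mins) + len(match_mins) - count)

# inclusion-exclusion: |Q| + |M| - overlap == |Q union M|
assert jaccard == count / len(query_mins | match_mins)
print(containment, jaccard)               # 0.4 0.2857...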
Example #2
    @cached_property
    def _signatures(self):
        "Create a _signatures member dictionary that contains {idx: sigobj}."
        from sourmash import MinHash, SourmashSignature

        is_protein = False
        is_hp = False
        is_dayhoff = False
        if self.moltype == 'protein':
            is_protein = True
        elif self.moltype == 'hp':
            is_hp = True
        elif self.moltype == 'dayhoff':
            is_dayhoff = True
        minhash = MinHash(n=0,
                          ksize=self.ksize,
                          scaled=self.scaled,
                          is_protein=is_protein,
                          hp=is_hp,
                          dayhoff=is_dayhoff)

        debug('creating signatures for LCA DB...')
        mhd = defaultdict(minhash.copy_and_clear)
        temp_vals = defaultdict(list)

        # invert the hashval_to_idx dictionary
        for (hashval, idlist) in self.hashval_to_idx.items():
            for idx in idlist:
                temp_hashes = temp_vals[idx]
                temp_hashes.append(hashval)

                # 50 is an arbitrary batch size. If you really want to
                # micro-optimize: CPython lists are resized to grow in this
                # pattern: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
                # (from https://github.com/python/cpython/blob/b2b4a51f7463a0392456f7772f33223e57fa4ccc/Objects/listobject.c#L57)
                if len(temp_hashes) > 50:
                    mhd[idx].add_many(temp_hashes)

                    # Sigh, python 2... when it goes away,
                    # we can do `temp_hashes.clear()` instead.
                    del temp_vals[idx]

        # Loop over temp_vals again to add any remaining hashes
        # (each remaining list has at most 50 items).
        for idx, vals in temp_vals.items():
            mhd[idx].add_many(vals)

        sigd = {}
        for idx, mh in mhd.items():
            ident = self.idx_to_ident[idx]
            name = self.ident_to_name[ident]
            sigd[idx] = SourmashSignature(mh, name=name)

        debug('=> {} signatures!', len(sigd))
        return sigd
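
The buffer-and-flush pattern above is independent of sourmash: hash values are buffered per index and pushed to the target object in batches, so many tiny add calls are replaced by one bulk add_many(). A minimal sketch of the same idea using plain sets as stand-ins for MinHash objects (the threshold of 50 matches the code above; everything else is illustrative):

from collections import defaultdict

targets = defaultdict(set)     # idx -> accumulated hashes (stand-in for a MinHash)
buffers = defaultdict(list)    # idx -> pending hashes awaiting a bulk add

FLUSH_AT = 50                  # same arbitrary batch size as above

for hashval in range(1000):
    idx = hashval % 7          # made-up assignment of hashes to indices
    buf = buffers[idx]
    buf.append(hashval)
    if len(buf) > FLUSH_AT:
        targets[idx].update(buf)   # one bulk add instead of many small ones
        del buffers[idx]           # drop the buffer (cf. the py2 note above)

# second pass: flush whatever is left (each list has at most FLUSH_AT items)
for idx, buf in buffers.items():
    targets[idx].update(buf)

assert sum(len(s) for s in targets.values()) == 1000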
Example #3
def make_lca_counts(dblist,
                    lowest_rank='phylum',
                    min_num=0,
                    min_hashes=5,
                    prefix='oddities'):
    """
    Collect counts of all the LCAs in the list of databases.
    """
    assert len(dblist) == 1

    keep_ranks = ['root']
    for rank in lca_utils.taxlist():
        keep_ranks.append(rank)
        if rank == lowest_rank:
            break
    print('keeping hashvals at following ranks:', keep_ranks)
    print('min number of lineages:', min_num)
    print('min number of shared hashes:', min_hashes)

    print('---')

    # gather all hashvalue assignments from across all the databases
    assignments = defaultdict(set)
    for lca_db in dblist:
        for hashval, idx_list in lca_db.hashval_to_idx.items():
            if min_num and len(idx_list) < min_num:
                continue

            for idx in idx_list:
                lid = lca_db.idx_to_lid.get(idx)
                if lid is not None:
                    lineage = lca_db.lid_to_lineage[lid]
                    assignments[hashval].add(lineage)

    # now convert to trees -> do LCA & counts
    counts = defaultdict(int)
    mixdict = defaultdict(set)
    for hashval, lineages in assignments.items():

        # for each set of lineages (tuples of (rank, name) pairs), build
        # a tree that lets us discover the lowest common ancestor.
        debug("{}", lineages)
        tree = lca_utils.build_tree(lineages)

        # now find either a leaf or the first node with multiple
        # children; that's our lowest-common-ancestor node.
        lca, reason = lca_utils.find_lca(tree)

        # find cross-superkingdom hashes, and record combinations of lineages
        # that have them.
        rank = 'root'
        if lca:
            rank = lca[-1].rank

        if rank in keep_ranks:
            xx = tuple(tuple(lineage) for lineage in lineages)
            mixdict[xx].add(hashval)

        counts[lca] += 1

    # sort on number of confused hash vals by combination of lineages.
    mixdict_items = list(mixdict.items())
    mixdict_items.sort(key=lambda x: -len(x[1]))

    confused_hashvals = set()

    fp = open(prefix + '.csv', 'wt')
    w = csv.writer(fp)
    w.writerow([
        'cluster', 'num_lineages', 'shared_kmers', 'ksize', 'rank', 'lca',
        'ident1', 'lineage1', 'ident2', 'lineage2'
    ])

    #
    # find candidate lineages, then evaluate pairwise intersections.
    #

    for cluster_n, (lineages, hashvals) in enumerate(mixdict_items):
        # require at least min_hashes shared hash values
        if len(hashvals) < min_hashes:
            continue

        # display summary:
        print('cluster {} has {} assignments for {} hashvals / {} bp'.format(
            cluster_n, len(lineages), len(hashvals),
            dblist[0].scaled * len(hashvals)))
        confused_hashvals.update(hashvals)

        tree = lca_utils.build_tree(lineages)
        lca, reason = lca_utils.find_lca(tree)
        if lca:
            rank = lca[-1].rank
        else:
            rank = 'root'
        print('  rank & lca:', rank, lca_utils.display_lineage(lca))

        # for lineage_n, lineage in enumerate(lineages):
        #     print('* ', lca_utils.display_lineage(lineage))

        # now, identify all members of these lineages by their index:
        all_idxs = []
        for lineage in lineages:
            lids = dblist[0].lineage_to_lids[lineage]
            for lid in lids:
                idxs = dblist[0].lid_to_idx[lid]
                all_idxs.extend(idxs)

        # run through and look at all pairs of genomes in these lineages;
        # filter so that we're comparing across lineages with the right
        # LCA, and with significant intersection.
        pair_n = 0
        candidates = []
        for i in range(len(all_idxs)):
            idx1 = all_idxs[i]
            lid1 = dblist[0].idx_to_lid[idx1]
            lin1 = dblist[0].lid_to_lineage[lid1]
            for j in range(i):
                idx2 = all_idxs[j]
                lid2 = dblist[0].idx_to_lid[idx2]
                lin2 = dblist[0].lid_to_lineage[lid2]

                ident1 = dblist[0].idx_to_ident[idx1]
                ident2 = dblist[0].idx_to_ident[idx2]

                debug("{} x {}", ident1, ident2)

                this_tree = lca_utils.build_tree([lin1, lin2])
                this_lca, this_reason = lca_utils.find_lca(this_tree)

                # weed out pairs that don't have the desired intersection
                if lca != this_lca:
                    continue

                # _signatures maps idx -> SourmashSignature (see Example #2),
                # so pull the hashes out of each signature's minhash.
                sig1 = dblist[0]._signatures[idx1]
                sig2 = dblist[0]._signatures[idx2]

                mins1 = set(sig1.minhash.get_mins())
                mins2 = set(sig2.minhash.get_mins())
                intersect_size = len(mins1.intersection(mins2))

                # weed out pairs that don't have enough k-mer intersection
                if intersect_size < min_hashes:
                    continue

                candidates.append(
                    (pair_n, ident1, lin1, ident2, lin2, intersect_size))

                # write summary to CSV for find-oddities-examine.py to use
                w.writerow([
                    'cluster{}.{}'.format(cluster_n, pair_n),
                    len(lineages), intersect_size * dblist[0].scaled,
                    dblist[0].ksize, rank,
                    lca_utils.display_lineage(lca), ident1,
                    lca_utils.display_lineage(lin1), ident2,
                    lca_utils.display_lineage(lin2)
                ])

                pair_n += 1

        print('  Candidate genome pairs for these lineages:')
        for pair_n, ident1, lin1, ident2, lin2, intersection_size in candidates:
            print('    cluster.pair {}.{} share {} bases'.format(
                cluster_n, pair_n, intersection_size * dblist[0].scaled))
            print('    - {} ({})'.format(ident1,
                                         lca_utils.display_lineage(lin1)))
            print('    - {} ({})'.format(ident2,
                                         lca_utils.display_lineage(lin2)))
            print('')

        print('')

    return counts, confused_hashvals
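
For intuition about find_lca: when lineages are tuples of (rank, name) pairs, the lowest common ancestor corresponds to their longest common prefix. A minimal sketch of that idea (illustrative only; sourmash builds an explicit tree via lca_utils.build_tree and walks it with lca_utils.find_lca):

def common_prefix_lca(lineages):
    "Return the longest common prefix of lineages given as (rank, name) tuples."
    lca = []
    for levels in zip(*lineages):
        if len(set(levels)) > 1:   # lineages disagree at this rank; stop
            break
        lca.append(levels[0])
    return tuple(lca)

lin1 = (('superkingdom', 'Bacteria'), ('phylum', 'Proteobacteria'))
lin2 = (('superkingdom', 'Bacteria'), ('phylum', 'Firmicutes'))

lca = common_prefix_lca([lin1, lin2])
rank = lca[-1][0] if lca else 'root'
print(rank, lca)   # superkingdom (('superkingdom', 'Bacteria'),)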
Example #4
def index(args):
    """
    main function for building an LCA database.
    """
    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    set_quiet(args.quiet, args.debug)

    args.scaled = int(args.scaled)

    if args.ksize is None:
        args.ksize = DEFAULT_LOAD_K

    moltype = sourmash_args.calculate_moltype(args, default='DNA')

    notify('Building LCA database with ksize={} scaled={} moltype={}.',
           args.ksize, args.scaled, moltype)

    # first, load taxonomy spreadsheet
    delimiter = ','
    if args.tabs:
        delimiter = '\t'
    assignments, num_rows = load_taxonomy_assignments(args.csv,
                                                      delimiter=delimiter,
                                                      start_column=args.start_column,
                                                      use_headers=not args.no_headers,
                                                      force=args.force)

    notify('{} distinct identities in spreadsheet out of {} rows.',
           len(assignments), num_rows)
    notify('{} distinct lineages in spreadsheet out of {} rows.',
           len(set(assignments.values())), num_rows)

    db = LCA_Database(args.ksize, args.scaled, moltype)

    if args.traverse_directory:
        yield_all_files = False           # by default, only pick up *.sig files
        if args.force:
            yield_all_files = True
        inp_files = list(sourmash_args.traverse_find_sigs(args.signatures,
                                                          yield_all_files=yield_all_files))
    else:
        inp_files = list(args.signatures)

    # track duplicates
    md5_to_name = {}

    #
    # main loop, connecting lineage ID to signature.
    #

    n = 0
    total_n = len(inp_files)
    record_duplicates = set()
    record_no_lineage = set()
    record_remnants = set(assignments)
    record_used_lineages = set()
    record_used_idents = set()
    n_skipped = 0
    for filename in inp_files:
        n += 1
        for sig in load_signatures(filename, ksize=args.ksize,
                                   select_moltype=moltype):
            notify(u'\r\033[K', end=u'')
            notify('\r... loading signature {} ({} of {}); skipped {} so far',
                   sig.name()[:30], n, total_n, n_skipped, end='')
            debug(filename, sig.name())

            # block off duplicates.
            if sig.md5sum() in md5_to_name:
                debug('WARNING: in file {}, duplicate md5sum: {}; skipping',
                      filename, sig.md5sum())
                record_duplicates.add(filename)
                continue

            md5_to_name[sig.md5sum()] = sig.name()

            # parse identifier, potentially with splitting
            ident = sig.name()
            if args.split_identifiers: # hack for NCBI-style names, etc.
                # split on space...
                ident = ident.split(' ')[0]
                # ...and on period.
                ident = ident.split('.')[0]

            lineage = assignments.get(ident)

            # punt if no lineage and --require-taxonomy
            if lineage is None and args.require_taxonomy:
                debug('(skipping, because --require-taxonomy was specified)')
                n_skipped += 1
                continue

            # add the signature into the database.
            db.insert(sig, ident=ident, lineage=lineage)

            if lineage:
                # remove from our list of remaining ident -> lineage
                record_remnants.remove(ident)

                # track ident as used
                record_used_idents.add(ident)
                record_used_lineages.add(lineage)

            else:
                # no lineage for this identifier; track it for the report.
                debug('WARNING: no lineage assignment for {}.', ident)
                record_no_lineage.add(ident)

    # end main add signatures loop

    if n_skipped:
        notify('... loaded {} signatures; skipped {} because of --require-taxonomy.', total_n, n_skipped)
    else:
        notify('... loaded {} signatures.', total_n)

    # check -- did we find any signatures?
    if n == 0:
        error('ERROR: no signatures found!')
        if args.traverse_directory and not args.force:
            error('(note, with --traverse-directory, you may want to use -f)')
        sys.exit(1)

    # check -- did the signatures we found have any hashes?
    if not db.hashval_to_idx:
        error('ERROR: no hash values found - are there any signatures?')
        sys.exit(1)
    notify('loaded {} hashes at ksize={} scaled={}', len(db.hashval_to_idx),
           args.ksize, args.scaled)

    # summarize:
    notify('{} assigned lineages out of {} distinct lineages in spreadsheet.',
           len(record_used_lineages), len(set(assignments.values())))
    unused_lineages = set(assignments.values()) - record_used_lineages

    notify('{} identifiers used out of {} distinct identifiers in spreadsheet.',
           len(record_used_idents), len(set(assignments)))

    assert record_used_idents.issubset(set(assignments))
    unused_identifiers = set(assignments) - record_used_idents

    # now, save!
    db_outfile = args.lca_db_out
    if not (db_outfile.endswith('.lca.json') or
            db_outfile.endswith('.lca.json.gz')):   # TODO: move this logic into db.save
        db_outfile += '.lca.json'
    notify('saving to LCA DB: {}'.format(db_outfile))

    db.save(db_outfile)

    ## done!

    # output a record of stuff if requested/available:
    if record_duplicates or record_no_lineage or record_remnants or unused_lineages:
        if record_duplicates:
            notify('WARNING: {} duplicate signatures.', len(record_duplicates))
        if record_no_lineage:
            notify('WARNING: no lineage provided for {} signatures.',
                   len(record_no_lineage))
        if record_remnants:
            notify('WARNING: no signatures for {} spreadsheet rows.',
                   len(record_remnants))
        if unused_lineages:
            notify('WARNING: {} unused lineages.', len(unused_lineages))

        if unused_identifiers:
            notify('WARNING: {} unused identifiers.', len(unused_identifiers))

        if args.report:
            notify("generating a report and saving in '{}'", args.report)
            generate_report(record_duplicates, record_no_lineage,
                            record_remnants, unused_lineages,
                            unused_identifiers, args.report)
        else:
            notify('(You can use --report to generate a detailed report.)')
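
One detail worth calling out: the --split-identifiers branch above reduces NCBI-style signature names to a bare accession by splitting on the first space and then on the first period. For example (the name below is a typical NCBI-style string chosen for illustration, not taken from real input):

# How the --split-identifiers logic above normalizes an NCBI-style name.
name = 'NC_000913.3 Escherichia coli str. K-12 substr. MG1655'
ident = name.split(' ')[0]    # -> 'NC_000913.3'
ident = ident.split('.')[0]   # -> 'NC_000913'
print(ident)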