Example #1
def gather_at_rank(mh, lca_db, lin_db, match_rank):
    "Run gather, and aggregate at given rank."
    import copy
    minhash = copy.copy(mh)
    query_sig = sourmash.SourmashSignature(minhash)

    # do the gather:
    counts = Counter()
    while True:
        results = lca_db.gather(query_sig, threshold_bp=0)
        if not results:
            break

        (match, match_sig, _) = results[0]

        # retrieve lineage & pop to match_rank
        match_ident = get_ident(match_sig)
        match_lineage = lin_db.ident_to_lineage[match_ident]
        match_lineage = pop_to_rank(match_lineage, match_rank)

        # count at match_rank
        common = match_sig.minhash.count_common(query_sig.minhash)
        counts[match_lineage] += common

        # finish out gather algorithm!
        minhash.remove_many(match_sig.minhash.hashes)
        query_sig = sourmash.SourmashSignature(minhash)

    # return!
    for lin, count in counts.most_common():
        yield lin, count
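The core of the loop above is the gather pattern: count the hashes shared with the best match, then remove those hashes from the query and repeat. A minimal, self-contained sketch of that subtraction step on bare MinHash objects (hash values below are arbitrary and not from the example; assumes sourmash is installed):

import sourmash

query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
query_mh.add_many([10, 20, 30, 40])

match_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
match_mh.add_many([20, 40])

print(match_mh.count_common(query_mh))   # 2 hashes in common
query_mh.remove_many(match_mh.hashes)    # subtract the match, as the loop does
print(sorted(query_mh.hashes))           # [10, 30]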
Example #2
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    This function always removes abundances.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())

            mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile, end='\r')

    if total_loaded == 0:
        error("no signatures to merge!?")
        sys.exit(-1)

    # forcibly turn off track_abundance, unless --abundances-from set.
    if not args.abundances_from:
        intersect_mh = first_sig.minhash.copy_and_clear()
        intersect_mh.track_abundance = False
        intersect_mh.add_many(mins)
        intersect_sigobj = sourmash.SourmashSignature(intersect_mh)
    else:
        notify('loading signature from {}, keeping abundances',
               args.abundances_from)
        abund_sig = sourmash.load_one_signature(args.abundances_from,
                                                ksize=args.ksize,
                                                select_moltype=moltype)
        if not abund_sig.minhash.track_abundance:
            error("--track-abundance not set on loaded signature?! exiting.")
            sys.exit(-1)
        intersect_mh = abund_sig.minhash.copy_and_clear()
        abund_mins = abund_sig.minhash.get_mins(with_abundance=True)

        # do one last intersection
        mins.intersection_update(abund_mins)
        abund_mins = { k: abund_mins[k] for k in mins }

        intersect_mh.set_abundances(abund_mins)
        intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([intersect_sigobj], fp=fp)

    notify('loaded and intersected {} signatures', total_loaded)
Example #3
def sig_import(args):
    """
    import a signature into sourmash format.
    """
    set_quiet(args.quiet)

    siglist = []
    for filename in args.filenames:
        with open(filename) as fp:
            x = json.loads(fp.read())

        ksize = x['kmer']
        num = x['sketchSize']

        assert x['hashType'] == "MurmurHash3_x64_128"
        assert x['hashBits'] == 64
        assert x['hashSeed'] == 42

        xx = x['sketches'][0]
        hashes = xx['hashes']

        mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
        mh.add_many(hashes)

        s = sourmash.SourmashSignature(mh, filename=filename)
        siglist.append(s)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(siglist, fp)
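For reference, a sketch of the JSON layout this importer expects, inferred only from the keys read above; the values are made up for illustration, and the conversion mirrors what the loop does for each file:

import sourmash

# hypothetical input document; only the keys read by sig_import() are shown
doc = {
    "kmer": 21,
    "sketchSize": 3,
    "hashType": "MurmurHash3_x64_128",
    "hashBits": 64,
    "hashSeed": 42,
    "sketches": [{"hashes": [12345, 67890, 424242]}],
}

mh = sourmash.MinHash(ksize=doc["kmer"], n=doc["sketchSize"], is_protein=False)
mh.add_many(doc["sketches"][0]["hashes"])
sig = sourmash.SourmashSignature(mh, filename="example.json")
print(sig.md5sum())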
Example #4
def test_calculate_containment_at_rank_3():
    # two lineages with overlapping hashes (50% containment)
    hashval1 = 12345678
    ident = 'uniq'
    mh1, sig1, lin1 = make_sig_and_lin([hashval1], ident, 'a;b;c')
    lin2 = lca_utils.make_lineage('a;d')
    hashval2 = 87654321
    match_rank = "genus"
    # make lineage hashD
    lineage_hashD = defaultdict(test_gen_mh)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [hashval1], lin1,
                                        match_rank)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [hashval2], lin2,
                                        match_rank)

    # make query sig
    mh = make_mh([hashval1, hashval2])
    query_sig = sourmash.SourmashSignature(mh, name='query')

    # calculate containment
    containmentD = calculate_containment_at_rank(lineage_hashD, query_sig,
                                                 match_rank)

    # superkingdom lineage that should have 100% containment
    lin3 = lca_utils.make_lineage('a')
    assert containmentD["superkingdom"][0][1] == 1.0
    # class should have 50% containment
    assert containmentD["class"][0][1] == 0.5
    phylum_containment = [
        containmentD["phylum"][0][1], containmentD["phylum"][1][1]
    ]
    assert [0.5, 0.5] == phylum_containment
Example #5
def main():
    p = argparse.ArgumentParser()
    p.add_argument('contigs')  # this is an assembly
    p.add_argument('read_sig')  #  this contains sourmash sig with abunds
    p.add_argument('-o', '--output', required=True)
    args = p.parse_args()

    siglist = sourmash.load_file_as_signatures(args.read_sig)
    siglist = list(siglist)
    assert len(siglist) == 1
    sig = siglist[0]

    contigs_mh = sig.minhash.copy_and_clear()
    for record in screed.open(args.contigs):
        contigs_mh.add_sequence(record.sequence, force=True)

    # intersect the genome assembly with the read abundances
    # so now we get the abundances of only the k-mers that are in the
    # assembly.
    abunds = {}
    for hashval in contigs_mh.hashes:
        abunds[hashval] = sig.minhash.hashes.get(hashval, 0)

    output_mh = sig.minhash.copy_and_clear()
    output_mh.set_abundances(abunds)

    out_sig = sourmash.SourmashSignature(output_mh)
    with open(args.output, 'wt') as fp:
        print(f"Saving output to '{args.output}'")
        sourmash.save_signatures([out_sig], fp)
Example #6
def test_calculate_containment_at_rank_4():
    # add two (nonmatching) hashvals to query
    hashval1 = 12345678
    ident = 'uniq'
    mh1, sig1, lin1 = make_sig_and_lin([hashval1], ident, 'a;b;c')
    lin2 = lca_utils.make_lineage('a;d')
    hashval2 = 87654321
    match_rank = "genus"
    # make lineage hashD
    lineage_hashD = defaultdict(test_gen_mh)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [hashval1], lin1,
                                        match_rank)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [hashval2], lin2,
                                        match_rank)

    # make query sig
    mh = make_mh([hashval1, hashval2, 33333333, 44444444])
    query_sig = sourmash.SourmashSignature(mh, name='query')

    # calculate containment
    containmentD = calculate_containment_at_rank(lineage_hashD, query_sig,
                                                 match_rank)

    # superkingdom lineage that should have 50% containment
    lin3 = lca_utils.make_lineage('a')
    assert containmentD["superkingdom"][0][1] == 0.5
    # each class should have 25% containment
    assert containmentD["class"][0][1] == 0.25
    assert [containmentD["phylum"][0][1],
            containmentD["phylum"][1][1]] == [0.25, 0.25]
Example #7
def build_signature(p):
    header, seq = p
    mg_minhash = sourmash.MinHash(n=0, ksize=51, scaled=100)
    mg_minhash.add_sequence(str(seq), force=True)
    mg_sig = sourmash.SourmashSignature(mg_minhash, name=header)

    return mg_sig
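Since build_signature() takes a single (header, sequence) tuple, it looks intended for something like multiprocessing.Pool.map. A hypothetical usage sketch (the records below are made up; assumes build_signature from the example above is in scope):

records = [
    ("contig_1", "ACGT" * 40),
    ("contig_2", "TTGACCTAGCTAGCATCGGATCGATCGATTACG" * 5),
]

sigs = [build_signature(p) for p in records]
for sig in sigs:
    print(sig.md5sum(), len(sig.minhash))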
Example #8
def load_or_generate_sig_from_file(input_file,
                                   alphabet,
                                   ksize,
                                   scaled,
                                   ignore_abundance=False,
                                   translate=False):
    sig = ""
    if input_file.endswith(".sig"):
        # do I want to enable multiple sigs per file here?
        sig = sourmash.load_one_signature(input_file, ksize=ksize)
    else:
        # read file and add sigs
        records = try_reading_fasta_file(input_file)
        # build signature name from filename .. maybe just keep filename?
        #signame = os.path.basename(input_file.rsplit("_", 1)[0])
        # start with fresh minhash
        mh = determine_appropriate_fresh_minhash(alphabet, ksize, scaled,
                                                 ignore_abundance)
        if records:
            for record in records:
                if alphabet == "nucleotide" or translate:
                    mh.add_sequence(record.sequence, force=True)
                else:
                    mh.add_protein(record.sequence)
            # minhash --> signature, using the file's basename as the signature name
            sig = sourmash.SourmashSignature(mh,
                                             name=os.path.basename(input_file))
    return sig
Example #9
def test_sort_by_rank_and_containment_2():
    # 1. three results, check that they sort by rank, containment
    hashval1 = 12345678
    ident = 'uniq'
    mh1, sig1, lin1 = make_sig_and_lin([hashval1], ident, 'a;b;c')
    lin2 = lca_utils.make_lineage('a;d')
    hashval2 = 87654321
    hashval3 = 33333333
    match_rank = "genus"
    # make lineage hashD
    lineage_hashD = defaultdict(test_gen_mh)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [hashval1, hashval3],
                                        lin1, match_rank)
    lineage_hashD = add_hashes_at_ranks(lineage_hashD, [hashval2], lin2,
                                        match_rank)
    # make query sig
    mh = make_mh([hashval1, hashval2, hashval3, 44444444])
    query_sig = sourmash.SourmashSignature(mh, name='query')
    superK_lin = lca_utils.make_lineage('a')
    phylum_match_lin = lca_utils.make_lineage('a;b')
    # calculate containment
    containmentD = calculate_containment_at_rank(lineage_hashD, query_sig,
                                                 match_rank)
    sorted_results = sort_by_rank_and_containment(containmentD, match_rank)
    assert sorted_results[0].lineage == superK_lin
    assert sorted_results[0].contained_at_rank == 0.75
    # phylum results should also be sorted by containment
    assert sorted_results[1].lineage[-1].rank == "phylum"
    assert sorted_results[1].contained_at_rank == 0.5
    assert sorted_results[2].lineage[-1].rank == "phylum"
    assert sorted_results[2].contained_at_rank == 0.25
    # class results
    assert sorted_results[3].lineage[-1].rank == "class"
    assert sorted_results[3].contained_at_rank == 0.5
Example #10
    def write(self, csv_writer, csvoutfp, outdir):
        hashval = self.query_hashval
        bp = self.total_bp
        seqs = self.total_seq

        # output to results.csv!
        csv_writer.writerow([hashval, bp, seqs])
        csvoutfp.flush()

        # TR add contigs folder
        # write out cDBG IDs
        q_name = str(hashval)
        cdbg_listname = os.path.basename(q_name) + '.cdbg_ids.txt.gz'
        with gzip.open(os.path.join(outdir, "contigs", cdbg_listname),
                       'wt') as fp:
            fp.write("\n".join([str(x) for x in sorted(self.cdbg_shadow)]))

        # write out contigs
        contigs_outname = os.path.basename(q_name) + '.contigs.fa.gz'
        with gzip.open(os.path.join(outdir, "contigs", contigs_outname),
                       'wt') as fp:
            for name, sequence in self.contigs:
                fp.write('>{}\n{}\n'.format(name, sequence))

        # save minhash?
        if self.mh:
            ss = sourmash.SourmashSignature(
                self.mh, name='hashval query:{}'.format(q_name))

            sigfile = os.path.join(outdir, "contigs", q_name + '.contigs.sig')
            with open(sigfile, 'wt') as fp:
                sourmash.save_signatures([ss], fp)
Example #11
    def __init__(self, query_file, ksize, scaled, catlas_name, debug=True):
        self.filename = query_file
        self.ksize = ksize
        self.kmers = set()
        self.name = None
        mh = MinHash(0, ksize, scaled=scaled)
        self.mh = mh
        self.catlas_name = catlas_name
        self.debug = debug

        notify('----')
        notify('QUERY FILE: {}', self.filename)

        # build hashes for all the query k-mers & create signature
        notify('loading query kmers...', end=' ')
        bf = khmer.Nodetable(ksize, 1, 1)

        for record in screed.open(self.filename):
            if self.name is None:
                self.name = record.name
            if len(record.sequence) >= int(ksize):
                self.kmers.update(bf.get_kmer_hashes(record.sequence))
            mh.add_sequence(record.sequence, True)

        self.sig = sourmash.SourmashSignature(mh,
                                              name=self.name,
                                              filename=self.filename)

        notify('got {} k-mers from query', len(self.kmers))

        self.cdbg_match_counts = {}
        self.catlas_match_counts = {}
Example #12
def test_sourmash_signature_api():
    e = sourmash.MinHash(n=1, ksize=20)
    sig = sourmash.SourmashSignature(e)

    s = sourmash.save_signatures([sig])
    sig_x1 = sourmash.load_one_signature(s)
    sig_x2 = list(sourmash.load_signatures(s))[0]

    assert sig_x1 == sig
    assert sig_x2 == sig
Example #13
def merge(args):
    """
    merge one or more signatures.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        this_n = 0
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            # first signature? initialize a bunch of stuff
            if first_sig is None:
                first_sig = sigobj
                mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if args.flatten:
                    mh.track_abundance = False

            try:
                sigobj_mh = sigobj.minhash
                if not args.flatten:
                    _check_abundance_compatibility(first_sig, sigobj)
                else:
                    sigobj_mh.track_abundance = False

                mh.merge(sigobj_mh)
            except:
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            this_n += 1
            total_loaded += 1
        if this_n:
            notify('loaded and merged {} signatures from {}...', this_n, sigfile, end='\r')

    if not total_loaded:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([merged_sigobj], fp=fp)

    notify('loaded and merged {} signatures', total_loaded)
Example #14
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    This function always removes abundances.
    """
    p = SourmashArgumentParser(prog='sourmash signature intersect')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())

            mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...',
               sigfile,
               end='\r')

    if total_loaded == 0:
        error("no signatures to merge!?")
        sys.exit(-1)

    # forcibly turn off track_abundance
    intersect_mh = first_sig.minhash.copy_and_clear()
    _flatten(intersect_mh)
    intersect_mh.add_many(mins)
    intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    output_json = sourmash.save_signatures([intersect_sigobj], fp=args.output)

    notify('loaded and intersected {} signatures', total_loaded)
Example #15
def main():
    p = argparse.ArgumentParser()
    p.add_argument('node_mh_pickle')
    p.add_argument('lca_db')
    args = p.parse_args()

    node_mhs = pickle.load(open(args.node_mh_pickle, 'rb'))
    lca_obj = LCA_Database()
    lca_obj.load(args.lca_db)
    databases = ((lca_obj, args.lca_db, 'LCA'),)

    d = {}
    n_pure95 = 0
    total = 0

    for k, v in node_mhs.items():
        ss = sourmash.SourmashSignature(v)

        results = [ x[0] for x in gather_databases(ss, databases, 0, True) ]
        sum_f_uniq = sum([result.f_unique_to_query for result in results])

        keep_results = []
        for result in results:
            if result.f_unique_to_query < 0.10:
                break
            keep_results.append(result)

        if not keep_results:
            print('** no match for {}'.format(k))
            continue

        idents = [ result.name.split()[0].split('.')[0] for result in keep_results ]
        idxlist = [ lca_obj.ident_to_idx[ident] for ident in idents ]
        lidlist = [ lca_obj.idx_to_lid[idx] for idx in idxlist ]
        lineages = [ lca_obj.lid_to_lineage[lid] for lid in lidlist ]

        tree = lca_utils.build_tree(lineages)
        lca, reason = lca_utils.find_lca(tree)

        level = '*none*'
        if lca:
            level = lca[-1].rank

        lineage = ";".join(lca_utils.zip_lineage(lca, truncate_empty=True))

        this_f_uniq = sum([ result.f_unique_to_query for result in keep_results ])

        print('node {} matches {} @ {:.1f}'.format(k, level, this_f_uniq / sum_f_uniq * 100))

        if level in ('strain', 'genus', 'species') and this_f_uniq / sum_f_uniq >= 0.95:
            n_pure95 += 1
        total += 1

    print('XXX', n_pure95, total)
Example #16
def search_containment_at_rank(mh,
                               lca_db,
                               lin_db,
                               match_rank,
                               ignore_abundance=False,
                               summarize_at_ranks=True):
    "Run search --containment, and aggregate at given rank and above."

    results = []
    found_md5 = set()

    def gen_mh():
        return mh.copy_and_clear()

    # defaultdict requires a function that returns an empty minhash
    lin_hashes = defaultdict(gen_mh)
    query_hashes = set(mh.hashes)
    query_sig = sourmash.SourmashSignature(mh)

    # search
    search_iter = lca_db.search(query_sig, threshold=0, do_containment=True,
                                ignore_abundance=ignore_abundance,
                                best_only=False, unload_data=False)

    # iterate through matches
    for (similarity, match_sig, filename) in search_iter:
        md5 = match_sig.md5sum()
        if md5 not in found_md5:
            found_md5.add(md5)
            match_lineage = get_lineage_at_match_rank(lin_db, match_sig,
                                                      match_rank)
            results.append((similarity, match_sig, filename,
                            match_lineage))  #store search results + lineage

            if summarize_at_ranks:
                # Keep track of matched hashes at higher taxonomic ranks
                intersected_hashes = query_hashes.intersection(
                    set(match_sig.minhash.hashes))
                lin_hashes = add_hashes_at_ranks(lin_hashes,
                                                 intersected_hashes,
                                                 match_lineage, match_rank)

    # sort and store results
    search_results = sort_and_store_search_results(results)
    search_results_at_rank = []
    if summarize_at_ranks:
        rank_containment = calculate_containment_at_rank(
            lin_hashes, query_sig, match_rank)
        search_results_at_rank = sort_by_rank_and_containment(
            rank_containment, match_rank)

    return search_results, search_results_at_rank
Example #17
def subtract(args):
    """
    subtract one or more signatures from another
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize, select_moltype=moltype)

    from_mh = from_sigobj.minhash
    if from_mh.track_abundance and not args.flatten:
        error('Cannot use subtract on signatures with abundance tracking, sorry!')
        sys.exit(1)

    subtract_mins = set(from_mh.get_mins())

    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):

            if sigobj.minhash.track_abundance and not args.flatten:
                error('Cannot use subtract on signatures with abundance tracking, sorry!')
                sys.exit(1)

            subtract_mins -= set(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...', sigfile, end='\r')
            total_loaded += 1

    if not total_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)


    subtract_mh = from_sigobj.minhash.copy_and_clear()
    subtract_mh.add_many(subtract_mins)

    subtract_sigobj = sourmash.SourmashSignature(subtract_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([subtract_sigobj], fp=fp)

    notify('loaded and subtracted {} signatures', total_loaded)
Example #18
def create_signatures(file_list, ksize=21, verbose=False):
    file_list = [Path(str(f) + '.sig') for f in file_list]
    gt = GenomeTools()
    if verbose:
        file_list = tqdm(file_list, total=len(file_list))
    for f in file_list:
        if f.is_file():
            sig = sourmash.load_one_signature(str(f))
            if sig.minhash.ksize == ksize:
                continue
        minhash = sourmash.MinHash(n=1000, ksize=ksize)
        genome = gt.read_fasta(f.with_suffix(''))
        minhash.add_sequence(genome, True)
        sig = sourmash.SourmashSignature(minhash, name=f.stem)
        with f.open('wt') as handle:
            sourmash.save_signatures([sig], handle)
Example #19
    def write(self, csv_writer, csvoutfp, outdir, catlas_name):
        containment = self.containment()
        similarity = self.similarity()
        q_name = self.query.filename
        bp = self.total_bp
        seqs = self.total_seq
        k = self.query.ksize
        num_q_kmers = len(self.query.kmers)
        (best_con,
         cdbg_min_oh,
         catlas_min_oh) = self.query.con_sim_upper_bounds(self.catlas,
                                                          self.kmer_idx)
        # output to results.csv!
        csv_writer.writerow([q_name, containment, similarity, bp,
                             seqs, k, num_q_kmers,
                             best_con, cdbg_min_oh,
                             catlas_min_oh, catlas_name])
        csvoutfp.flush()

        # write out signature from retrieved contigs.
        sig_filename = os.path.basename(q_name) + '.contigs.sig'
        with open(os.path.join(outdir, sig_filename), 'wt') as fp:
            ss = sourmash.SourmashSignature(self.contigs_minhash,
                                            name='nbhd:' + self.query.name,
                                            filename=sig_filename)
            sourmash.save_signatures([ss], fp)

        # write out cDBG IDs
        cdbg_listname = os.path.basename(q_name) + '.cdbg_ids.txt.gz'
        with gzip.open(os.path.join(outdir, cdbg_listname), 'wt') as fp:
            fp.write("\n".join([str(x) for x in sorted(self.shadow)]))

        # write out catlas nodes
        frontier_listname = os.path.basename(q_name) + '.frontier.txt.gz'
        with gzip.open(os.path.join(outdir, frontier_listname), 'wt') as fp:
            for node in sorted(self.leaves):
                fp.write('{}\n'.format(node))

        # write response curve
        response_curve_filename = os.path.basename(q_name) + '.response.txt'
        response_curve_filename = os.path.join(outdir,
                                               response_curve_filename)
        cdbg_match_counts = self.query.cdbg_match_counts[self.catlas.name]
        search_utils.output_response_curve(response_curve_filename,
                                           cdbg_match_counts,
                                           self.kmer_idx,
                                           self.catlas.layer1_to_cdbg)
Example #20
def calculate_containment_at_rank(lineage_hashD, query_sig, match_rank):
    # calculate containment for each lineage match at each rank
    summarized_results = defaultdict(list)
    scaled_val = int(query_sig.minhash.scaled)
    ksize = int(query_sig.minhash.ksize)
    for lin, matched_hashes in lineage_hashD.items():
        rank = lin[-1].rank
        # TODO: check this. just scaled_val, or scaled * ksize * num matched hashes?
        #intersect_bp = scaled_val * len(matched_hashes) * ksize
        intersect_bp = get_match_bp(scaled_val,
                                    ksize,
                                    num_matched_hashes=len(matched_hashes))
        linmatch_sig = sourmash.SourmashSignature(
            matched_hashes
        )  #ADD MORE INFO (e.g. name/ident?) HERE IF KEEPING SIG?
        containment = query_sig.contained_by(linmatch_sig)
        summarized_results[rank].append(
            (lin, containment, intersect_bp,
             linmatch_sig))  # optionally don't keep track of sig here
    return summarized_results
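The containment value stored above comes from SourmashSignature.contained_by(). A minimal sketch of its semantics on toy hashes (values arbitrary; assumes sourmash is installed):

import sourmash

query = sourmash.MinHash(n=0, ksize=3, scaled=1)
query.add_many([1, 2, 3, 4])

match = sourmash.MinHash(n=0, ksize=3, scaled=1)
match.add_many([1, 2])

query_sig = sourmash.SourmashSignature(query)
match_sig = sourmash.SourmashSignature(match)

# fraction of the query's hashes present in the match: 2/4 = 0.5
print(query_sig.contained_by(match_sig))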
Example #21
def compare_sigs(sag_id, sag_file, mhr_path, sig_path, mg_sig_list,
                 jacc_threshold):
    sag_subcontigs = s_utils.get_seqs(sag_file)
    if isfile(o_join(mhr_path, sag_id + '.mhr_recruits.tsv')):
        logging.info('[SABer]: Loading %s and MetaG signature recruit list\n' %
                     sag_id)
        with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'),
                  'r') as mhr_in:
            pass_list = [
                x.rstrip('\n').split('\t') for x in mhr_in.readlines()
            ]
    else:
        # Calculate/Load MinHash Signatures with SourMash for SAG subseqs
        if isfile(o_join(sig_path, sag_id + '.SAG.sig')):
            logging.info('[SABer]: Loading Signature for %s\n' % sag_id)
            sag_sig = sourmash.signature.load_one_signature(
                o_join(sig_path, sag_id + '.SAG.sig'))
        else:
            logging.info('[SABer]: Building Signature for %s\n' % sag_id)
            sag_minhash = sourmash.MinHash(n=0, ksize=51, scaled=100)
            for sg_head in sag_subcontigs:
                sag_subseq = str(sag_subcontigs[sg_head].seq)
                sag_minhash.add_sequence(sag_subseq, force=True)
            sag_sig = sourmash.SourmashSignature(sag_minhash, name=sag_id)
            with open(o_join(sig_path, sag_id + '.SAG.sig'), 'w') as sags_out:
                sourmash.signature.save_signatures([sag_sig], fp=sags_out)
        logging.info('[SABer]: Comparing %s and MetaG signature\n' % sag_id)
        pass_list = []
        for mg_sig in mg_sig_list:
            jacc_sim = mg_sig.similarity(sag_sig)
            mg_nm = mg_sig.name()
            if jacc_sim >= jacc_threshold:
                pass_list.append([sag_id, mg_nm, mg_nm.rsplit('_', 1)[0]])

        with open(o_join(mhr_path, sag_id + '.mhr_recruits.tsv'),
                  'w') as mhr_out:
            mhr_out.write('\n'.join(['\t'.join(x) for x in pass_list]))
    pass_list = tuple(pass_list)

    return pass_list
Example #22
def sig_import(args):
    """
    import a signature into sourmash format.
    """
    p = SourmashArgumentParser(prog='sourmash signature import')
    p.add_argument('filenames', nargs='+')
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    args = p.parse_args(args)
    set_quiet(args.quiet)

    siglist = []
    for filename in args.filenames:
        with open(filename) as fp:
            x = json.loads(fp.read())

        ksize = x['kmer']
        num = x['sketchSize']

        assert x['hashType'] == "MurmurHash3_x64_128"
        assert x['hashBits'] == 64
        assert x['hashSeed'] == 42

        xx = x['sketches'][0]
        hashes = xx['hashes']

        mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
        mh.add_many(hashes)

        s = sourmash.SourmashSignature(mh, filename=filename)
        siglist.append(s)

    sourmash.save_signatures(siglist, args.output)
Example #23
def add_singleton_sigs(sbt,
                       input_file,
                       ksize,
                       scaled,
                       alphabet,
                       ignore_abundance=False,
                       translate=False):
    if input_file.endswith(".sig"):
        sigs = sourmash.signature.load_signatures(input_file,
                                                  ksize=ksize,
                                                  select_moltype=alphabet)
        for sig in sigs:
            if sig.minhash:
                leaf = SigLeaf(sig.md5sum(), sig)
                sbt.add_node(leaf)
        # loop through and add each to sbt
    else:
        # read file and add sigs
        records = try_reading_fasta_file(input_file)
        # start with fresh minhash
        if records:
            for n, record in enumerate(records):
                signame = (record.name).rsplit("\t", 1)[0]
                if n % 10000 == 0:
                    sys.stderr.write(f"... building {n}th sig, {signame}\n")

                mh = determine_appropriate_fresh_minhash(
                    alphabet, ksize, scaled, ignore_abundance)
                if alphabet == "nucleotide" or translate:
                    mh.add_sequence(record.sequence, force=True)
                else:
                    mh.add_protein(record.sequence)
                # minhash --> signature
                sig = sourmash.SourmashSignature(mh, name=signame)
                if sig.minhash:
                    leaf = SigLeaf(sig.md5sum(), sig)
                    sbt.add_node(leaf)
    return sbt
Example #24
            # write out hashes

            # let's try building a sig. we will use this sig later to intersect with sample-specific sigs
            new_mins = set(counts.keys())
            print(len(new_mins))
            with open(outhashes, "w") as out:
                for hsh in new_mins:
                    out.write(str(hsh) + '\n')
            if len(new_mins) > 0:
                minhash = MinHash(
                    n=0, ksize=ksize, scaled=scaled
                )  # scaled=1 so we keep all (though these were previously at some other scaled val)
                minhash.add_many(set(counts.keys()))
                # write sig to file
                sigobj = sourmash.SourmashSignature(
                    minhash,
                    name=f"aggregated_hashvals_above_{min_count}",
                    filename="generated with drop_unique_hashes.py")
                sigobjs += [sigobj]

## this part only handles one output file -- doesn't take care of case with many ksizes/moltypes
with open(outsig, 'wt') as sigout:
    sourmash.save_signatures(sigobjs, sigout)
    #notify('wrote signature to {}', args.output)

# write out hashes to a text file

# this part is from
# https://github.com/dib-lab/sourmash/blob/7661087aa0b0e81bfec82a58002463d7c699528a/utils/hashvals-to-signature.py

#ksize = int(snakemake.params.get("ksize", 7))
#do some checking here?
Example #25
def make_sig_and_lin(hashvals, ident, lin, ksize=3, scaled=1):
    mh = make_mh(hashvals)
    sig = sourmash.SourmashSignature(mh, name=ident)
    lineage = lca_utils.make_lineage(lin)
    return mh, sig, lineage
Example #26
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--keep-fraction', type=float, default=0.1)
    p.add_argument('-k',
                   '--ksize',
                   default=31,
                   type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    # load catlas DAG
    catlas = CAtlas(args.catlas_prefix, load_sizefile=True)
    print('loaded {} nodes from catlas {}'.format(len(catlas), catlas))
    print('loaded {} layer 1 catlas nodes'.format(len(catlas.layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    catlas.decorate_with_shadow_sizes()

    # ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        node_list = set()
        for sub_id in catlas.children[node_id]:
            # shadow size
            size = catlas.kmer_sizes[sub_id]

            if size < max_size:
                node_list.add(sub_id)
            else:
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(catlas.root, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = {n for n in terminal if catlas.kmer_sizes[n] > args.minsize}
    print('...down to {} between {} and {} in size.'.format(
        len(terminal), args.minsize, args.maxsize))

    # now, go through and calculate ratios
    x = []
    for node_id in terminal:
        # calculate: how many k-mers per cDBG node?
        kmer_size = catlas.kmer_sizes[node_id]
        shadow_size = catlas.shadow_sizes[node_id]

        ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)

        # track basic info
        x.append((ratio, node_id, shadow_size, kmer_size))

    print('terminal node stats for maxsize: {:g}'.format(args.maxsize))
    print('n tnodes:', len(terminal))
    print('total k-mers:', catlas.kmer_sizes[catlas.root])

    x.sort(reverse=True)
    for (k, v, a, b) in x[:10]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)
    print('... eliding {} nodes'.format(len(x) - 20))
    for (k, v, a, b) in x[-10:]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)

    # keep the last keep-fraction (default 10%) for examination
    keep_sum_kmer = args.keep_fraction * catlas.kmer_sizes[catlas.root]
    sofar = 0
    keep_terminal = set()
    for (k, v, a, b) in reversed(x):
        sofar += b
        if sofar > keep_sum_kmer:
            break
        keep_terminal.add(v)

    print('keeping last {} k-mers worth of nodes for '
          'examination.'.format(sofar))

    # build cDBG shadow ID list.
    cdbg_shadow = catlas.shadow(keep_terminal)

    # extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    outfp = open(args.output, 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_shadow)
            print('...at n {} ({:.1f}% of shadow)'.format(
                total_seqs, offset_f * 100),
                  end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        if contig_id not in cdbg_shadow:
            continue

        outfp.write('>{}\n{}\n'.format(record.name, record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))

    print('wrote contigs to {}'.format(args.output))
    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash.SourmashSignature(contigs_mh)
        sourmash.save_signatures([ss], fp)
Example #27
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('bcalm_unitigs')
    parser.add_argument('gxt_out')
    parser.add_argument('contigs_out')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-P',
                        '--pendants',
                        action="store_true",
                        help="don't remove low abundance pendants")
    parser.add_argument('-a',
                        '--abundance',
                        nargs='?',
                        type=float,
                        default=1.1)
    parser.add_argument('--randomize',
                        action='store_true',
                        help='randomize cDBG order')
    args = parser.parse_args(argv)

    k = args.ksize

    trim = not args.pendants
    trim_cutoff = args.abundance
    unitigs = args.bcalm_unitigs
    debug = args.debug

    if args.debug:
        logging.basicConfig(filename='bcalm_to_gxt.log',
                            filemode='w',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename='bcalm_to_gxt.log',
                            filemode='w',
                            level=logging.WARNING)

    logging.debug("starting bcalm_to_gxt run.")

    gxtfp = open(args.gxt_out, 'wt')
    contigsfp = bgzf.open(args.contigs_out, 'wb')
    info_filename = args.contigs_out + '.info.csv'
    info_fp = open(info_filename, 'wt')
    in_mh = sourmash.MinHash(0, k, scaled=1000)
    out_mh = sourmash.MinHash(0, k, scaled=1000)

    # load in the basic graph structure from the BCALM output file
    neighbors, sequences, mean_abunds, sizes = read_bcalm(unitigs, debug, k)

    # record input k-mers in a minhash
    for seq in sequences.values():
        in_mh.add_sequence(seq)

    # make order deterministic by reordering around min value of first, last,
    # and reverse complementing sequences appropriately
    print('reordering...')
    reordering = {}

    # first, put sequences in specific orientation
    sequence_list = []
    for key in neighbors:
        v = sequences[key]

        # pick lexicographically smaller of forward & reverse complement.
        v2 = screed.rc(v)
        if v > v2:
            v = v2
        sequence_list.append((v, key))
        del sequences[key]

    # sort all sequences:
    sequence_list.sort(reverse=True)
    if args.randomize:
        print('(!! randomizing order per --randomize !!)')
        random.shuffle(sequence_list)

    # ok, now remap all the things.
    remapping = {}
    new_sequences = {}

    # remap sequences
    new_key = 0
    while sequence_list:  # consume while iterating
        sequence, old_key = sequence_list.pop()
        remapping[old_key] = new_key
        new_sequences[new_key] = sequence
        new_key += 1

    # remap other things
    new_neighbors = collections.defaultdict(set)
    for old_key, vv in neighbors.items():
        new_vv = [remapping[v] for v in vv]
        new_neighbors[remapping[old_key]] = set(new_vv)

    new_mean_abunds = {}
    for old_key, value in mean_abunds.items():
        new_mean_abunds[remapping[old_key]] = value

    new_sizes = {}
    for old_key, value in sizes.items():
        new_sizes[remapping[old_key]] = value

    assert len(sequences) == 0
    print('...done')

    sequences = new_sequences
    mean_abunds = new_mean_abunds
    sizes = new_sizes
    neighbors = new_neighbors

    # if we are removing pendants, we need to relabel the contigs so they are
    # consecutive integers starting from 0.  If not, we create dummy data
    # structures to make the interface the same elsewhere in the data
    if trim:
        print('removing pendants...')
        non_pendants = set(v for v, N in neighbors.items()
                           if len(N) > 1 or mean_abunds[v] > trim_cutoff)
        contract_degree_two(non_pendants, neighbors, sequences, mean_abunds,
                            sizes, k)
    else:
        non_pendants = list(neighbors.keys())
    aliases = {x: i for i, x in enumerate(sorted(non_pendants))}
    n = len(aliases)

    # write out sequences & compute offsets
    offsets = {}
    kv_list = sorted(aliases.items(), key=lambda x: x[1])
    for x, i in kv_list:
        offsets[x] = contigsfp.tell()
        contigsfp.write('>{}\n{}\n'.format(i, sequences[x]))
        out_mh.add_sequence(sequences[x])
    contigsfp.close()

    print('... done! {} unitigs'.format(n))

    # start the gxt file by writing the number of nodes (unitigs)
    gxtfp.write('{}\n'.format(n))

    # write out all of the links, in 'from to' format.
    n_edges = 0
    for v, N in sorted(neighbors.items()):
        for u in sorted(N):
            gxtfp.write('{} {}\n'.format(aliases[v], aliases[u]))
            n_edges += 1

    print('{} vertices, {} edges'.format(n, n_edges))

    info_fp.write('contig_id,offset,mean_abund,n_kmers\n')
    for v, i in aliases.items():
        info_fp.write('{},{},{:.3f},{}\n'.format(i, offsets[v], mean_abunds[v],
                                                 sizes[v]))

    # output two sourmash signatures: one for input contigs, one for
    # output contigs.
    in_sig = sourmash.SourmashSignature(in_mh, filename=args.bcalm_unitigs)
    sourmash.save_signatures([in_sig], open(args.bcalm_unitigs + '.sig', 'wt'))

    out_sig = sourmash.SourmashSignature(out_mh, filename=args.contigs_out)
    sourmash.save_signatures([out_sig], open(args.contigs_out + '.sig', 'wt'))
Example #28
def main():
    p = argparse.ArgumentParser()
    p.add_argument('hashfile')  # file that contains hashes
    p.add_argument('-o', '--output', default=None,
                   help='file to output signature to')
    p.add_argument('-k', '--ksize', default=None, type=int)
    p.add_argument('--scaled', default=None, type=int)
    p.add_argument('--num', default=None, type=int)
    p.add_argument('--name', default='', help='signature name')
    p.add_argument('--filename', default='',
                   help='filename to add to signature')
    args = p.parse_args()

    # check arguments.
    if args.scaled and args.num:
        error('cannot specify both --num and --scaled! exiting.')
        return -1

    if not args.ksize:
        error('must specify --ksize')
        return -1

    if not args.output:
        error('must specify --output')
        return -1

    # first, load in all the hashes
    hashes = set()
    for line in open(args.hashfile, 'rt'):
        hashval = int(line.strip())
        hashes.add(hashval)

    if not hashes:
        error("ERROR, no hashes loaded from {}!", args.hashfile)
        return -1

    notify('loaded {} distinct hashes from {}', len(hashes), args.hashfile)

    # now, create the MinHash object that we'll use.
    scaled = 0
    num = 0
    if args.scaled:
        scaled = args.scaled
    elif args.num:
        num = args.num
    else:
        notify('setting --num automatically from the number of hashes.')
        num = len(hashes)

    # construct empty MinHash object according to args
    minhash = MinHash(n=num, ksize=args.ksize, scaled=scaled)

    # add hashes into!
    minhash.add_many(hashes)

    if len(minhash) < len(hashes):
        notify("WARNING: loaded {} hashes, but only {} made it into MinHash.",
               len(hashes), len(minhash))
        if scaled:
            notify("This is probably because of the scaled argument.")
        elif args.num:
            notify("This is probably because your --num is set to {}",
                   args.num)

    if num > len(minhash):
        notify("WARNING: --num set to {}, but only {} hashes in signature.",
               num, len(minhash))

    sigobj = sourmash.SourmashSignature(minhash, name=args.name,
                                        filename=args.filename)

    with open(args.output, 'wt') as fp:
        sourmash.save_signatures([sigobj], fp)
    notify('wrote signature to {}', args.output)
Example #29
def subtract(args):
    """
    subtract one or more signatures from another
    """
    p = SourmashArgumentParser(prog='sourmash signature subtract')
    p.add_argument('signature_from')
    p.add_argument('subtraction_sigs', nargs='+')
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten',
                   action='store_true',
                   help='remove abundance from signatures before subtracting')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile,
                                              ksize=args.ksize,
                                              select_moltype=moltype)

    from_mh = from_sigobj.minhash
    if from_mh.track_abundance and not args.flatten:
        error(
            'Cannot use subtract on signatures with abundance tracking, sorry!'
        )
        sys.exit(1)

    subtract_mins = set(from_mh.get_mins())

    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        for sigobj in sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):

            if sigobj.minhash.track_abundance and not args.flatten:
                error(
                    'Cannot use subtract on signatures with abundance tracking, sorry!'
                )
                sys.exit(1)

            subtract_mins -= set(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...',
                   sigfile,
                   end='\r')
            total_loaded += 1

    if not total_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)

    subtract_mh = from_sigobj.minhash.copy_and_clear()
    subtract_mh.add_many(subtract_mins)

    subtract_sigobj = sourmash.SourmashSignature(subtract_mh)

    output_json = sourmash.save_signatures([subtract_sigobj], fp=args.output)

    notify('loaded and subtracted {} signatures', total_loaded)
Example #30
def merge(args):
    """
    merge one or more signatures.
    """
    p = SourmashArgumentParser(prog='sourmash signature merge')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten',
                   action='store_true',
                   help='Remove abundances from all signatures.')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)

    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        this_n = 0
        for sigobj in sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            # first signature? initialize a bunch of stuff
            if first_sig is None:
                first_sig = sigobj
                mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if mh.track_abundance and args.flatten:
                    _flatten(mh)

            try:
                if not args.flatten:
                    _check_abundance_compatibility(first_sig, sigobj)

                mh.merge(sigobj.minhash)
            except:
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(),
                      sigobj.md5sum()[:8], sigfile)
                raise

            this_n += 1
            total_loaded += 1
        if this_n:
            notify('loaded and merged {} signatures from {}...',
                   this_n,
                   sigfile,
                   end='\r')

    if not total_loaded:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(mh)

    output_json = sourmash.save_signatures([merged_sigobj], fp=args.output)

    notify('loaded and merged {} signatures', total_loaded)