Beispiel #1
0
def test_similarity_downsample(track_abundance):
    """Comparing signatures with different max_hash needs downsample=True."""
    common = dict(n=0, ksize=20, track_abundance=track_abundance)
    wide = sourmash.MinHash(max_hash=2**63, **common)
    narrow = sourmash.MinHash(max_hash=2**2, **common)

    for hashval in (1, 5):
        wide.add_hash(hashval)
    assert len(wide.get_mins()) == 2

    # 5 is above narrow's max_hash and should be discarded
    for hashval in (1, 5):
        narrow.add_hash(hashval)
    assert len(narrow.get_mins()) == 1

    wide_sig = SourmashSignature(wide)
    narrow_sig = SourmashSignature(narrow)

    # mismatched max_hash is rejected unless downsampling is requested
    with pytest.raises(ValueError):
        wide_sig.similarity(narrow_sig)

    downsampled = wide_sig.similarity(narrow_sig, downsample=True)
    assert round(downsampled, 1) == 1.0
Beispiel #2
0
def test_compare_ne(track_abundance):
    """Signatures with the same content but different names compare unequal."""
    built = []
    for label in ('foo', 'bar'):
        sketch = sourmash.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
        sketch.add("AT" * 10)
        built.append(SourmashSignature(sketch, name=label))

    first, second = built
    assert first != second
Beispiel #3
0
def test_load_one_fail_multisig(track_abundance):
    """load_one_signature raises ValueError when given two saved signatures."""
    pair = [
        SourmashSignature(
            sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance))
        for _ in range(2)
    ]

    serialized = save_signatures(pair)

    with pytest.raises(ValueError):
        load_one_signature(serialized)
Beispiel #4
0
def test_compare(track_abundance):
    """Same content and same name must compare equal.

    Checks equality of the underlying MinHash sketches and of the
    signatures that wrap them.
    """
    e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)
    sig1 = SourmashSignature(e, name='foo')

    f = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    f.add("AT" * 10)
    sig2 = SourmashSignature(f, name='foo')

    assert e == f
    # The signatures were previously built but never compared, so the
    # "same name -> equal" claim went unchecked.
    assert sig1 == sig2
Beispiel #5
0
def test_compare_ne2_reverse(track_abundance):
    """Same content, but one has a name and the other only a filename."""

    def fresh_sketch():
        sketch = sourmash.MinHash(n=1, ksize=20,
                                  track_abundance=track_abundance)
        sketch.add("AT" * 10)
        return sketch

    named = SourmashSignature(fresh_sketch(), name='foo')
    file_only = SourmashSignature(fresh_sketch(), filename='b')

    # inequality is checked with the operands in both orders
    assert file_only != named
    assert named != file_only
Beispiel #6
0
def test_memmap():
    """Memory-mapping an array of signatures keeps the data unchanged."""
    signatures = [
        SourmashSignature(sourmash.MinHash(n=1, ksize=k)) for k in (20, 25)
    ]

    mapped, mmap_path = to_memmap(np.array(signatures))

    # the memory-mapped copy must still match the original signatures
    np.testing.assert_array_equal(mapped, signatures)
    assert mmap_path.endswith(".mmap")
Beispiel #7
0
def test_save_minified(track_abundance):
    """Saved signatures are a single line and load back with their names."""
    specs = (("foo", 20), ("bar baz", 25))
    to_save = [
        SourmashSignature(
            sourmash.MinHash(n=1, ksize=k, track_abundance=track_abundance),
            name=label)
        for label, k in specs
    ]

    serialized = save_signatures(to_save)
    # minified output: no newline anywhere in the serialized text
    assert '\n' not in serialized
    assert len(serialized.split('\n')) == 1

    loaded = list(load_signatures(serialized))
    assert len(loaded) == 2
    assert any(item.name() == 'foo' for item in loaded)
    assert any(item.name() == 'bar baz' for item in loaded)
Beispiel #8
0
def build_signature(p):
    """Build a named signature from a ``(header, sequence)`` pair.

    The sequence is sketched with a scaled MinHash (ksize=51, scaled=100)
    and added with ``force=True``; the header becomes the signature name.
    """
    name, sequence = p
    sketch = sourmash.MinHash(n=0, ksize=51, scaled=100)
    sketch.add_sequence(str(sequence), force=True)
    return sourmash.SourmashSignature(sketch, name=name)
def test_save_load_multisig_json():
    """Two signatures survive a JSON save/load round trip."""
    first, second = (
        SourmashSignature(sourmash.MinHash(n=1, ksize=k)) for k in (20, 25)
    )

    serialized = save_signatures_json([first, second])
    loaded = list(load_signatures_json(serialized))

    print(serialized)

    assert len(loaded) == 2
    # membership checks because load order is not guaranteed
    assert first in loaded
    assert second in loaded
    assert first != second
def main(argv):
    """Map MinHash hash values to contig ids and pickle the mapping.

    Reads every record from the ``contigs`` file, sketches it with a scaled
    MinHash, and records ``hashval -> int(record.name)``. When several
    contigs share a hash value, the contig read last wins.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument('contigs')
    cli.add_argument('picklefile')
    cli.add_argument('-k', '--ksize', type=int, default=31)
    cli.add_argument('--scaled', type=int, default=10000)
    opts = cli.parse_args(argv)

    template = sourmash.MinHash(0, opts.ksize, scaled=opts.scaled)
    hashval_to_contig_id = {}

    notify('reading contigs from {}', opts.contigs)
    for record in screed.open(opts.contigs):
        contig_id = int(record.name)

        # fresh, empty sketch with the template's parameters
        contig_mh = template.copy_and_clear()
        contig_mh.add_sequence(record.sequence, force=True)

        hashval_to_contig_id.update(
            dict.fromkeys(contig_mh.get_mins(), contig_id))

    notify('saving {} hashval -> cdbg_id mappings to {}',
           len(hashval_to_contig_id), opts.picklefile)
    with open(opts.picklefile, 'wb') as out:
        dump(hashval_to_contig_id, out)
Beispiel #11
0
def sig_import(args):
    """
    import a signature into sourmash format.

    Each file in ``args.filenames`` is parsed as JSON; only the first entry
    of its ``sketches`` list is converted. The resulting signatures are
    written to ``args.output``.
    """
    set_quiet(args.quiet)

    imported = []
    for path in args.filenames:
        with open(path) as handle:
            payload = json.load(handle)

        ksize = payload['kmer']
        num = payload['sketchSize']

        # only this hash configuration is accepted
        assert payload['hashType'] == "MurmurHash3_x64_128"
        assert payload['hashBits'] == 64
        assert payload['hashSeed'] == 42

        first_sketch = payload['sketches'][0]

        mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
        mh.add_many(first_sketch['hashes'])

        imported.append(sourmash.SourmashSignature(mh, filename=path))

    with FileOutput(args.output, 'wt') as out:
        sourmash.save_signatures(imported, out)
Beispiel #12
0
def test_save_load_multisig(track_abundance):
    """Two signatures survive a save/load round trip."""
    first, second = (
        SourmashSignature(
            sourmash.MinHash(n=1, ksize=k, track_abundance=track_abundance))
        for k in (20, 25)
    )

    serialized = save_signatures([first, second])
    loaded = list(load_signatures(serialized))

    print(serialized)

    assert len(loaded) == 2
    # membership checks because load order is not guaranteed
    assert first in loaded
    assert second in loaded
    assert first != second
Beispiel #13
0
def clustermap(prefix, outdir):
    """ Computes the pairwise Jaccard similarity between the k-mer sketches
    (k = 31) of every unmapped-region and mapped-region record, then writes
    the similarity table and a cluster map.

    Parameters
    ----------
    prefix: str
        Name of the genome
    outdir: str
        Output directory. Input FASTA files are read from here; results are
        written to ``{outdir}/kmer``.

    """
    logging.info("Running clustermap analysis with Sourmash")
    # (FASTA path, suffix appended to each record id from that file)
    region_sources = [
        ('{outdir}/{prefix}_unmappedregions.fasta'.format(outdir=outdir,
                                                          prefix=prefix),
         '_Um'),
        ('{outdir}/{prefix}_mappedregions.fasta'.format(outdir=outdir,
                                                        prefix=prefix),
         '_M'),
    ]
    minhashes = list()
    id_records = list()

    for fasta_path, suffix in region_sources:
        for record in SeqIO.parse(fasta_path, format='fasta'):
            # One sketch per record. Sharing a single MinHash across a whole
            # file made every label from that file point at the same
            # accumulated sketch, so within-file similarities were always 1.
            sketch = sourmash.MinHash(n=1000, ksize=31)
            sketch.add_sequence(str(record.seq))
            minhashes.append(sketch)
            id_records.append(''.join([record.id, suffix]))

    # Row of similarities for each record against every record (self included).
    simil = dict()
    for record_id, sketch in zip(id_records, minhashes):
        simil[record_id] = [sketch.jaccard(other) for other in minhashes]

    array = {k: np.array(v) for k, v in simil.items()}
    X = pd.DataFrame.from_dict(array, orient='index')
    sour_dist = pd.DataFrame.from_dict(simil)
    sour_path = '{outdir}/kmer'.format(outdir=outdir)
    sour_dist.to_csv(os.path.join(
        sour_path, '{prefix}_sourmash_distances.tsv'.format(prefix=prefix)),
                     sep='\t',
                     index=False)

    plt.figure(figsize=(15, 10))
    sns.set(style='white', font_scale=1.2, palette='Spectral')
    ax = sns.clustermap(X)
    sns.despine()
    ax.savefig(os.path.join(sour_path, 'sourmash_clustermap.jpg'))
    plt.clf()
    logging.info("Clustermap analysis complete")
Beispiel #14
0
def get_target_sig(sample_name):
    """Sketch the sequence file *sample_name* and save ``<sample_name>.sig``.

    All records are added to one MinHash (n=1000, ksize=31); the signature
    is named after the input path.
    """
    sketch = sourmash.MinHash(n=1000, ksize=31)
    for entry in screed.open(sample_name):
        sketch.add_sequence(entry.sequence, True)

    signature = SourmashSignature(sketch, name=sample_name)
    with open(sample_name + '.sig', 'wt') as out:
        save_signatures([signature], out)
Beispiel #15
0
def test_binary_fp(tmpdir, track_abundance):
    """save_signatures can write to a file handle opened in binary mode."""
    sketch = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sketch.add("AT" * 10)

    target = str(tmpdir.join("1.sig"))
    with open(target, 'wb') as handle:
        save_signatures([SourmashSignature(sketch)], handle)
Beispiel #16
0
def test_load_one_succeed(track_abundance):
    """A single saved signature loads back equal via load_one_signature."""
    original = SourmashSignature(
        sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance))

    serialized = save_signatures([original])

    assert original == load_one_signature(serialized)
Beispiel #17
0
def test_hashable(track_abundance):
    """Signatures must be hashable so they can be stored in a set."""
    sketch = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sketch.add("AT" * 10)

    signature = SourmashSignature(sketch)

    # adding to a set raises TypeError if the object is unhashable
    seen = set()
    seen.add(signature)
Beispiel #18
0
def test_sourmash_signature_api():
    """Save and load functions exposed on the top-level package round-trip."""
    original = sourmash.SourmashSignature(sourmash.MinHash(n=1, ksize=20))

    serialized = sourmash.save_signatures([original])
    via_load_one = sourmash.load_one_signature(serialized)
    via_load_many = list(sourmash.load_signatures(serialized))[0]

    assert via_load_one == original
    assert via_load_many == original
Beispiel #19
0
def test_roundtrip(track_abundance):
    """A saved and reloaded signature has similarity 1.0 to the original."""
    sketch = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sketch.add("AT" * 10)
    original = SourmashSignature(sketch)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    reloaded_mh = reloaded.minhash

    # similarity is checked in both directions
    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
Beispiel #20
0
def test_load_compressed(track_abundance):
    """Compressed signatures load from a string and from a .gz test file."""
    e1 = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig1 = SourmashSignature(e1)

    x = save_signatures([sig1], compression=5)

    y = load_one_signature(x)
    assert sig1 == y

    sigfile = utils.get_test_data('genome-s10+s11.sig.gz')
    # The other tests in this file wrap load_signatures() in list(). Its
    # result was previously left unconsumed and unchecked here, so loading
    # the compressed file was never verified. Materialize it and require
    # at least one signature.
    sigs = list(load_signatures(sigfile))
    assert len(sigs) >= 1
Beispiel #21
0
def test_roundtrip_empty(track_abundance):
    """Edge case: an empty MinHash round-trips and has similarity 0."""
    original = SourmashSignature(
        sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance))

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    reloaded_mh = reloaded.minhash

    # nothing was added to the sketch, so similarity is 0 in both directions
    assert original.similarity(reloaded) == 0
    assert reloaded.similarity(original) == 0
Beispiel #22
0
def main():
    """Classify genome fragments against an LCA database and write a CSV.

    Command-line arguments: the LCA database, one or more genome files, the
    output CSV path, and ``--fragment`` (fragment length, default 100000).

    Each contig is cut into windows of ``--fragment`` bases; every window
    that yields a non-empty sketch is passed to ``summarize`` and one CSV
    row is written per lineage it returns.

    Returns
    -------
    int
        Always 0.
    """
    p = argparse.ArgumentParser()
    p.add_argument('lca_db')
    p.add_argument('genome', nargs='+')
    p.add_argument('output')
    p.add_argument('--fragment', default=100000, type=int)
    args = p.parse_args()

    db, ksize, scaled = lca_utils.load_single_database(args.lca_db)
    mh_factory = sourmash.MinHash(n=0, ksize=ksize, scaled=scaled)
    print('**', ksize, scaled)

    # The context manager closes the output even if processing fails.
    # newline='' is what the csv module asks for on files it writes.
    with open(args.output, 'wt', newline='') as outfp:
        w = csv.writer(outfp)
        w.writerow(['filename', 'contig', 'begin', 'end', 'lca', 'lca_rank'])

        #
        # iterate over all contigs in genome file
        #
        for genome in args.genome:
            for record in screed.open(genome):
                # fragment longer contigs into smaller regions
                for start in range(0, len(record.sequence), args.fragment):
                    seq = record.sequence[start:start + args.fragment]

                    mh = mh_factory.copy_and_clear()
                    mh.add_sequence(seq, force=True)
                    if not mh:
                        # empty sketch for this fragment: nothing to classify
                        continue

                    lineage_counts = summarize(mh.get_mins(), [db], 1)

                    for k in lineage_counts:
                        lca = lca_utils.display_lineage(k,
                                                        truncate_empty=False)
                        try:
                            lca_rank = k[-1].rank
                        except IndexError:
                            # empty lineage has no last element
                            lca_rank = "none"
                        # NOTE: 'end' is start + fragment size, which can
                        # exceed the contig length for the last fragment.
                        w.writerow((genome, record.name, start,
                                    start + args.fragment, lca, lca_rank))

    return 0
Beispiel #23
0
def determine_appropriate_fresh_minhash(alphabet,
                                        ksize,
                                        scaled_val,
                                        ignore_abundance=False):
    """Create an empty scaled MinHash configured for *alphabet*.

    Parameters
    ----------
    alphabet: str
        One of "nucleotide", "protein", "dayhoff" or "hp".
    ksize: int
        k-mer size. For the three protein-based alphabets the MinHash is
        created with ``ksize * 3``.
    scaled_val: int
        Value passed as ``scaled``.
    ignore_abundance: bool
        When False (the default) the MinHash tracks abundance.

    Raises
    ------
    ValueError
        If *alphabet* is not one of the supported names. Previously this
        surfaced as an UnboundLocalError at the return statement.
    """
    # (dayhoff, hp) flags for the protein-based alphabets
    protein_flags = {
        "protein": (False, False),
        "dayhoff": (True, False),
        "hp": (False, True),
    }
    if alphabet != "nucleotide" and alphabet not in protein_flags:
        raise ValueError(
            "unknown alphabet {!r}; expected one of: nucleotide, protein, "
            "dayhoff, hp".format(alphabet))

    # default behavior is to track abundance
    abund = not ignore_abundance
    if alphabet == "nucleotide":
        return sourmash.MinHash(ksize=ksize,
                                n=0,
                                scaled=scaled_val,
                                track_abundance=abund,
                                is_protein=False)

    dayhoff, hp = protein_flags[alphabet]
    # need to multiply by 3 to get same ksize, bc add_protein method does k/3
    return sourmash.MinHash(ksize=ksize * 3,
                            n=0,
                            scaled=scaled_val,
                            track_abundance=abund,
                            is_protein=True,
                            dayhoff=dayhoff,
                            hp=hp)
Beispiel #24
0
def test_str(track_abundance):
    """str() and repr() of a signature agree, with and without a name."""
    sketch = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sketch.add("AT" * 10)

    signature = SourmashSignature(sketch)

    print(signature)
    unnamed = 'SourmashSignature(59502a74)'
    assert str(signature) == unnamed
    assert repr(signature) == unnamed

    # once a name is set, it is shown ahead of the hex id
    signature._name = 'fizbar'
    named = "SourmashSignature('fizbar', 59502a74)"
    assert str(signature) == named
    assert repr(signature) == named
Beispiel #25
0
def test_roundtrip_max_hash(track_abundance):
    """max_hash is preserved across a save/load round trip."""
    sketch = sourmash.MinHash(n=0,
                              ksize=20,
                              track_abundance=track_abundance,
                              max_hash=10)
    sketch.add_hash(5)
    original = SourmashSignature(sketch)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    reloaded_mh = reloaded.minhash

    assert sketch.max_hash == reloaded_mh.max_hash

    # similarity is checked in both directions
    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
Beispiel #26
0
def create_signatures(file_list, ksize=21, verbose=False):
    """Write a ``<file>.sig`` signature next to each file in *file_list*.

    A file is skipped when its ``.sig`` already exists and was built with
    the requested *ksize*; otherwise the signature is (re)computed from the
    FASTA file and written out. With *verbose* the loop is wrapped in tqdm.
    """
    sig_paths = [Path(str(entry) + '.sig') for entry in file_list]
    gt = GenomeTools()
    iterable = tqdm(sig_paths, total=len(sig_paths)) if verbose else sig_paths

    for sig_path in iterable:
        if sig_path.is_file():
            existing = sourmash.load_one_signature(str(sig_path))
            if existing.minhash.ksize == ksize:
                # an up-to-date signature already exists
                continue

        sketch = sourmash.MinHash(n=1000, ksize=ksize)
        # with_suffix('') drops '.sig', giving back the original file path
        sequence = gt.read_fasta(sig_path.with_suffix(''))
        sketch.add_sequence(sequence, True)

        signature = sourmash.SourmashSignature(sketch, name=sig_path.stem)
        with sig_path.open('wt') as out:
            sourmash.save_signatures([signature], out)
Beispiel #27
0
def test_sourmash_scaled(datadir, ksize):
    """A goetia scaled sketch matches sourmash's sketch of the same file.

    The ``ksize`` fixture is accepted but not used; both sketches are built
    with a hard-coded k of 31 and scaled=1000.
    """
    import sourmash

    fasta_path = datadir('random-20-a.fa')
    goetia_sketch = SourmashSketch.Sketch.build(0, 31, False, False, False,
                                                42, 1000)
    reference_mh = sourmash.MinHash(0, 31, scaled=1000)

    SourmashSketch.Processor.build(goetia_sketch).process(fasta_path)

    for entry in read_fastx(fasta_path):
        reference_mh.add_sequence(entry.sequence)

    converted = goetia_sketch.to_sourmash()

    assert converted.similarity(reference_mh) == 1.0
Beispiel #28
0
def sig_import(args):
    """
    import a signature into sourmash format.

    *args* is the argument list to parse. Each input file is parsed as
    JSON; only the first entry of its ``sketches`` list is converted. The
    resulting signatures are written to ``--output`` (stdout by default).
    """
    parser = SourmashArgumentParser(prog='sourmash signature import')
    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help='output signature to this file')
    opts = parser.parse_args(args)
    set_quiet(opts.quiet)

    imported = []
    for path in opts.filenames:
        with open(path) as handle:
            payload = json.load(handle)

        ksize = payload['kmer']
        num = payload['sketchSize']

        # only this hash configuration is accepted
        assert payload['hashType'] == "MurmurHash3_x64_128"
        assert payload['hashBits'] == 64
        assert payload['hashSeed'] == 42

        first_sketch = payload['sketches'][0]

        mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
        mh.add_many(first_sketch['hashes'])

        imported.append(sourmash.SourmashSignature(mh, filename=path))

    sourmash.save_signatures(imported, opts.output)
Beispiel #29
0
def compare_sigs(sag_id, sag_file, mhr_path, sig_path, mg_sig_list,
                 jacc_threshold):
    """Recruit metagenome signatures that are similar to a SAG signature.

    If ``<sag_id>.mhr_recruits.tsv`` already exists in *mhr_path*, its rows
    are loaded and returned. Otherwise the SAG signature is loaded from (or
    built and saved to) ``<sag_id>.SAG.sig`` in *sig_path*, every signature
    in *mg_sig_list* with similarity >= *jacc_threshold* is recorded, and
    the recruit table is written.

    Returns a tuple of ``[sag_id, mg_name, mg_name_without_last_suffix]``
    rows.
    """
    sag_subcontigs = s_utils.get_seqs(sag_file)
    recruits_file = o_join(mhr_path, sag_id + '.mhr_recruits.tsv')

    if isfile(recruits_file):
        logging.info('[SABer]: Loading %s and MetaG signature recruit list\n' %
                     sag_id)
        with open(recruits_file, 'r') as mhr_in:
            cached_rows = [row.rstrip('\n').split('\t') for row in mhr_in]
        return tuple(cached_rows)

    # Calculate\Load MinHash Signatures with SourMash for SAG subseqs
    sag_sig_file = o_join(sig_path, sag_id + '.SAG.sig')
    if isfile(sag_sig_file):
        logging.info('[SABer]: Loading Signature for %s\n' % sag_id)
        sag_sig = sourmash.signature.load_one_signature(sag_sig_file)
    else:
        logging.info('[SABer]: Building Signature for %s\n' % sag_id)
        sag_minhash = sourmash.MinHash(n=0, ksize=51, scaled=100)
        for header in sag_subcontigs:
            sag_minhash.add_sequence(str(sag_subcontigs[header].seq),
                                     force=True)
        sag_sig = sourmash.SourmashSignature(sag_minhash, name=sag_id)
        with open(sag_sig_file, 'w') as sags_out:
            sourmash.signature.save_signatures([sag_sig], fp=sags_out)

    logging.info('[SABer]: Comparing  %s and MetaG signature\n' % sag_id)
    recruits = []
    for mg_sig in mg_sig_list:
        similarity = mg_sig.similarity(sag_sig)
        mg_name = mg_sig.name()
        if similarity < jacc_threshold:
            continue
        # third column: name with everything after the last '_' removed
        recruits.append([sag_id, mg_name, mg_name.rsplit('_', 1)[0]])

    with open(recruits_file, 'w') as mhr_out:
        mhr_out.write('\n'.join('\t'.join(row) for row in recruits))

    return tuple(recruits)

###

# Script: build a protein MinHash signature by hand for the first record of
# the sequence file named in sys.argv[1] and print the serialized signature
# to stdout. Progress messages go to stderr.

# k-mer size handed to MinHash; the k-mers actually hashed below have
# length K / 3.
K = 21

import sys, screed
import mmh3
import sourmash
# report which sourmash module was picked up
print('imported sourmash:', sourmash, file=sys.stderr)
from sourmash import MinHash
import sourmash.signature

# only the first record of the input file is used
record = next(iter(screed.open(sys.argv[1])))
print('loaded', record.name, file=sys.stderr)

mh = sourmash.MinHash(ksize=K, n=500, is_protein=True)
prot_ksize = int(K / 3)

# NOTE(review): `kmers` is not defined in the visible part of this file;
# presumably it yields the successive substrings of the given length --
# confirm where it comes from.
for kmer in kmers(record.sequence, prot_ksize):
    # first of the two values returned by mmh3.hash64, seeded with 42
    # NOTE(review): `hash` shadows the builtin of the same name.
    hash = mmh3.hash64(kmer, seed=42)[0]

    # convert to unsigned int if negative
    if hash < 0:
        hash += 2**64

    mh.add_hash(hash)

# NOTE(review): every other call in this file passes the MinHash as the first
# argument to SourmashSignature; here an empty string comes first. Confirm
# which sourmash version this script targets.
s = sourmash.signature.SourmashSignature('', mh, name=record.name)
print(sourmash.signature.save_signatures([s]))