Example 1
    def make_minhashes():
        seed = args.seed
        max_hash = 0
        if args.scaled and args.scaled > 1:
            max_hash = sourmash_lib.MAX_HASH / float(args.scaled)
            max_hash = int(round(max_hash, 0))

        # one minhash for each ksize
        Elist = []
        for k in ksizes:
            if args.protein:
                E = sourmash_lib.MinHash(ksize=k,
                                         n=args.num_hashes,
                                         is_protein=True,
                                         track_abundance=args.track_abundance,
                                         max_hash=max_hash,
                                         seed=seed)
                Elist.append(E)
            if args.dna:
                E = sourmash_lib.MinHash(ksize=k,
                                         n=args.num_hashes,
                                         is_protein=False,
                                         track_abundance=args.track_abundance,
                                         max_hash=max_hash,
                                         seed=seed)
                Elist.append(E)
        return Elist
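For context, the --scaled handling above turns a scaled factor into a
max_hash cutoff. A minimal sketch of that relationship, assuming
sourmash_lib.MAX_HASH is the largest possible 64-bit hash value:

import sourmash_lib

scaled = 1000
max_hash = int(round(sourmash_lib.MAX_HASH / float(scaled), 0))
# a hash h is kept iff h < max_hash, i.e. roughly 1/scaled of all
# possible hash values survive the cutoff.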
Example 2
def test_similarity_downsample(track_abundance):
    e = sourmash_lib.MinHash(n=0,
                             ksize=20,
                             track_abundance=track_abundance,
                             max_hash=2**63)
    f = sourmash_lib.MinHash(n=0,
                             ksize=20,
                             track_abundance=track_abundance,
                             max_hash=2**2)

    e.add_hash(1)
    e.add_hash(5)
    assert len(e.get_mins()) == 2

    f.add_hash(1)
    f.add_hash(5)  # should be discarded due to max_hash
    assert len(f.get_mins()) == 1

    ee = SourmashSignature(e)
    ff = SourmashSignature(f)

    with pytest.raises(ValueError):  # mismatch in max_hash
        ee.similarity(ff)

    x = ee.similarity(ff, downsample=True)
    assert round(x, 1) == 1.0
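A back-of-the-envelope check of what similarity(..., downsample=True)
does here: both sketches are cut down to the smaller max_hash before the
Jaccard estimate is taken. A hedged sketch with the test's values:

a = {1, 5}     # mins of e
b = {1}        # mins of f (5 was discarded by max_hash=2**2)
cutoff = 2**2  # the smaller of the two max_hash values
a = {h for h in a if h < cutoff}
b = {h for h in b if h < cutoff}
assert len(a & b) / len(a | b) == 1.0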
Example 3
def test_load_one_fail_multisig(track_abundance):
    e1 = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig1 = SourmashSignature(e1)

    e2 = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig2 = SourmashSignature(e2)

    x = save_signatures([sig1, sig2])

    with pytest.raises(ValueError):
        y = load_one_signature(x)
Example 4
def test_compare_ne(track_abundance):
    # same content, different names -> different
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)
    sig1 = SourmashSignature(e, name='foo')

    f = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    f.add("AT" * 10)
    sig2 = SourmashSignature(f, name='bar')

    assert sig1 != sig2
Example 5
def test_compare(track_abundance):
    # same content, same name -> equal
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)
    sig1 = SourmashSignature(e, name='foo')

    f = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    f.add("AT" * 10)
    sig2 = SourmashSignature(f, name='foo')

    assert e == f
    assert sig1 == sig2
Example 6
def test_compare_ne2_reverse(track_abundance):
    # same content, one has filename, other does not -> different
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)
    sig1 = SourmashSignature(e, name='foo')

    f = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    f.add("AT" * 10)
    sig2 = SourmashSignature(f, filename='b')

    assert sig2 != sig1
    assert sig1 != sig2
Example 7
def test_save_minified(track_abundance):
    e1 = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig1 = SourmashSignature(e1, name="foo")

    e2 = sourmash_lib.MinHash(n=1, ksize=25, track_abundance=track_abundance)
    sig2 = SourmashSignature(e2, name="bar baz")

    x = save_signatures([sig1, sig2])
    assert '\n' not in x
    assert len(x.split('\n')) == 1

    y = list(load_signatures(x))
    assert len(y) == 2
    assert any(sig.name() == 'foo' for sig in y)
    assert any(sig.name() == 'bar baz' for sig in y)
Example 8
def test_save_load_multisig(track_abundance):
    e1 = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig1 = SourmashSignature(e1)

    e2 = sourmash_lib.MinHash(n=1, ksize=25, track_abundance=track_abundance)
    sig2 = SourmashSignature(e2)

    x = save_signatures([sig1, sig2])
    y = list(load_signatures(x))

    print(x)

    assert len(y) == 2
    assert sig1 in y  # order not guaranteed, note.
    assert sig2 in y
    assert sig1 != sig2
Example 9
def test_save_load_multisig_json():
    e1 = sourmash_lib.MinHash(n=1, ksize=20)
    sig1 = SourmashSignature('', e1)

    e2 = sourmash_lib.MinHash(n=1, ksize=20)
    sig2 = SourmashSignature('', e2)

    x = save_signatures_json([sig1, sig2])
    y = list(load_signatures_json(x))

    print(x)

    assert len(y) == 2
    assert sig1 in y  # order not guaranteed, note.
    assert sig2 in y
    assert sig1 != sig2
Example 10
def test_load_one_succeed(track_abundance):
    e1 = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig1 = SourmashSignature(e1)

    x = save_signatures([sig1])

    y = load_one_signature(x)
    assert sig1 == y
Example 11
def test_hashable(track_abundance):
    # check: can we use signatures as keys in dictionaries and sets?
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)

    sig = SourmashSignature(e)

    x = set()
    x.add(sig)
Example 12
def test_sourmash_signature_api():
    e = sourmash.MinHash(n=1, ksize=20)
    sig = sourmash.SourmashSignature(e)

    s = sourmash.save_signatures([sig])
    sig_x1 = sourmash.load_one_signature(s)
    sig_x2 = list(sourmash.load_signatures(s))[0]

    assert sig_x1 == sig
    assert sig_x2 == sig
Example 13
def test_roundtrip(track_abundance):
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)
    sig = SourmashSignature(e)
    s = save_signatures([sig])
    siglist = list(load_signatures(s))
    sig2 = siglist[0]
    e2 = sig2.minhash

    assert sig.similarity(sig2) == 1.0
    assert sig2.similarity(sig) == 1.0
Example 14
def test_roundtrip_empty(track_abundance):
    # edge case, but: empty minhash? :)
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)

    sig = SourmashSignature(e)
    s = save_signatures([sig])
    siglist = list(load_signatures(s))
    sig2 = siglist[0]
    e2 = sig2.minhash

    assert sig.similarity(sig2) == 0
    assert sig2.similarity(sig) == 0
Example 15
def test_str(track_abundance):
    # signatures should be printable
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    e.add("AT" * 10)

    sig = SourmashSignature(e)

    print(sig)
    assert str(sig) == 'SourmashSignature(59502a74)'
    assert repr(sig) == 'SourmashSignature(59502a74)'

    sig.d['name'] = 'fizbar'
    assert str(sig) == 'SourmashSignature(\'fizbar\', 59502a74)'
    assert repr(sig) == 'SourmashSignature(\'fizbar\', 59502a74)'
Example 16
    def make_minhashes():
        seed = args.seed

        # one minhash for each ksize
        Elist = []
        for k in ksizes:
            if args.protein:
                E = sourmash_lib.MinHash(ksize=k,
                                         n=args.num_hashes,
                                         is_protein=True,
                                         track_abundance=args.track_abundance,
                                         scaled=args.scaled,
                                         seed=seed)
                Elist.append(E)
            if args.dna:
                E = sourmash_lib.MinHash(ksize=k,
                                         n=args.num_hashes,
                                         is_protein=False,
                                         track_abundance=args.track_abundance,
                                         scaled=args.scaled,
                                         seed=seed)
                Elist.append(E)
        return Elist
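This variant passes scaled= directly instead of computing max_hash by
hand as in Example 1. A hedged sketch of the equivalence the two
spellings are assumed to share:

import sourmash_lib

S = 1000
mh_scaled = sourmash_lib.MinHash(n=0, ksize=31, scaled=S)
mh_maxhash = sourmash_lib.MinHash(
    n=0, ksize=31,
    max_hash=int(round(sourmash_lib.MAX_HASH / float(S), 0)))
# both should keep (approximately) the same 1/S fraction of hashes.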
Example 17
def test_roundtrip_max_hash(track_abundance):
    e = sourmash_lib.MinHash(n=0,
                             ksize=20,
                             track_abundance=track_abundance,
                             max_hash=10)
    e.add_hash(5)
    sig = SourmashSignature(e)
    s = save_signatures([sig])
    siglist = list(load_signatures(s))
    sig2 = siglist[0]
    e2 = sig2.minhash

    assert e.max_hash == e2.max_hash

    assert sig.similarity(sig2) == 1.0
    assert sig2.similarity(sig) == 1.0
Example 18
def import_csv(args):
    "Import a CSV file full of signatures/hashes."
    p = argparse.ArgumentParser()
    p.add_argument('mash_csvfile')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='(default: stdout)')
    p.add_argument('--email',
                   type=str,
                   default='',
                   help='(default: %(default)s)')
    args = p.parse_args(args)

    with open(args.mash_csvfile, 'r') as fp:
        reader = csv.reader(fp)
        siglist = []
        for row in reader:
            hashfn = row[0]
            hashseed = int(row[1])

            # only support a limited import type, for now ;)
            assert hashfn == 'murmur64'
            assert hashseed == 42

            _, _, ksize, name, hashes = row
            ksize = int(ksize)

            hashes = hashes.strip()
            hashes = list(map(int, hashes.split(' ')))

            e = sourmash_lib.MinHash(len(hashes), ksize)
            e.add_many(hashes)
            s = sig.SourmashSignature(args.email, e, filename=name)
            siglist.append(s)
            notify('loaded signature: {} {}', name, s.md5sum()[:8])

        notify('saving {} signatures to JSON', len(siglist))
        sig.save_signatures(siglist, args.output)
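For reference, a hypothetical row of the Mash-style CSV this importer
expects (hash function, seed, k-mer size, name, then space-separated
hashes; all values made up):

murmur64,42,31,example.fa,1026 20495 55101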
Example 19
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--maxsize', type=float, default=20000)
    p.add_argument('--minsize', type=float, default=5000)
    p.add_argument('--min-abund', type=float, default=0)
    p.add_argument('-k',
                   '--ksize',
                   default=5,
                   type=int,
                   help='k-mer size for vectors')
    p.add_argument('--scaled', type=int, default=1000)
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))
    print('ksize: {}'.format(args.ksize))
    print('min_abund: {}'.format(args.min_abund))

    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    catlas = CAtlas(args.catlas_prefix,
                    load_sizefile=True,
                    min_abund=args.min_abund)
    catlas.decorate_with_shadow_sizes()

    # everything is loaded!

    # find highest nodes with kmer size less than given max_size
    print('finding terminal nodes for {}.'.format(args.maxsize))
    nodes = partition_catlas(catlas, args.maxsize)

    nodes = {n for n in nodes if catlas.kmer_sizes[n] > args.minsize}

    print('{} nodes between {} and {} in k-mer size'.format(
        len(nodes), args.minsize, args.maxsize))
    print('containing {} level1 nodes of {} total'.format(
        len(catlas.shadow(nodes)), sum(map(len,
                                           catlas.layer1_to_cdbg.values()))))

    node_kmers = sum([catlas.kmer_sizes[n] for n in nodes])
    total_kmers = catlas.kmer_sizes[catlas.root]
    print('containing {} kmers of {} total ({:.1f}%)'.format(
        node_kmers, total_kmers, node_kmers / total_kmers * 100))

    # now build cdbg -> subtree/group ID

    cdbg_to_group = {}
    for n in nodes:
        shadow = catlas.shadow([n])
        for cdbg_id in shadow:
            # TODO remove cdbg vertices with no kmers
            # for cdbg_id in catlas.layer1_to_cdbg[level1_node]:
            # if cdbg_id in catlas.kmer_sizes:
            assert cdbg_id not in cdbg_to_group
            cdbg_to_group[cdbg_id] = n

    # record group info - here we are using the MinHash class to track
    # k-mer abundances in group_info, as well as using group_ident
    # to track k=31 MinHashes for identification of each group.
    group_info = {}
    group_ident = {}
    for n in nodes:
        group_info[n] = sourmash_lib.MinHash(n=0,
                                             ksize=args.ksize,
                                             scaled=1,
                                             track_abundance=1)
        group_ident[n] = sourmash_lib.MinHash(n=0,
                                              ksize=31,
                                              scaled=args.scaled)

    # aaaaaand iterate over contigs, collecting abundances from all contigs
    # in a group.
    for record_n, record in enumerate(screed.open(contigs)):
        if record_n % 10000 == 0:
            print('...', record_n, end='\r')
        cdbg_id = int(record.name)
        group_id = cdbg_to_group.get(cdbg_id)

        # if this is under a node that meets minsize criteria, track:
        if group_id is not None:
            # keep/measure abundances! @CTB are we actually doing anything with abundance?
            mh = group_info[group_id]
            mh.add_sequence(record.sequence, True)

            # update group idents.
            group_ident[group_id].add_sequence(record.sequence, True)

    # ok, now we have a pile of k-mer vectors of size 4**args.ksize;
    # output in numpy format.
    compute_matrix(group_info, group_ident, args.ksize, args.output)
Example 20
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--keep-fraction', type=float, default=0.1)
    p.add_argument('-k',
                   '--ksize',
                   default=31,
                   type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = search_utils.load_dag(
        catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    node_shadow_sizes = search_utils.decorate_catlas_with_shadow_sizes(
        layer1_to_cdbg, dag, dag_levels)

    # ...and load cdbg node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = search_utils.load_cdbg_size_info(
        args.catlas_prefix)

    # decorate catlas with cdbg node sizes underneath them
    print('decorating catlas with contig size info.')
    node_kmer_sizes, node_weighted_kmer_sizes = search_utils.decorate_catlas_with_kmer_sizes(
        layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes,
        cdbg_weighted_kmer_sizes)

    ### ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        node_list = set()
        for sub_id in dag[node_id]:
            # shadow size
            size = node_kmer_sizes[sub_id]

            if size < max_size:
                node_list.add(sub_id)
            else:
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(top_node_id, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = {n for n in terminal if node_kmer_sizes[n] > args.minsize}
    print('...down to {} between {} and {} in size.'.format(
        len(terminal), args.minsize, args.maxsize))

    # now, go through and calculate ratios
    x = []
    for node_id in terminal:
        # calculate: how many k-mers per cDBG node?
        kmer_size = node_kmer_sizes[node_id]
        shadow_size = node_shadow_sizes[node_id]

        ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)

        # track basic info
        x.append((ratio, node_id, shadow_size, kmer_size))

    print('terminal node stats for maxsize: {:g}'.format(args.maxsize))
    print('n tnodes:', len(terminal))
    print('total k-mers:', node_kmer_sizes[top_node_id])

    x.sort(reverse=True)
    for (k, v, a, b) in x[:10]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)
    print('... eliding {} nodes'.format(len(x) - 20))
    for (k, v, a, b) in x[-10:]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)

    # keep the last keep-fraction (default 10%) for examination
    keep_sum_kmer = args.keep_fraction * node_kmer_sizes[top_node_id]
    sofar = 0
    keep_terminal = set()
    for (k, v, a, b) in reversed(x):
        sofar += b
        if sofar > keep_sum_kmer:
            break
        keep_terminal.add(v)

    print(
        'keeping last {} k-mers worth of nodes for examination.'.format(sofar))

    # build cDBG shadow ID list.
    cdbg_shadow = set()
    terminal_shadow = find_shadow(keep_terminal, dag)
    for node in terminal_shadow:
        cdbg_shadow.update(layer1_to_cdbg.get(node))

    #### extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    outfp = open(args.output, 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_shadow)
            print('...at n {} ({:.1f}% of shadow)'.format(
                total_seqs, offset_f * 100),
                  end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        if contig_id not in cdbg_shadow:
            continue

        outfp.write('>{}\n{}\n'.format(record.name, record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))

    print('wrote contigs to {}'.format(args.output))
    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(contigs_mh)
        sourmash_lib.save_signatures([ss], fp)
Example 21
###

K = 21

import sys, screed
import mmh3
import sourmash_lib
print('imported sourmash:', sourmash_lib, file=sys.stderr)
from sourmash_lib import MinHash
import sourmash_lib.signature


# minimal helpers this snippet assumes: k-mer iteration and DNA
# reverse-complementing.
def kmers(seq, ksize):
    for i in range(len(seq) - ksize + 1):
        yield seq[i:i + ksize]


def reverse(seq):
    return "".join(reversed(seq))


def complement(seq):
    return seq.translate(str.maketrans("ACGT", "TGCA"))


record = next(iter(screed.open(sys.argv[1])))
print('loaded', record.name, file=sys.stderr)
revcomp = reverse(complement(record.sequence))

mh = sourmash_lib.MinHash(ksize=K, n=500, is_protein=False)

#
# compute the actual hashes to insert by breaking down the sequence
# into k-mers and applying MurmurHash to each one; here, the only
# interesting thing that is done by add_hash is to keep only the
# (numerically) lowest n=500 hashes.
#
# this method of hash computation is exactly how sourmash does it
# internally, and should be approximately the same as what mash does.
#

for fwd_kmer in kmers(record.sequence, K):
    rev_kmer = reverse(complement(fwd_kmer))
    if fwd_kmer < rev_kmer:
        kmer = fwd_kmer
    else:
        kmer = rev_kmer

    # hash the canonical k-mer with murmur64, seed 42 (sourmash's
    # default seed), and convert the signed result to the unsigned
    # value sourmash stores.
    hash = mmh3.hash64(kmer, seed=42)[0]
    if hash < 0:
        hash += 2**64

    mh.add_hash(hash)
Example 22
def _json_next_signature(iterable,
                         name=None,
                         filename=None,
                         ignore_md5sum=False,
                         prefix_item='abundances.item',
                         ijson=ijson):
    """Helper function to unpack and check one signature block only.
    - iterable: an iterable such the one returned by ijson.parse()
    - name:
    - filename:
    - ignore_md5sum:
    - prefix_item: required when parsing nested JSON structures
    - ijson: ijson backend to use.
    """
    from .signature import SourmashSignature

    d = dict()
    prefix, event, value = next(iterable)
    if event == 'start_map':
        prefix, event, value = next(iterable)
    while event != 'end_map':
        key = value
        if key == 'mins':
            value = _json_next_atomic_array(iterable,
                                            prefix_item=prefix_item,
                                            ijson=ijson)
        elif key == 'abundances':
            value = _json_next_atomic_array(iterable,
                                            prefix_item=prefix_item,
                                            ijson=ijson)
        else:
            prefix, event, value = next(iterable)
        d[key] = value
        prefix, event, value = next(iterable)

    ksize = d['ksize']
    mins = d['mins']
    n = d['num']
    if n == 0xffffffff:  # load legacy signatures where n == -1
        n = 0
    max_hash = d.get('max_hash', 0)
    seed = d.get('seed', sourmash_lib.DEFAULT_SEED)

    molecule = d.get('molecule', 'DNA')
    if molecule == 'protein':
        is_protein = True
    elif molecule.upper() == 'DNA':
        is_protein = False
    else:
        raise Exception("unknown molecule type: {}".format(molecule))

    track_abundance = False
    if 'abundances' in d:
        track_abundance = True

    e = sourmash_lib.MinHash(ksize=ksize,
                             n=n,
                             is_protein=is_protein,
                             track_abundance=track_abundance,
                             max_hash=max_hash,
                             seed=seed)

    if not track_abundance:
        for m in mins:
            e.add_hash(m)
    else:
        abundances = list(map(int, d['abundances']))
        e.set_abundances(dict(zip(mins, abundances)))

    sig = SourmashSignature(e)

    if not ignore_md5sum:
        md5sum = d['md5sum']
        if md5sum != sig.md5sum():
            raise Exception('error loading - md5 of minhash does not match')

    if name:
        sig.d['name'] = name
    if filename:
        sig.d['filename'] = filename

    return sig
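A hedged sketch of the per-signature JSON block this helper unpacks;
the field names come from the code above, the values are made up:

sig_block = {
    'ksize': 21,
    'num': 500,
    'seed': 42,
    'max_hash': 0,
    'molecule': 'DNA',
    'mins': [1026, 20495, 55101],
    'abundances': [3, 1, 2],  # optional; presence enables track_abundance
    'md5sum': '59502a74',     # checked against sig.md5sum() unless ignored
}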
Example 23
def test_name_3(track_abundance):
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig = SourmashSignature(e, name='foo', filename='foo.txt')
    assert sig.name() == 'foo'
Example 24
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('query')
    p.add_argument('output')
    p.add_argument('--threshold', default=0.0, type=float)
    p.add_argument('--minsize', default=0, type=int)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('threshold: {:.3f}'.format(args.threshold))

    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = search_utils.load_dag(catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    node_shadow_sizes = search_utils.decorate_catlas_with_shadow_sizes(layer1_to_cdbg, dag, dag_levels)

    # ...and load cdbg node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = search_utils.load_cdbg_size_info(args.catlas_prefix)

    # decorate catlas with cdbg node sizes underneath them
    print('decorating catlas with contig size info.')
    node_kmer_sizes, node_weighted_kmer_sizes = search_utils.decorate_catlas_with_kmer_sizes(layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes, cdbg_weighted_kmer_sizes)

    # load k-mer index, query, etc. etc.
    kmer_idx = search_utils.load_kmer_index(args.catlas_prefix)

    bf = khmer.Nodetable(args.ksize, 1, 1)

    query_kmers = set()
    for record in screed.open(args.query):
        query_kmers.update(bf.get_kmer_hashes(record.sequence))

    print('got {} k-mers from {}'.format(len(query_kmers), args.query))

    # construct dict cdbg_id -> # of query k-mers
    cdbg_match_counts = kmer_idx.get_match_counts(query_kmers)

    total_match_kmers = sum(cdbg_match_counts.values())
    f_found = total_match_kmers / len(query_kmers)
    print('=> containment: {:.1f}%'.format(f_found * 100))
    print('done loading & counting query k-mers in cDBG.')

    if total_match_kmers == 0:
        print('no match k-mers!?')
        sys.exit(-1)

    # calculate the cDBG matching k-mers sizes for each catlas node.
    catlas_match_counts = kmer_idx.build_catlas_match_counts(cdbg_match_counts, dag, dag_levels, layer1_to_cdbg)

    ### ok, the real work: find nodes that have low # of k-mers in the query.
    def find_unassembled_nodes(node_id, threshold=0.0):
        node_list = set()
        for sub_id in dag[node_id]:
            n_matched = catlas_match_counts.get(sub_id, 0)
            size = node_kmer_sizes[sub_id]

            f_assembled = n_matched / size

            # if the fraction of unassembled k-mers under this node is below
            # our threshold, KEEP the node. Otherwise, descend into children.
            if f_assembled <= threshold:
                node_list.add(sub_id)
            else:
                children = find_unassembled_nodes(sub_id, threshold)
                node_list.update(children)

        return node_list

    print('finding unassembled nodes for threshold {}.'.format(args.threshold))

    terminal = find_unassembled_nodes(top_node_id, args.threshold)
    sum_kmers = sum([ node_kmer_sizes[n] for n in terminal ])
    sum_match_kmers = sum([ catlas_match_counts.get(n, 0) for n in terminal ])
    print('...got {} nodes, representing {} k-mers'.format(len(terminal), sum_kmers))

    # now, go through all nodes and print out characteristics
    print('writing node info to {}'.format(args.output + '.csv'))
    with open(args.output + '.csv', 'wt') as fp:
        w = csv.writer(fp)

        w.writerow(['node_id', 'contained', 'n_kmers', 'n_weighted_kmers', 'average_weight','shadow_size'])
        for n in terminal:
            f_contained = catlas_match_counts.get(n, 0) / node_kmer_sizes[n]
            w.writerow([n,
                        '{:.3f}'.format(f_contained),
                        node_kmer_sizes[n],
                        '{:.1f}'.format(node_weighted_kmer_sizes[n]),
                        '{:.2f}'.format(node_weighted_kmer_sizes[n] / node_kmer_sizes[n]),
                        node_shadow_sizes[n]])

    if args.minsize:
        print('minsize set: {}. filtering.'.format(args.minsize))
        new_terminal = set()
        for n in terminal:
            if node_kmer_sizes[n] >= args.minsize:
                new_terminal.add(n)

        print('removed {} nodes => {}'.format(len(terminal)-len(new_terminal),
                                              len(new_terminal)))
        terminal = new_terminal

    # build cDBG shadow ID list, tagged by parent catlas node.
    cdbg_id_to_node = {}
    for n in terminal:
        this_shadow = find_shadow([n], dag)
        for x in this_shadow:
            v = layer1_to_cdbg[x]
            for vv in v:
                cdbg_id_to_node[vv] = n

    #### extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    print('writing contigs to {}'.format(args.output + '.fa'))
    outfp = open(args.output + '.fa', 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_id_to_node)
            print('...at n {} ({:.1f}% of shadow)'.format(total_seqs,
                  offset_f * 100),
                  end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        catlas_parent = cdbg_id_to_node.get(contig_id)
        if catlas_parent is None:
            continue

        outfp.write('>{} {}\n{}\n'.format(record.name, catlas_parent, record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))

    print('writing sig to {}'.format(args.output + '.sig'))
    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(contigs_mh)
        sourmash_lib.save_signatures([ss], fp)
Example 25
 def build_new_signature(mins):
     e = sourmash_lib.MinHash(ksize=query_ksize, n=len(mins))
     e.add_many(mins)
     return sig.SourmashSignature('', e)
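A hedged usage sketch, assuming query_ksize and the sig module are in
scope as in the surrounding gather code:

mins = [1026, 20495, 55101]
ss = build_new_signature(mins)
assert set(ss.minhash.get_mins()) == set(mins)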
Example 26
def gather(args):
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBestIgnoreMaxHash

    parser = argparse.ArgumentParser()
    parser.add_argument('query', help='query signature')
    parser.add_argument('databases',
                        help='signatures/SBTs to search',
                        nargs='+')
    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('wt'),
                        help='output CSV containing matches to this file')
    parser.add_argument(
        '--save-matches',
        type=argparse.FileType('wt'),
        help='save the matched signatures from the database to this file.')
    parser.add_argument('--threshold-bp',
                        type=float,
                        default=5e4,
                        help='threshold (in bp) for reporting results')
    parser.add_argument(
        '--output-unassigned',
        type=argparse.FileType('wt'),
        help=
        'output unassigned portions of the query as a signature to this file')
    parser.add_argument('--scaled',
                        type=float,
                        help='downsample query to this scaled factor')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='suppress non-error output')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    # load the query signature & figure out all the things
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.minhash.ksize
    notify('loaded query: {}... (k={}, {})',
           query.name()[:30], query_ksize, query_moltype)

    # verify signature was computed right.
    if query.minhash.max_hash == 0:
        error('query signature needs to be created with --scaled')
        sys.exit(-1)

    # downsample if requested
    if args.scaled:
        notify('downsampling query from scaled={} to {}', query.minhash.scaled,
               int(args.scaled))
        query.minhash = query.minhash.downsample_scaled(args.scaled)

    # empty?
    if not query.minhash.get_mins():
        error('no query hashes!? exiting.')
        sys.exit(-1)

    # set up the search databases
    databases = sourmash_args.load_sbts_and_sigs(args.databases, query_ksize,
                                                 query_moltype)

    if not len(databases):
        error('Nothing found to search!')
        sys.exit(-1)

    orig_query = query
    orig_mins = orig_query.minhash.get_hashes()

    # calculate the band size/resolution R for the genome
    R_metagenome = sourmash_lib.MAX_HASH / float(orig_query.minhash.max_hash)

    # define a function to do a 'best' search and get only top match.
    def find_best(dblist, query):
        results = []
        for (sbt_or_siglist, filename, is_sbt) in dblist:
            search_fn = SearchMinHashesFindBestIgnoreMaxHash().search

            if is_sbt:
                tree = sbt_or_siglist

                for leaf in tree.find(search_fn, query, 0.0):
                    leaf_e = leaf.data.minhash
                    similarity = query.minhash.similarity_ignore_maxhash(
                        leaf_e)
                    if similarity > 0.0:
                        results.append((similarity, leaf.data, filename))
            else:
                for ss in sbt_or_siglist:
                    similarity = query.minhash.similarity_ignore_maxhash(
                        ss.minhash)
                    if similarity > 0.0:
                        results.append((similarity, ss, filename))

        if not results:
            return None, None, None

        # take the best result, carrying along the filename of the
        # database it came from (rather than whichever database happened
        # to be searched last).
        results.sort(key=lambda x: -x[0])  # reverse sort on similarity
        best_similarity, best_leaf, best_filename = results[0]
        return best_similarity, best_leaf, best_filename

    # define a function to build new signature object from set of mins
    def build_new_signature(mins):
        e = sourmash_lib.MinHash(ksize=query_ksize, n=len(mins))
        e.add_many(mins)
        return sig.SourmashSignature('', e)

    # format a base-pair count as a human-readable string.
    def format_bp(bp):
        bp = float(bp)
        if bp < 500:
            return '{:.0f} bp '.format(bp)
        elif bp < 500e3:
            return '{:.1f} kbp'.format(round(bp / 1e3, 1))
        elif bp < 500e6:
            return '{:.1f} Mbp'.format(round(bp / 1e6, 1))
        elif bp < 500e9:
            return '{:.1f} Gbp'.format(round(bp / 1e9, 1))
        return '???'
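    # e.g. format_bp(50000) -> '50.0 kbp'; format_bp(2.5e6) -> '2.5 Mbp'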

    # construct a new query that doesn't have the max_hash attribute set.
    new_mins = query.minhash.get_hashes()
    query = build_new_signature(new_mins)

    sum_found = 0.
    found = []
    GatherResult = namedtuple(
        'GatherResult',
        'intersect_bp, f_orig_query, f_match, f_unique_to_query, filename, name, md5, leaf'
    )
    while True:
        best_similarity, best_leaf, filename = find_best(databases, query)
        if not best_leaf:  # no matches at all!
            break

        # subtract found hashes from search hashes, construct new search
        query_mins = set(query.minhash.get_hashes())
        found_mins = best_leaf.minhash.get_hashes()

        # figure out what the resolution of the banding on the genome is,
        # based either on an explicit --scaled parameter, or on genome
        # cardinality (deprecated)
        if not best_leaf.minhash.max_hash:
            error('Best hash match in sbt_gather has no max_hash')
            error('Please prepare database of sequences with --scaled')
            sys.exit(-1)

        R_genome = best_leaf.minhash.scaled

        # pick the highest R / lowest resolution
        R_comparison = max(R_metagenome, R_genome)

        # CTB: these could probably be replaced by minhash.downsample_scaled.
        new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison)
        query_mins = set([i for i in query_mins if i < new_max_hash])
        found_mins = set([i for i in found_mins if i < new_max_hash])
        orig_mins = set([i for i in orig_mins if i < new_max_hash])

        # calculate intersection:
        intersect_mins = query_mins.intersection(found_mins)
        intersect_orig_mins = orig_mins.intersection(found_mins)
        intersect_bp = R_comparison * len(intersect_orig_mins)
        sum_found += len(intersect_mins)

        if intersect_bp < args.threshold_bp:  # hard cutoff for now
            notify('found less than {} in common. => exiting',
                   format_bp(intersect_bp))
            break

        # calculate fractions wrt first denominator - genome size
        genome_n_mins = len(found_mins)
        f_match = len(intersect_mins) / float(genome_n_mins)
        f_orig_query = len(intersect_orig_mins) / float(len(orig_mins))

        # calculate fractions wrt second denominator - metagenome size
        query_n_mins = len(orig_query.minhash.get_hashes())
        f_unique_to_query = len(intersect_mins) / float(query_n_mins)

        if not len(found):  # first result? print header.
            print_results("")
            print_results("overlap     p_query p_match ")
            print_results("---------   ------- --------")

        result = GatherResult(intersect_bp=intersect_bp,
                              f_orig_query=f_orig_query,
                              f_match=f_match,
                              f_unique_to_query=f_unique_to_query,
                              filename=filename,
                              md5=best_leaf.md5sum(),
                              name=best_leaf.name(),
                              leaf=best_leaf)

        # print interim result & save in a list for later use
        pct_query = '{:.1f}%'.format(result.f_orig_query * 100)
        pct_genome = '{:.1f}%'.format(result.f_match * 100)

        name = result.leaf._display_name(40)

        print_results('{:9}   {:>6}  {:>6}      {}',
                      format_bp(result.intersect_bp), pct_query, pct_genome,
                      name)
        found.append(result)

        # construct a new query, minus the previous one.
        query_mins -= set(found_mins)
        query = build_new_signature(query_mins)

    # basic reporting
    print_results('\nfound {} matches total;', len(found))

    sum_found /= len(orig_query.minhash.get_hashes())
    print_results('the recovered matches hit {:.1f}% of the query',
                  sum_found * 100)
    print_results('')

    if not found:
        sys.exit(0)

    if args.output:
        fieldnames = [
            'intersect_bp', 'f_orig_query', 'f_match', 'f_unique_to_query',
            'name', 'filename', 'md5'
        ]
        w = csv.DictWriter(args.output, fieldnames=fieldnames)
        w.writeheader()
        for result in found:
            d = dict(result._asdict())
            del d['leaf']  # actual signature not in CSV.
            w.writerow(d)

    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([r.leaf for r in found], args.save_matches)

    if args.output_unassigned:
        if not found:
            notify('nothing found - entire query signature unassigned.')
        elif not query.minhash.get_mins():
            notify('no unassigned hashes! not saving.')
        else:
            outname = args.output_unassigned.name
            notify('saving unassigned hashes to "{}"', outname)

            e = sourmash_lib.MinHash(ksize=query_ksize,
                                     n=0,
                                     max_hash=new_max_hash)
            e.add_many(query.minhash.get_mins())
            sig.save_signatures([sig.SourmashSignature('', e)],
                                args.output_unassigned)
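The while loop above is a greedy set-cover: repeatedly take the
best-overlapping match, subtract its hashes from the query, and stop
once the overlap drops below a threshold. A minimal self-contained
sketch of that idea (toy sets, not the sourmash API):

def greedy_gather(query_hashes, references, threshold=1):
    # references: list of (name, set_of_hashes) pairs.
    query = set(query_hashes)
    found = []
    while query:
        best = max(references, key=lambda r: len(query & r[1]),
                   default=None)
        if best is None:
            break
        name, ref_hashes = best
        overlap = query & ref_hashes
        if len(overlap) < threshold:
            break
        found.append((name, len(overlap)))
        query -= ref_hashes
    return found

print(greedy_gather({1, 2, 3, 4}, [('a', {1, 2}), ('b', {3})]))
# -> [('a', 2), ('b', 1)]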
Example 27
def test_name_4(track_abundance):
    e = sourmash_lib.MinHash(n=1, ksize=20, track_abundance=track_abundance)
    sig = SourmashSignature(e)
    assert sig.name() == sig.md5sum()[:8]
Example 28
def watch(args):
    "Build a signature from raw FASTA/FASTQ coming in on stdin, search."
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('inp_file', nargs='?', default='/dev/stdin')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('wt'),
                        help='save signature generated from data here')
    parser.add_argument('--threshold',
                        default=0.05,
                        type=float,
                        help='minimum threshold for matches')
    parser.add_argument(
        '--input-is-protein',
        action='store_true',
        help='Consume protein sequences - no translation needed')
    sourmash_args.add_construct_moltype_args(parser)
    parser.add_argument(
        '-n',
        '--num-hashes',
        type=int,
        default=DEFAULT_N,
        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name',
                        type=str,
                        default='stdin',
                        help='name to use for generated signature')
    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    args = parser.parse_args(args)
    set_quiet(args.quiet)

    if args.input_is_protein and args.dna:
        notify('WARNING: input is protein, turning off DNA hashing.')
        args.dna = False
        args.protein = True

    if args.dna and args.protein:
        notify('ERROR: cannot use "watch" with both DNA and protein.')
        sys.exit(-1)

    if args.dna:
        moltype = 'DNA'
        is_protein = False
    else:
        moltype = 'protein'
        is_protein = True

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

    def get_ksize(tree):
        """Walk nodes in `tree` to find out ksize"""
        for node in tree.nodes.values():
            if isinstance(node, sourmash_lib.sbtmh.SigLeaf):
                return node.data.minhash.ksize

    # deduce ksize from the SBT we are loading
    ksize = args.ksize
    if ksize is None:
        ksize = get_ksize(tree)

    E = sourmash_lib.MinHash(ksize=ksize,
                             n=args.num_hashes,
                             is_protein=is_protein)
    streamsig = sig.SourmashSignature('', E, filename='stdin', name=args.name)

    notify('Computing signature for k={}, {} from stdin', ksize, moltype)

    def do_search():
        search_fn = SearchMinHashesFindBest().search

        results = []
        for leaf in tree.find(search_fn, streamsig, args.threshold):
            results.append((streamsig.similarity(leaf.data), leaf.data))

        return results

    notify('reading sequences from stdin')
    screed_iter = screed.open(args.inp_file)
    watermark = WATERMARK_SIZE

    # iterate over input records
    n = 0
    for n, record in enumerate(screed_iter):
        # at each watermark, print status & check cardinality
        if n >= watermark:
            notify('\r... read {} sequences', n, end='')
            watermark += WATERMARK_SIZE

            if do_search():
                break

        if args.input_is_protein:
            E.add_protein(record.sequence)
        else:
            E.add_sequence(record.sequence, False)

    results = do_search()
    if not results:
        notify('... read {} sequences, no matches found.', n)
    else:
        results.sort(key=lambda x: -x[0])  # take best
        similarity, found_sig = results[0]
        print_results('FOUND: {}, at {:.3f}', found_sig.name(), similarity)

    if args.output:
        notify('saving signature to {}', args.output.name)
        sig.save_signatures([streamsig], args.output)
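watch() batches the comparatively expensive SBT search so that it runs
once per WATERMARK_SIZE reads instead of once per read. A minimal
sketch of that batching pattern (stand-in names, not the sourmash API):

def stream_and_search(records, do_search, watermark_size=10000):
    watermark = watermark_size
    for n, _record in enumerate(records):
        if n >= watermark:           # every watermark_size records...
            watermark += watermark_size
            if do_search():          # ...search, and stop early on a hit
                return n
    return None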
Example 29
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--minsize', type=float, default=2000)
    p.add_argument('--min-abund', type=float, default=0)
    p.add_argument('-k',
                   '--ksize',
                   default=5,
                   type=int,
                   help='k-mer size for vectors')
    p.add_argument('--scaled', type=int, default=1000)
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))
    print('ksize: {}'.format(args.ksize))
    print('min_abund: {}'.format(args.min_abund))

    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = search_utils.load_dag(
        catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # find the contigs filename
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # ...and catlas node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = search_utils.load_cdbg_size_info(
        args.catlas_prefix, min_abund=args.min_abund)
    node_kmer_sizes, node_weighted_kmer_sizes = search_utils.decorate_catlas_with_kmer_sizes(
        layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes,
        cdbg_weighted_kmer_sizes)

    ### everything is loaded!

    # find highest nodes with kmer size less than given max_size
    print('finding terminal nodes for {}.'.format(args.maxsize))
    nodes = partition_catlas(dag, top_node_id, node_kmer_sizes, args.maxsize)

    nodes = {n for n in nodes if node_kmer_sizes[n] > args.minsize}

    print('{} nodes between {} and {} in k-mer size'.format(
        len(nodes), args.minsize, args.maxsize))
    print('containing {} level1 nodes of {} total'.format(
        len(find_shadow(nodes, dag)), len(layer1_to_cdbg)))

    node_kmers = sum([node_kmer_sizes[n] for n in nodes])
    print('containing {} kmers of {} total ({:.1f}%)'.format(
        node_kmers, node_kmer_sizes[top_node_id],
        node_kmers / node_kmer_sizes[top_node_id] * 100))

    ### now build cdbg -> subtree/group ID

    cdbg_to_group = {}
    for n in nodes:
        shadow = find_shadow([n], dag)
        for level1_node in shadow:
            for cdbg_id in layer1_to_cdbg[level1_node]:
                if cdbg_kmer_sizes.get(cdbg_id):
                    assert cdbg_id not in cdbg_to_group
                    cdbg_to_group[cdbg_id] = n

    # record group info - here we are using the MinHash class to track
    # k-mer abundances in group_info, as well as using group_ident
    # to track k=31 MinHashes for identification of each group.
    group_info = {}
    group_ident = {}
    for n in nodes:
        group_info[n] = sourmash_lib.MinHash(n=0,
                                             ksize=args.ksize,
                                             scaled=1,
                                             track_abundance=1)
        group_ident[n] = sourmash_lib.MinHash(n=0,
                                              ksize=31,
                                              scaled=args.scaled)

    # aaaaaand iterate over contigs, collecting abundances from all contigs
    # in a group.
    for record_n, record in enumerate(screed.open(contigs)):
        if record_n % 10000 == 0:
            print('...', record_n, end='\r')
        cdbg_id = int(record.name)
        group_id = cdbg_to_group.get(cdbg_id)

        # if this is under a node that meets minsize criteria, track:
        if group_id is not None:
            # keep/measure abundances!
            mh = group_info[group_id]
            mh.add_sequence(record.sequence, True)

            # update group idents.
            group_ident[group_id].add_sequence(record.sequence, True)

    # ok, now we have a pile of k-mer vectors of size 4**args.ksize;
    # output in numpy format.

    # first, make a consistently ordered list of all k-mers, and convert
    # them into hashes.
    all_kmers = make_all(args.ksize)
    all_kmer_hashes = list(set([hash_murmur(i) for i in all_kmers]))
    all_kmer_hashes.sort()

    # now, build a matrix of GROUP_N rows x 4**ksize columns, where each
    # row will be the set of k-mer abundances associated with each group.
    print('creating', len(group_info), 4**args.ksize)
    V = numpy.zeros((len(group_info), 4**args.ksize), dtype=numpy.uint16)
    node_id_to_group_idx = {}
    for i, n in enumerate(group_info):
        if i % 1000 == 0:
            print('...', i, len(group_info))
        mh = group_info[n]
        vec = dict(mh.get_mins(with_abundance=True))
        vec = [vec.get(hashval, 0) for hashval in all_kmer_hashes]
        vec = numpy.array(vec)
        V[i] = vec

        node_id_to_group_idx[n] = i

    # save!
    print('saving matrix of size {} to {}'.format(str(V.shape), args.output))
    with open(args.output, 'wb') as fp:
        numpy.save(fp, V)

    with open(args.output + '.node_ids', 'wb') as fp:
        pickle.dump(node_id_to_group_idx, fp)

    with open(args.output + '.node_mh', 'wb') as fp:
        pickle.dump(group_ident, fp)
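A hedged sketch of the k-mer-to-column mapping used above, assuming
make_all(k) enumerates all 4**k DNA k-mers and that hash_murmur is
importable from sourmash_lib (as the code above appears to assume):

import itertools
from sourmash_lib import hash_murmur  # assumed import path

k = 2
all_kmers = [''.join(p) for p in itertools.product('ACGT', repeat=k)]
all_kmer_hashes = sorted({hash_murmur(km) for km in all_kmers})
# one matrix column per hash, in this sorted order; at 4**k = 16
# k-mers, hash collisions are vanishingly unlikely.
assert len(all_kmer_hashes) == 4**k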
Example 30
def gather(args):
    from .search import gather_databases, format_bp

    parser = argparse.ArgumentParser()
    parser.add_argument('query', help='query signature')
    parser.add_argument('databases',
                        help='signatures/SBTs to search',
                        nargs='+')
    parser.add_argument('--traverse-directory',
                        action='store_true',
                        help='search all signatures underneath directories.')
    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('wt'),
                        help='output CSV containing matches to this file')
    parser.add_argument(
        '--save-matches',
        type=argparse.FileType('wt'),
        help='save the matched signatures from the database to this file.')
    parser.add_argument('--threshold-bp',
                        type=float,
                        default=5e4,
                        help='threshold (in bp) for reporting results')
    parser.add_argument(
        '--output-unassigned',
        type=argparse.FileType('wt'),
        help=
        'output unassigned portions of the query as a signature to this file')
    parser.add_argument('--scaled',
                        type=float,
                        default=0,
                        help='downsample query to this scaled factor')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='suppress non-error output')
    parser.add_argument('--ignore-abundance',
                        action='store_true',
                        help='do NOT use k-mer abundances if present')
    parser.add_argument('-d', '--debug', action='store_true')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    # load the query signature & figure out all the things
    query = sourmash_args.load_query_signature(args.query,
                                               ksize=args.ksize,
                                               select_moltype=moltype)
    notify('loaded query: {}... (k={}, {})',
           query.name()[:30], query.minhash.ksize,
           sourmash_args.get_moltype(query))

    # verify signature was computed right.
    if query.minhash.max_hash == 0:
        error('query signature needs to be created with --scaled')
        sys.exit(-1)

    # downsample if requested
    if args.scaled:
        notify('downsampling query from scaled={} to {}', query.minhash.scaled,
               int(args.scaled))
        query.minhash = query.minhash.downsample_scaled(args.scaled)

    # empty?
    if not query.minhash.get_mins():
        error('no query hashes!? exiting.')
        sys.exit(-1)

    # set up the search databases
    databases = sourmash_args.load_sbts_and_sigs(args.databases, query, False,
                                                 args.traverse_directory)

    if not len(databases):
        error('Nothing found to search!')
        sys.exit(-1)

    found = []
    for result, weighted_missed, new_max_hash, next_query in gather_databases(
            query, databases, args.threshold_bp, args.ignore_abundance):
        if not len(found):  # first result? print header.
            print_results("")
            print_results("overlap     p_query p_match ")
            print_results("---------   ------- --------")

        # print interim result & save in a list for later use
        pct_query = '{:.1f}%'.format(result.f_unique_weighted * 100)
        pct_genome = '{:.1f}%'.format(result.f_match * 100)

        name = result.leaf._display_name(40)

        print_results('{:9}   {:>6}  {:>6}      {}',
                      format_bp(result.intersect_bp), pct_query, pct_genome,
                      name)
        found.append(result)

    # basic reporting
    print_results('\nfound {} matches total;', len(found))

    print_results('the recovered matches hit {:.1f}% of the query',
                  (1 - weighted_missed) * 100)
    print_results('')

    if not found:
        sys.exit(0)

    if args.output:
        fieldnames = [
            'intersect_bp', 'f_orig_query', 'f_match', 'f_unique_to_query',
            'f_unique_weighted', 'average_abund', 'name', 'filename', 'md5'
        ]
        w = csv.DictWriter(args.output, fieldnames=fieldnames)
        w.writeheader()
        for result in found:
            d = dict(result._asdict())
            del d['leaf']  # actual signature not in CSV.
            w.writerow(d)

    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([r.leaf for r in found], args.save_matches)

    if args.output_unassigned:
        if not found:
            notify('nothing found - entire query signature unassigned.')
        elif not query.minhash.get_mins():
            notify('no unassigned hashes! not saving.')
        else:
            outname = args.output_unassigned.name
            notify('saving unassigned hashes to "{}"', outname)

            e = sourmash_lib.MinHash(ksize=query.minhash.ksize,
                                     n=0,
                                     max_hash=new_max_hash)
            e.add_many(next_query.minhash.get_mins())
            sig.save_signatures([sig.SourmashSignature(e)],
                                args.output_unassigned)