Beispiel #1
0
    def make_estimators():
        """Build the estimators for every requested k-mer size.

        Uses ``args`` and ``ksizes`` from the enclosing scope.  For each k
        in ``ksizes`` a protein estimator is appended when ``args.protein``
        is set and a DNA estimator when ``args.dna`` is set, so a single k
        can contribute up to two estimators.  Only the DNA estimator is
        given ``with_cardinality``.

        Returns the estimators as a list, protein before DNA for each k.
        """
        seed = args.seed
        # max_hash stays 0 unless args.scaled is set
        max_hash = 2**64 / float(args.scaled) if args.scaled else 0

        estimators = []
        for ksize in ksizes:
            if args.protein:
                estimators.append(
                    sourmash_lib.Estimators(
                        ksize=ksize,
                        n=args.num_hashes,
                        is_protein=True,
                        track_abundance=args.track_abundance,
                        max_hash=max_hash,
                        seed=seed))
            if args.dna:
                estimators.append(
                    sourmash_lib.Estimators(
                        ksize=ksize,
                        n=args.num_hashes,
                        is_protein=False,
                        with_cardinality=args.with_cardinality,
                        track_abundance=args.track_abundance,
                        max_hash=max_hash,
                        seed=seed))
        return estimators
Beispiel #2
0
 def make_estimators():
     """Build the estimators for every requested k-mer size.

     Uses ``args`` and ``ksizes`` from the enclosing scope.  For each k a
     protein estimator is appended when ``args.protein`` is set and a DNA
     estimator when ``args.dna`` is set; only the DNA estimator is given
     ``with_cardinality``.  Returns the estimators as a list.
     """
     estimators = []
     for ksize in ksizes:
         if args.protein:
             estimators.append(
                 sourmash_lib.Estimators(
                     ksize=ksize,
                     n=args.num_hashes,
                     protein=True,
                     track_abundance=args.track_abundance))
         if args.dna:
             estimators.append(
                 sourmash_lib.Estimators(
                     ksize=ksize,
                     n=args.num_hashes,
                     protein=False,
                     with_cardinality=args.with_cardinality,
                     track_abundance=args.track_abundance))
     return estimators
Beispiel #3
0
def test_name_3(track_abundance):
    """name() returns the explicit name when a filename is also given."""
    estimator = sourmash_lib.Estimators(n=1,
                                        ksize=20,
                                        track_abundance=track_abundance)
    signature = SourmashSignature('*****@*****.**', estimator,
                                  name='foo', filename='foo.txt')
    assert signature.name() == 'foo'
Beispiel #4
0
def test_save_load_multisig():
    """Save two signatures together and check that both are loaded back."""
    e1 = sourmash_lib.Estimators(n=1, ksize=20)
    sig1 = SourmashSignature('*****@*****.**', e1)

    e2 = sourmash_lib.Estimators(n=1, ksize=20)
    sig2 = SourmashSignature('*****@*****.**', e2)

    x = save_signatures([sig1, sig2])
    # Materialize the loaded signatures: the result is measured with len()
    # and searched twice below, which a one-shot iterator would not support.
    y = list(load_signatures(x))

    print(x)

    assert len(y) == 2
    assert sig1 in y                      # order not guaranteed, note.
    assert sig2 in y
    assert sig1 != sig2
Beispiel #5
0
def to_sourmashsignature(obj,
                         is_protein=False,
                         email=None,
                         name='',
                         filename=''):
    """Convert a MinSketch into a SourmashSignature.

    A new estimator is created from the sketch's ``maxsize``, ``nsize`` and
    ``seed`` (cardinality and abundance tracking off, ``max_hash`` 0), every
    hash in ``obj._heapset`` is added to it, and the estimator is wrapped in
    a signature carrying ``email``, ``name`` and ``filename``.

    Raises ValueError when ``obj`` is not a MinSketch or when its hash
    function is not ``mash_hashfun``.
    """
    if not isinstance(obj, MinSketch):
        raise ValueError("The obj must be a MinSketch.")

    if obj._hashfun is not mash_hashfun:
        message = "The only accepted hash function is %s." % str(mash_hashfun)
        raise ValueError(message)

    estimator = sourmash_lib.Estimators(n=obj.maxsize,
                                        ksize=obj.nsize,
                                        is_protein=is_protein,
                                        with_cardinality=False,
                                        track_abundance=False,
                                        # open question left by the original
                                        # author: is 0 the right max_hash?
                                        max_hash=0,
                                        seed=obj.seed)
    for hashval in obj._heapset:
        estimator.mh.add_hash(hashval)

    signature = sourmash_lib.signature.SourmashSignature(
        email, estimator, name, filename)
    return signature
Beispiel #6
0
def test_roundtrip(track_abundance):
    """A saved and reloaded signature has similarity 1.0 with the original."""
    estimator = sourmash_lib.Estimators(n=1,
                                        ksize=20,
                                        track_abundance=track_abundance)
    estimator.add("AT" * 10)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    # not used further; the attribute lookup itself is kept
    reloaded_estimator = reloaded.estimator

    # similarity is checked in both directions
    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
Beispiel #7
0
def test_roundtrip_empty_email():
    """A signature with an empty email survives a save/load round trip."""
    e = sourmash_lib.Estimators(n=1, ksize=20)
    e.add("AT" * 10)
    sig = SourmashSignature('', e)
    s = save_signatures([sig])
    # Materialize before indexing, as the other round-trip tests do; a
    # one-shot iterator cannot be subscripted.
    siglist = list(load_signatures(s))
    sig2 = siglist[0]
    e2 = sig2.estimator

    assert sig.similarity(sig2) == 1.0
    assert sig2.similarity(sig) == 1.0
Beispiel #8
0
def test_roundtrip_empty(track_abundance):
    """Edge case: round-trip a signature whose estimator holds no hashes."""
    estimator = sourmash_lib.Estimators(n=1,
                                        ksize=20,
                                        track_abundance=track_abundance)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    # not used further; the attribute lookup itself is kept
    reloaded_estimator = reloaded.estimator

    # two empty sketches are expected to report zero similarity
    assert original.similarity(reloaded) == 0
    assert reloaded.similarity(original) == 0
Beispiel #9
0
def minhash(sequence_gen, ksize, n, **kwargs):
    '''
    Lazily build one estimator per (name, sequence) pair in sequence_gen.

    ksize and n, plus any extra keyword arguments, are handed to
    sourmash_lib.Estimators; the original notes list these extras
    (with their defaults):
    - with_cardinality=False
    - track_abundance=False

    Each sequence is added with force=True.  The generator yields
    (name, estimator) tuples, so the identifier that came in with each
    sequence is carried along with its sketch.
    '''
    for record_id, sequence in sequence_gen:
        sketch = sourmash_lib.Estimators(n=n, ksize=ksize, **kwargs)
        sketch.add_sequence(sequence, force=True)
        yield record_id, sketch
Beispiel #10
0
def test_roundtrip_seed(track_abundance):
    """An explicit seed (10) is preserved by a save/load round trip."""
    estimator = sourmash_lib.Estimators(n=1,
                                        ksize=20,
                                        track_abundance=track_abundance,
                                        seed=10)
    estimator.mh.add_hash(5)
    original = SourmashSignature('*****@*****.**', estimator)

    serialized = save_signatures([original])
    reloaded = list(load_signatures(serialized))[0]
    reloaded_estimator = reloaded.estimator

    assert estimator.seed == reloaded_estimator.seed

    # similarity is checked in both directions
    assert original.similarity(reloaded) == 1.0
    assert reloaded.similarity(original) == 1.0
Beispiel #11
0
def _load_one_signature(sketch, email, name, filename, ignore_md5sum=False):
    """Helper function to unpack and check one signature block only."""
    ksize = sketch['ksize']
    mins = list(map(int, sketch['mins']))
    n = int(sketch['num'])
    molecule = sketch.get('molecule', 'dna')
    seed = sketch.get('seed', sourmash_lib.DEFAULT_SEED)
    if molecule == 'protein':
        is_protein = True
    elif molecule == 'dna':
        is_protein = False
    else:
        raise Exception("unknown molecule type: {}".format(molecule))

    max_hash = int(sketch.get('max_hash', 0))
    seed = int(sketch.get('seed', sourmash_lib.DEFAULT_SEED))

    track_abundance = 'abundances' in sketch
    e = sourmash_lib.Estimators(ksize=ksize,
                                n=n,
                                is_protein=is_protein,
                                track_abundance=track_abundance,
                                max_hash=max_hash,
                                seed=seed)
    if track_abundance:
        abundances = list(map(int, sketch['abundances']))
        e.mh.set_abundances(dict(zip(mins, abundances)))
    else:
        for m in mins:
            e.mh.add_hash(m)

    if 'cardinality' in sketch:
        e.hll = FakeHLL(int(sketch['cardinality']))

    sig = SourmashSignature(email, e)

    if not ignore_md5sum:
        md5sum = sketch['md5sum']
        if md5sum != sig.md5sum():
            raise Exception('error loading - md5 of estimator does not match')

    if name:
        sig.d['name'] = name
    if filename:
        sig.d['filename'] = filename

    return sig
Beispiel #12
0
    def import_csv(self, args):
        """Import a CSV file full of signatures/hashes.

        ``args`` is a list of command-line arguments: the CSV file name,
        plus optional ``-o/--output`` (default: stdout) and ``--email``.
        Every non-blank row must have five columns: hash function, hash
        seed, ksize, name, and a space-separated list of hashes.  One
        signature is built per row and all of them are saved to the output.

        Raises ValueError for rows whose hash function is not 'murmur64'
        or whose seed is not 42.
        """
        p = argparse.ArgumentParser()
        p.add_argument('mash_csvfile')
        p.add_argument('-o',
                       '--output',
                       type=argparse.FileType('wt'),
                       default=sys.stdout)
        p.add_argument('--email', type=str, default='')
        args = p.parse_args(args)

        with open(args.mash_csvfile, 'r') as fp:
            reader = csv.reader(fp)
            siglist = []
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to import
                    continue

                hashfn = row[0]
                hashseed = int(row[1])

                # only support a limited import type, for now ;)
                # Checked with explicit raises rather than assert so the
                # validation is still performed under `python -O`.
                if hashfn != 'murmur64':
                    raise ValueError(
                        "unsupported hash function {!r}; only 'murmur64' "
                        "can be imported".format(hashfn))
                if hashseed != 42:
                    raise ValueError(
                        "unsupported hash seed {}; only seed 42 can be "
                        "imported".format(hashseed))

                _, _, ksize, name, hashes = row
                ksize = int(ksize)

                hashes = hashes.strip()
                hashes = list(map(int, hashes.split(' ')))

                # the sketch is sized to hold exactly the imported hashes
                e = sourmash_lib.Estimators(len(hashes), ksize)
                for h in hashes:
                    e.mh.add_hash(h)
                s = sig.SourmashSignature(args.email, e, filename=name)
                siglist.append(s)
                print('loaded signature:',
                      name,
                      s.md5sum()[:8],
                      file=sys.stderr)

            print('saving %d signatures to YAML' % (len(siglist), ),
                  file=sys.stderr)
            sig.save_signatures(siglist, args.output)
Beispiel #13
0
def _load_one_signature(sketch, email, name, filename, ignore_md5sum=False):
    """Helper function to unpack and check one signature block only."""
    ksize = sketch['ksize']
    prime = sketch['prime']
    if sketch.get('type') == 'composition':
        prefixsize = sketch['prefixsize']
        n = int(sketch['subsketches']['num'])
        e = sourmash_lib.CompositionSketch(ksize=ksize,
                                           max_prime=prime,
                                           n=n,
                                           prefixsize=prefixsize)

        for item in sketch['subsketches']:
            n = item['num']
            mins = item['mins']
            n = int(n)
            for m in map(int, mins):
                e.sketches[n].mh.add_hash(m)

        sig = SourmashCompositeSignature(email, e)
    else:
        mins = list(map(int, sketch['mins']))
        n = len(mins)
        e = sourmash_lib.Estimators(ksize=ksize, max_prime=prime, n=n)
        for m in mins:
            e.mh.add_hash(m)

        sig = SourmashSignature(email, e)

    if not ignore_md5sum:
        md5sum = sketch['md5sum']
        if md5sum != sig.md5sum():
            raise Exception('error loading - md5 of estimator does not match')

    if name:
        sig.d['name'] = name
    if filename:
        sig.d['filename'] = filename

    return sig
Beispiel #14
0
def import_csv(args):
    """Import a CSV file full of signatures/hashes.

    ``args`` is a list of command-line arguments: the CSV file name, plus
    optional ``-o/--output`` (default: stdout) and ``--email``.  Every
    non-blank row must have five columns: hash function, hash seed, ksize,
    name, and a space-separated list of hashes.  One signature is built per
    row and all of them are saved to the output.

    Raises ValueError for rows whose hash function is not 'murmur64' or
    whose seed is not 42.
    """
    p = argparse.ArgumentParser()
    p.add_argument('mash_csvfile')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='(default: stdout)')
    p.add_argument('--email',
                   type=str,
                   default='',
                   help='(default: %(default)s)')
    args = p.parse_args(args)

    with open(args.mash_csvfile, 'r') as fp:
        reader = csv.reader(fp)
        siglist = []
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to import
                continue

            hashfn = row[0]
            hashseed = int(row[1])

            # only support a limited import type, for now ;)
            # Checked with explicit raises rather than assert so the
            # validation is still performed under `python -O`.
            if hashfn != 'murmur64':
                raise ValueError(
                    "unsupported hash function {!r}; only 'murmur64' "
                    "can be imported".format(hashfn))
            if hashseed != 42:
                raise ValueError(
                    "unsupported hash seed {}; only seed 42 can be "
                    "imported".format(hashseed))

            _, _, ksize, name, hashes = row
            ksize = int(ksize)

            hashes = hashes.strip()
            hashes = list(map(int, hashes.split(' ')))

            # the sketch is sized to hold exactly the imported hashes
            e = sourmash_lib.Estimators(len(hashes), ksize)
            e.add_many(hashes)
            s = sig.SourmashSignature(args.email, e, filename=name)
            siglist.append(s)
            notify('loaded signature: {} {}', name, s.md5sum()[:8])

        notify('saving {} signatures to JSON', len(siglist))
        sig.save_signatures(siglist, args.output)
Beispiel #15
0
def _load_one_signature(sketch, email, name, filename, ignore_md5sum=False):
    """Helper function to unpack and check one signature block only."""
    ksize = sketch['ksize']
    mins = list(map(int, sketch['mins']))
    n = int(sketch['num'])
    e = sourmash_lib.Estimators(ksize=ksize, n=n)
    for m in mins:
        e.mh.add_hash(m)

    sig = SourmashSignature(email, e)

    if not ignore_md5sum:
        md5sum = sketch['md5sum']
        if md5sum != sig.md5sum():
            raise Exception('error loading - md5 of estimator does not match')

    if name:
        sig.d['name'] = name
    if filename:
        sig.d['filename'] = filename

    return sig
Beispiel #16
0
def handler(event, context):
    """Sketch a remote FASTA file and return the serialized signature.

    The incoming event is only logged; the sketch parameters and the URL
    are currently hard-coded below.  The file is streamed over HTTP, every
    record is added to one estimator, and the resulting signature is
    serialized into an in-memory buffer whose text is returned.
    ``context`` is not used.
    """
    print("Received Event: " + json.dumps(event, indent=2))

    # TODO: parse args from event
    # alternative input: 'http://athyra.oxli.org/~luizirber/missing.fa'
    args = dict(
        protein=True,
        n=500,
        k=31,
        url='http://athyra.oxli.org/~luizirber/reads_lt_90.fasta',
        email='*****@*****.**',
    )

    print("Creating estimators")
    estimator = sourmash_lib.Estimators(ksize=args['k'],
                                        n=args['n'],
                                        protein=args['protein'])

    print("Opening file")
    with closing(requests.get(args['url'], stream=True)) as response:
        records = screed.fasta.fasta_iter(response.raw)
        for count, record in enumerate(records):
            # progress line every 500 records
            if count % 500 == 0:
                print("%d reads" % count)
            if args['protein']:
                estimator.mh.add_protein(record.sequence)
            else:
                estimator.add_sequence(record.sequence)

    print("Outputing signature")
    sig = signature.SourmashSignature(args['email'],
                                      estimator,
                                      filename=args['url'])

    buffer = StringIO("")
    signature.save_signatures([sig], buffer)
    return buffer.getvalue()
Beispiel #17
0
def test_name_2():
    """name() returns the filename when no explicit name is given."""
    estimator = sourmash_lib.Estimators(n=1, ksize=20)
    signature = SourmashSignature('*****@*****.**',
                                  estimator,
                                  filename='foo.txt')
    assert signature.name() == 'foo.txt'
Beispiel #18
0
def _json_next_signature(iterable,
                         email=None,
                         name=None,
                         filename=None,
                         ignore_md5sum=False,
                         prefix_item='abundances.item',
                         ijson=ijson):
    """Unpack and verify a single signature block from a JSON event stream.

    - iterable: an iterable of (prefix, event, value) triples such as the
      one returned by ijson.parse()
    - email: passed to the SourmashSignature constructor
    - name: stored on the signature when truthy
    - filename: stored on the signature when truthy
    - ignore_md5sum: skip the md5sum consistency check when true
    - prefix_item: required when parsing nested JSON structures; handed to
      _json_next_atomic_array for both the 'mins' and 'abundances' arrays
    - ijson: ijson backend to use.

    Returns the rebuilt SourmashSignature.  Raises Exception for an unknown
    molecule type or an md5sum mismatch.
    """
    from .signature import FakeHLL, SourmashSignature

    # Collect the key/value pairs of the map into a plain dict first.
    fields = {}
    prefix, event, value = next(iterable)
    if event == 'start_map':
        prefix, event, value = next(iterable)
    while event != 'end_map':
        key = value
        if key in ('mins', 'abundances'):
            # array-valued entries are read by the dedicated helper
            value = _json_next_atomic_array(iterable,
                                            prefix_item=prefix_item,
                                            ijson=ijson)
        else:
            prefix, event, value = next(iterable)
        fields[key] = value
        prefix, event, value = next(iterable)

    ksize = fields['ksize']
    mins = fields['mins']
    n = fields['num']

    molecule = fields.get('molecule', 'dna')
    if molecule == 'protein':
        is_protein = True
    elif molecule == 'dna':
        is_protein = False
    else:
        raise Exception("unknown molecule type: {}".format(molecule))

    # the presence of an 'abundances' entry switches abundance tracking on
    track_abundance = 'abundances' in fields

    estimator = sourmash_lib.Estimators(ksize=ksize,
                                        n=n,
                                        protein=is_protein,
                                        track_abundance=track_abundance)

    if track_abundance:
        # mins and abundances are paired up positionally
        abundances = list(map(int, fields['abundances']))
        estimator.mh.set_abundances(dict(zip(mins, abundances)))
    else:
        for hashval in mins:
            estimator.mh.add_hash(hashval)

    if 'cardinality' in fields:
        estimator.hll = FakeHLL(fields['cardinality'])

    sig = SourmashSignature(email, estimator)

    if not ignore_md5sum and fields['md5sum'] != sig.md5sum():
        raise Exception('error loading - md5 of estimator does not match')

    if name:
        sig.d['name'] = name
    if filename:
        sig.d['filename'] = filename

    return sig
Beispiel #19
0
def test_name_4(track_abundance):
    """With no name or filename, name() is the first 8 md5sum characters."""
    estimator = sourmash_lib.Estimators(n=1,
                                        ksize=20,
                                        track_abundance=track_abundance)
    signature = SourmashSignature('*****@*****.**', estimator)
    expected = signature.md5sum()[:8]
    assert signature.name() == expected
Beispiel #20
0
    def watch(self, args):
        """Build a signature from raw FASTA/FASTQ coming in on stdin, search.

        ``args`` is a list of command-line arguments.  Records are read from
        /dev/stdin and added to a single sketch; each time another
        WATERMARK_SIZE records have been read the SBT is searched, and
        reading stops as soon as a search returns a match.  The best match
        (if any) is reported and, when --output is given, the streamed
        signature is saved.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.05, type=float)
        parser.add_argument('--input-is-protein', action='store_true')
        sourmash_args.add_moltype_args(parser, default_dna=True)
        parser.add_argument(
            '-n', '--num-hashes', type=int, default=DEFAULT_N,
            help='number of hashes to use in each sketch (default: %(default)i)')
        parser.add_argument('--name', type=str, default='stdin')
        args = parser.parse_args(args)

        if args.input_is_protein and args.dna:
            print('WARNING: input is protein, turning off DNA hash computing.',
                  file=sys.stderr)
            args.dna = False
            args.protein = True

        # NOTE(review): this only emits a message; execution continues and
        # the DNA setting wins below.
        if args.dna and args.protein:
            notify('ERROR: cannot use "watch" with both DNA and protein.')

        is_protein = not args.dna
        moltype = 'protein' if is_protein else 'DNA'

        E = sourmash_lib.Estimators(ksize=args.ksize,
                                    n=args.num_hashes,
                                    protein=is_protein)
        streamsig = sig.SourmashSignature('', E,
                                          filename='stdin', name=args.name)

        notify('Computing signature for k={}, {} from stdin',
               args.ksize, moltype)

        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

        def do_search():
            # (similarity, signature) for every leaf the tree search returns
            search_fn = SearchMinHashesFindBest().search
            return [(streamsig.similarity(leaf.data), leaf.data)
                    for leaf in tree.find(search_fn, streamsig,
                                          args.threshold)]

        notify('reading sequences from stdin')
        screed_iter = screed.open('/dev/stdin')
        watermark = WATERMARK_SIZE

        # iterate over input records
        count = 0
        for count, record in enumerate(screed_iter):
            # at each watermark, print status & search; stop on a match
            if count >= watermark:
                notify('... read {} sequences', count)
                watermark += WATERMARK_SIZE

                if do_search():
                    break

            if args.input_is_protein:
                E.mh.add_protein(record.sequence)
            else:
                E.add_sequence(record.sequence, False)

        results = do_search()
        if results:
            results.sort(key=lambda pair: -pair[0])   # take best
            similarity, found_sig = results[0]
            notify('FOUND: {}, at {:.3f}', found_sig.name(),
                   similarity)
        else:
            notify('... read {} sequences, no matches found.', count)

        if args.output:
            sig.save_signatures([streamsig], args.output)
Beispiel #21
0
def sbt_gather(args):
    """Iteratively search an SBT for the signatures best matching a query.

    ``args`` is a list of command-line arguments (SBT name, query signature
    file, and options).  The query must have a non-zero max_hash.  In each
    round the tree is searched, the most similar leaf is taken, the hashes
    it shares with the query are removed from the query, and the search is
    repeated with the remaining hashes.  The loop stops when the tree
    returns no results or the best match shares fewer than 5 hashes.

    The matches are reported via notify() and optionally written to
    --output (text), --csv and --save-matches (signatures).  The process
    exits through sys.exit() on error conditions and when nothing is found.
    """
    # NOTE(review): GraphFactory and search_minhashes are imported but not
    # used in this function.
    from sourmash_lib.sbt import SBT, GraphFactory
    from sourmash_lib.sbtmh import search_minhashes, SigLeaf
    from sourmash_lib.sbtmh import SearchMinHashesFindBestIgnoreMaxHash

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('query', help='query signature')
    parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
    # NOTE(review): --threshold is parsed but not read below; the search
    # uses a fixed 0.00001 instead.
    parser.add_argument('--threshold', default=0.05, type=float)
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
    parser.add_argument('--csv', type=argparse.FileType('wt'))
    parser.add_argument('--save-matches', type=argparse.FileType('wt'))

    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    moltype = sourmash_args.calculate_moltype(args)

    tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
    query = sourmash_args.load_query_signature(args.query,
                                               select_ksize=args.ksize,
                                               select_moltype=moltype)
    query_moltype = sourmash_args.get_moltype(query)
    query_ksize = query.estimator.ksize
    notify('loaded query: {}... (k={}, {})',
           query.name()[:30], query_ksize, query_moltype)

    # a query without max_hash cannot be used here
    if query.estimator.max_hash == 0:
        error('query signature needs to be created with --scaled')
        error('or using --with-cardinality.')
        sys.exit(-1)

    notify('query signature has max_hash: {}', query.estimator.max_hash)
    # keep the untouched query; its hash count is the f_query denominator
    orig_query = query

    # ratio of the full 64-bit hash space to the query's max_hash
    R_metagenome = 2**64 / float(orig_query.estimator.max_hash)

    # rebuild the query as a fresh estimator sized to its current hashes
    new_mins = query.estimator.get_hashes()
    e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
    e.update(query.estimator)
    query = sig.SourmashSignature('', e)

    # NOTE(review): sum_found is never updated below, so the
    # 'total fraction' message always reports 0.000.
    sum_found = 0.
    found = []
    while 1:
        search_fn = SearchMinHashesFindBestIgnoreMaxHash().search

        results = []
        # use super low threshold for this part of the search
        for leaf in tree.find(search_fn, query, 0.00001):
            results.append((query.estimator.similarity_ignore_maxhash(
                leaf.data.estimator), leaf.data))

        if not len(results):  # no matches at all!
            break

        # take the best result
        results.sort(key=lambda x: -x[0])  # reverse sort on similarity
        best_sim, best_ss = results[0]

        # subtract found hashes from search hashes, construct new search
        new_mins = set(query.estimator.get_hashes())
        found_mins = best_ss.estimator.get_hashes()

        # work out the match's ratio, from max_hash if it has one, else
        # from its cardinality estimate and largest hash
        if best_ss.estimator.max_hash:
            R_genome = 2**64 / float(best_ss.estimator.max_hash)
        elif best_ss.estimator.hll:
            genome_size = best_ss.estimator.hll.estimate_cardinality()
            genome_max_hash = max(found_mins)
            R_genome = float(genome_size) / float(genome_max_hash)
        else:
            error('Best hash match in sbt_gather has no cardinality')
            error('Please prepare database of sequences with --scaled')
            error('...or with --with-cardinality')
            sys.exit(-1)

        # compare both hash sets below a common cutoff, derived from the
        # larger of the two ratios
        R_comparison = max(R_metagenome, R_genome)
        new_max_hash = 2**64 / float(R_comparison)
        new_mins = set([i for i in new_mins if i < new_max_hash])
        found_mins = set([i for i in found_mins if i < new_max_hash])

        # intersection:
        intersect_mins = new_mins.intersection(found_mins)

        if len(intersect_mins) < 5:  # hard cutoff for now
            notify('found only {} hashes in common.', len(intersect_mins))
            notify('this is below a sane threshold => exiting.')
            break

        # first denominator - genome size
        genome_n_mins = len(found_mins)
        f_genome = len(intersect_mins) / float(genome_n_mins)

        # second denominator - metagenome size
        query_n_mins = len(orig_query.estimator.get_hashes())
        f_query = len(intersect_mins) / float(query_n_mins)

        # print interim & save
        notify('found: {:.2f} {:.2f} {}', f_genome, f_query, best_ss.name())
        found.append((f_genome, best_ss))

        # the next round searches with only the hashes not yet accounted for
        new_mins -= set(found_mins)
        e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
        e.add_many(new_mins)
        query = sig.SourmashSignature('', e)

    notify('found {}, total fraction {:.3f}', len(found), sum_found)
    notify('')

    if not found:
        sys.exit(0)

    # largest fraction first
    found.sort(key=lambda x: x[0])
    found.reverse()

    notify('Composition:')
    for (frac, leaf_sketch) in found:
        notify('{:.2f} {}', frac, leaf_sketch.name())

    if args.output:
        print('Composition:', file=args.output)
        for (frac, leaf_sketch) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                  file=args.output)

    if args.csv:
        fieldnames = ['fraction', 'name', 'sketch_kmers']
        w = csv.DictWriter(args.csv, fieldnames=fieldnames)

        w.writeheader()
        for (frac, leaf_sketch) in found:
            # NOTE(review): hll is used unconditionally here, although the
            # loop above accepts matches on max_hash alone -- confirm hll
            # is always set for such matches.
            cardinality = leaf_sketch.estimator.hll.estimate_cardinality()
            w.writerow(
                dict(fraction=frac,
                     name=leaf_sketch.name(),
                     sketch_kmers=cardinality))
    if args.save_matches:
        outname = args.save_matches.name
        notify('saving all matches to "{}"', outname)
        sig.save_signatures([ss for (f, ss) in found], args.save_matches)
Beispiel #22
0
    def compute(self, args):
        """Compute the signature for one or more files.

        ``args`` is a list of command-line arguments.  For every input file
        one estimator per requested k-mer size is filled from the file's
        records, and the resulting signatures are saved either to --output
        or to '<basename>.sig' in the current directory.  Without --output
        and --force, a file whose .sig already exists is skipped.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('filenames', nargs='+')
        parser.add_argument('--protein', action='store_true')
        parser.add_argument('--input-is-protein', action='store_true')
        parser.add_argument('-k', '--ksizes',
                            default=str(DEFAULT_K),
                            help='comma-separated list of k-mer sizes')
        parser.add_argument('-n', '--num-hashes',
                            type=int,
                            default=DEFAULT_N,
                            help='number of hashes to use in each sketch')
        parser.add_argument('-f', '--force', action='store_true')
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('--email', type=str, default='')
        args = parser.parse_args(args)

        print('computing signatures for files:', args.filenames,
              file=sys.stderr)

        # get list of k-mer sizes for which to compute sketches; a value
        # without commas splits into a single-element list
        ksizes = [int(k) for k in args.ksizes.split(',')]

        print('Computing signature for ksizes: %s' % str(ksizes),
              file=sys.stderr)

        # for each file, load & compute sketch.
        for filename in args.filenames:
            sigfile = os.path.basename(filename) + '.sig'
            already_done = (not args.output and os.path.exists(sigfile)
                            and not args.force)
            if already_done:
                print('skipping', filename, '- already done', file=sys.stderr)
                continue

            # one estimator for each ksize
            Elist = [
                sourmash_lib.Estimators(ksize=k,
                                        n=args.num_hashes,
                                        protein=args.protein)
                for k in ksizes
            ]

            # consume & calculate signatures
            print('... reading sequences from', filename, file=sys.stderr)
            for count, record in enumerate(screed.open(filename)):
                # progress line every 10000 records (not for the first one)
                if count and count % 10000 == 0:
                    print('...', filename, count, file=sys.stderr)

                sequence = record.sequence
                for E in Elist:
                    if args.input_is_protein:
                        E.mh.add_protein(sequence)
                    else:
                        E.add_sequence(sequence, args.force)

            # convert into a signature
            siglist = []
            for E in Elist:
                siglist.append(
                    sig.SourmashSignature(args.email, E, filename=filename))

            # save!
            if args.output:
                sig.save_signatures(siglist, args.output)
            else:
                with open(sigfile, 'w') as fp:
                    sig.save_signatures(siglist, fp)
Beispiel #23
0
def kmers(seq, k):
    """Yield each length-k slice of seq, sliding one position at a time."""
    window_count = len(seq) - k + 1
    for begin in range(window_count):
        end = begin + k
        yield seq[begin:end]


###

# K is passed as the estimator's ksize; a third of it is used as the k-mer
# length for the translated sequences below.
K = 21

import sys, screed
import mmh3
import sourmash_lib
print('imported sourmash:', sourmash_lib, file=sys.stderr)
from sourmash_lib import MinHash
import sourmash_lib.signature

# Only the first record of the input file (path given as argv[1]) is used.
record = next(iter(screed.open(sys.argv[1])))
print('loaded', record.name, file=sys.stderr)

E = sourmash_lib.Estimators(ksize=K, n=500, protein=True)
prot_ksize = int(K / 3)
mh = E.mh

# NOTE(review): translate() is not defined in this excerpt; presumably it
# yields the translations of the sequence -- verify where it comes from.
for trans in translate(record.sequence):
    for kmer in kmers(trans, prot_ksize):
        # Each k-mer is hashed by hand (mmh3.hash128, seed 42) and the raw
        # value is inserted into the sketch.
        # NOTE(review): the name `hash` shadows the builtin at module level.
        hash = mmh3.hash128(kmer, seed=42)
        mh.add_hash(hash)

# Wrap the estimator in a signature named after the record and print the
# serialized form to stdout.
s = sourmash_lib.signature.SourmashSignature('', E, name=record.name)
print(sourmash_lib.signature.save_signatures([s]))
Beispiel #24
0
               p_index[i] + "_sequences.txt", 'r')  # /data/scratch/kjacks21/
    print("Working on bag " + str(i) + "...")
    for j, seq in enumerate(seqs):
        if ((j % 1000) == 0):
            now = dt.datetime.now()
            print("Currently on bag " + str(i) + ", instance " + str(j) +
                  " at time " + str(now - start))
        # set max_sim to 1 since the sequence is in the same bag
        if (j >= patients[i, 1] and j <= patients[i, 2]):
            max_sim = 1
            #print("max_sim = 1")
        else:
            # write seq file (test1.txt) in fasta format
            #with open("test1.txt", 'w') as f:
            #    f.write(">\n"+str(seq))
            E1 = sourmash_lib.Estimators(n=50, ksize=10)
            E1.add(seq.strip())

            max_sim = 0
            for b_seq in bag:
                prev_max_sim = max_sim

                # write b_seq fasta file (test2.txt) for mash
                #with open("test2.txt", 'w') as f:
                #    f.write(">\n"+str(b_seq))

                # calculate mash dist between
                #proc = sp.Popen(["/home/kyle/Documents/cs584/mash/mash-Linux64-v1.1.1/mash","dist","-k","15","-r","-p","3","test1.txt","test2.txt"], stdout=sp.PIPE)
                #output = str((proc.stdout.readline()), 'UTF8')
                #mash_dist = float((output.split('\t'))[2])
                E2 = sourmash_lib.Estimators(n=50, ksize=10)
Beispiel #25
0
def test_md5(track_abundance):
    """The md5sum of a signature holding the single hash 5 is pinned."""
    estimator = sourmash_lib.Estimators(n=1,
                                        ksize=20,
                                        track_abundance=track_abundance)
    estimator.mh.add_hash(5)
    signature = SourmashSignature('*****@*****.**', estimator)
    # printed for inspection when the assertion below fails
    print(signature._save())
    expected = 'eae27d77ca20db309e056e3d2dcd7d69'
    assert signature.md5sum() == expected, signature.md5sum()
Beispiel #26
0
def test_md5():
    """Check the md5sum of a signature whose sketch holds only the hash 5."""
    expected_md5 = 'e4da3b7fbbce2345d7772b0674a318d5'

    estimator = sourmash_lib.Estimators(n=1, ksize=20)
    estimator.mh.add_hash(5)
    signature = SourmashSignature('*****@*****.**', estimator)

    # Print the serialized signature before checking its digest.
    print(signature.save())

    actual_md5 = signature.md5sum()
    assert actual_md5 == expected_md5, actual_md5
Beispiel #27
0
    def sbt_gather(self, args):
        """Greedily break a query signature down into matches from an SBT.

        Each round searches the tree for the best match to the hashes that
        are still unexplained, records that match, removes its hashes from
        the query and searches again.  The loop ends when the tree returns
        no match, when no hashes remain, or when the very first match is
        below ``--threshold``.

        Args:
            args: list of command-line arguments: ``sbt_name``, ``query``
                and the optional ``-k/--ksize``, ``--threshold``,
                ``-o/--output``, ``--csv`` plus whatever
                ``sourmash_args.add_moltype_args`` registers.

        Raises:
            Exception: if both ``--dna`` and ``--protein`` are given.
            SystemExit: with status -1 unless exactly one signature in the
                query file matches the ksize / molecule type; with status 0
                if nothing was found.
        """
        from sourmash_lib.sbt import SBT
        from sourmash_lib.sbtmh import SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('query')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.05, type=float)
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('--csv', type=argparse.FileType('wt'))

        sourmash_args.add_moltype_args(parser)

        args = parser.parse_args(args)

        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False

        moltype = None
        if args.protein:
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'

        # Load the tree once; it is not modified by the search below.
        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
        sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                                 select_moltype=moltype)
        sl = list(sl)
        if len(sl) != 1:
            # Both halves of the error message belong on stderr.
            print('When loading query from "{}",'.format(args.query),
                  file=sys.stderr)
            print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
                  file=sys.stderr)
            sys.exit(-1)

        query = sl[0]

        query_moltype = 'UNKNOWN'
        if query.estimator.is_molecule_type('dna'):
            query_moltype = 'DNA'
        elif query.estimator.is_molecule_type('protein'):
            query_moltype = 'protein'
        query_ksize = query.estimator.ksize
        print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                      query_ksize,
                                                      query_moltype))

        # `query` shrinks every round; similarities that are reported are
        # always measured against the untouched original.
        orig_query = query

        sum_found = 0.
        found = []
        while True:
            # A new searcher is created for every round, as before.
            # NOTE(review): presumably SearchMinHashesFindBest keeps state
            # about the best match seen so far -- confirm before hoisting.
            search_fn = SearchMinHashesFindBest().search

            # use super low threshold for this part of the search
            results = [(query.similarity(leaf.data), leaf.data)
                       for leaf in tree.find(search_fn, query, 0.00001)]

            if not results:          # no matches at all!
                break

            # Take the best result; max() keeps the first of equally similar
            # candidates, exactly like the stable descending sort it replaces.
            _, best_ss = max(results, key=lambda x: x[0])
            sim = best_ss.similarity(orig_query)

            # adjust by size of leaf (kmer cardinality of original genome);
            # this needs an HLL on *both* signatures and a non-zero query
            # cardinality, otherwise the fraction is reported as 0.
            f_of_total = 0
            leaf_hll = best_ss.estimator.hll
            query_hll = orig_query.estimator.hll
            if leaf_hll and query_hll:
                query_kmers = query_hll.estimate_cardinality()
                if query_kmers:
                    leaf_kmers = leaf_hll.estimate_cardinality()
                    f_of_total = leaf_kmers / query_kmers * sim

            # The threshold is only applied to the very first match.
            if not found and sim < args.threshold:
                print('best match: {}'.format(best_ss.name()))
                print('similarity is {:.5f} of db signature;'.format(sim))
                print('this is below specified threshold => exiting.')
                break

            # subtract found hashes from search hashes, construct new search
            new_mins = set(query.estimator.mh.get_mins())
            found_mins = best_ss.estimator.mh.get_mins()

            # print interim & save (the count is taken before subtraction)
            print('found: {:.2f} {} {}'.format(f_of_total,
                                               len(new_mins),
                                               best_ss.name()))
            found.append((f_of_total, best_ss, sim))
            sum_found += f_of_total

            new_mins -= set(found_mins)
            if not new_mins:
                # Every hash is explained; do not build a zero-size
                # estimator just to search with an empty query.
                break

            e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
            for m in new_mins:
                e.mh.add_hash(m)
            query = sig.SourmashSignature('foo', e)

        print('found {}, total fraction {:.3f}'.format(len(found), sum_found))
        print('')

        if not found:
            sys.exit(0)

        # Order by fraction only.  Sorting the raw tuples would fall through
        # to comparing signature objects whenever two fractions tie (always
        # the case when no HLL is available and every fraction is 0).
        found.sort(key=lambda x: x[0], reverse=True)

        print('Composition:')
        for (frac, leaf_sketch, sim) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()))

        if args.output:
            print('Composition:', file=args.output)
            for (frac, leaf_sketch, sim) in found:
                print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                      file=args.output)

        if args.csv:
            fieldnames = ['fraction', 'name', 'similarity', 'sketch_kmers']
            w = csv.DictWriter(args.csv, fieldnames=fieldnames)

            w.writeheader()
            for (frac, leaf_sketch, sim) in found:
                # Signatures without an HLL get an empty sketch_kmers cell
                # instead of crashing on a missing attribute.
                hll = leaf_sketch.estimator.hll
                cardinality = hll.estimate_cardinality() if hll else ''
                w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                                similarity=sim,
                                sketch_kmers=cardinality))