コード例 #1
0
ファイル: utils.py プロジェクト: luizirber/zoo
def save_minhash(minhash_gen, handle=None, email=''):
    '''
    Wrap (name, estimator) pairs in sourmash signatures and save them.

    Each item of ``minhash_gen`` is unpacked into a name and an estimator
    and turned into a ``signature.SourmashSignature`` (always with the
    filename 'zoo'). All signatures are collected in a list while a
    progress bar of unknown length counts the items, and the list is then
    handed to ``signature.save_signatures`` -- together with ``handle``
    when one is given, on its own otherwise. Whatever ``save_signatures``
    returns is returned unchanged.

    Background:
      issue:
      https://github.com/dib-lab/sourmash/issues/131
      suggested lead:
      https://github.com/dib-lab/sourmash/blob/master/utils/compute-dna-mh-another-way.py
      relevant set of functions:
      https://github.com/dib-lab/sourmash/blob/master/sourmash_lib/signature.py

    Usage:
      from itertools import islice
      print(save_minhash(islice(gen, 2)))

      fp = '/some/path/to.json'
      with open(fp, 'w+') as outfile:
          save_minhash(islice(gen, 2), handle=outfile)
    '''
    bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
    signatures = []

    print('Generating signatures ...')
    # TODO: everything is held in memory; stream into the file handle instead.
    for count, (name, estimator) in enumerate(minhash_gen, start=1):
        signatures.append(
            signature.SourmashSignature(
                email, estimator, name=name, filename='zoo'))
        bar.update(count)

    print('\nSaving signatures ...')
    if handle is not None:
        return signature.save_signatures(signatures, handle)
    return signature.save_signatures(signatures)
コード例 #2
0
def test_sourmash_compare_with_abundance_1():
    """Search a signature against an identical abundance-tracking one.

    Two estimators (ksize=5, n=5, abundance tracking on) are fed the same
    sequence, saved as 'e1.sig' and 'e2.sig' in a temporary directory, and
    `sourmash search e1.sig e2.sig -k 5` is run there. The test passes when
    '1.000' appears in the script's standard output.
    """
    with utils.TempDirectory() as location:
        # create two signatures
        E1 = Estimators(ksize=5, n=5, protein=False, track_abundance=True)
        E2 = Estimators(ksize=5, n=5, protein=False, track_abundance=True)

        E1.mh.add_sequence('ATGGA')
        E2.mh.add_sequence('ATGGA')

        s1 = signature.SourmashSignature('', E1, filename='e1', name='e1')
        s2 = signature.SourmashSignature('', E2, filename='e2', name='e2')

        # Write inside `with` blocks so each file is flushed and closed
        # before the external script reads it, instead of relying on the
        # garbage collector to close an anonymous file object.
        with open(os.path.join(location, 'e1.sig'), 'w') as fp:
            signature.save_signatures([s1], fp)
        with open(os.path.join(location, 'e2.sig'), 'w') as fp:
            signature.save_signatures([s2], fp)

        status, out, err = utils.runscript(
            'sourmash', ['search', 'e1.sig', 'e2.sig', '-k', '5'],
            in_directory=location)
        assert '1.000' in out
コード例 #3
0
ファイル: __main__.py プロジェクト: camillescott/sourmash
    def import_csv(self, args):
        """Import a CSV file full of signatures/hashes.

        Command-line arguments (parsed from ``args``): the CSV file to
        read, ``-o/--output`` (text file to write, stdout by default) and
        ``--email``.

        Every CSV row must have exactly five columns: hash function, hash
        seed, k-mer size, name and a space-separated list of integer
        hashes. Only hash function 'murmur64' with seed 42 is accepted
        (checked with assertions). One signature is built per row, and all
        of them are written to the output with ``sig.save_signatures``.
        Progress messages go to stderr.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('mash_csvfile')
        parser.add_argument('-o', '--output',
                            type=argparse.FileType('wt'),
                            default=sys.stdout)
        parser.add_argument('--email', type=str, default='')
        args = parser.parse_args(args)

        with open(args.mash_csvfile, 'r') as csv_fp:
            loaded = []
            for row in csv.reader(csv_fp):
                hashfn, hashseed = row[0], int(row[1])

                # only support a limited import type, for now ;)
                assert hashfn == 'murmur64'
                assert hashseed == 42

                _, _, ksize, name, hashes = row
                hash_values = [int(tok) for tok in hashes.strip().split(' ')]

                # One estimator slot per imported hash.
                estimator = sourmash_lib.Estimators(len(hash_values),
                                                    int(ksize))
                for value in hash_values:
                    estimator.mh.add_hash(value)

                imported = sig.SourmashSignature(args.email, estimator,
                                                 filename=name)
                loaded.append(imported)
                print('loaded signature:', name, imported.md5sum()[:8],
                      file=sys.stderr)

            print('saving %d signatures to YAML' % (len(loaded), ),
                  file=sys.stderr)
            sig.save_signatures(loaded, args.output)
コード例 #4
0
ファイル: code.py プロジェクト: dib-lab/soursigs
def handler(event, context):
    """Compute a sourmash signature for a remote FASTA file.

    The incoming ``event`` is only logged; the sketch parameters and the
    URL are currently hard-coded (see the TODO). The file is streamed over
    HTTP, every record is added to a single estimator, and the resulting
    signature is serialised with ``signature.save_signatures`` into an
    in-memory buffer.

    Returns the serialised signature as a string. ``context`` is unused.
    """
    print("Received Event: " + json.dumps(event, indent=2))

    # TODO: parse args from event
    # (alternative input: 'http://athyra.oxli.org/~luizirber/missing.fa')
    args = dict(
        protein=True,
        n=500,
        k=31,
        url='http://athyra.oxli.org/~luizirber/reads_lt_90.fasta',
        email='*****@*****.**',
    )

    print("Creating estimators")
    E = sourmash_lib.Estimators(
        ksize=args['k'], n=args['n'], protein=args['protein'])

    print("Opening file")
    with closing(requests.get(args['url'], stream=True)) as response:
        records = screed.fasta.fasta_iter(response.raw)
        for index, record in enumerate(records):
            # Log progress every 500 records (including the very first).
            if index % 500 == 0:
                print("%d reads" % index)

            if not args['protein']:
                E.add_sequence(record.sequence)
            else:
                E.mh.add_protein(record.sequence)

    print("Outputing signature")
    sig = signature.SourmashSignature(args['email'], E, filename=args['url'])

    buffer = StringIO("")
    signature.save_signatures([sig], buffer)
    return buffer.getvalue()
コード例 #5
0
def commit(file, client, db, cell, ksize, n):
    '''Dump a (mongodb) cursor to a data cell.

    For each document, start a new line in the output.

    file argument: Filename prefix w/o extension.

    \b
    {"_id":"86853586-5e9...
    {"_id":"689e59b8-514...
    {"_id":"6d9bff35-aab...

    This is important bc/ it circumvents the need to hold more than one record
    in memory, both on import and export. Note also that this is the same
    output format as ...

    \b
    $ mongoexport --db foo --collection bar --out bar.json
    ... and can be reimported by
    $ mongoimport --db foo --collection bar2 bar.json

    Example:

    $ zoo commit --db zika --cell survey --n 5 surveytest
    '''
    click.echo('Dumping data cell.')
    db = MongoClient(client)[db]

    # One aggregate estimator per requested k-mer size; `ksize` arrives as
    # a comma-separated string.
    sizes = [int(i) for i in ksize.split(',')]
    estimators = {k: Estimators(ksize=k, n=n) for k in sizes}

    bar = ProgressBar(max_value=UnknownLength)
    with open(file + '.json', 'w+') as dump:
        for counter, doc in enumerate(db[cell].find(), start=1):
            # Recompute the checksum over the record without its primary
            # key (it is random) and without any previous checksum.
            _id = doc.pop('_id')
            doc.pop('md5', None)
            doc['md5'] = hash_dict(doc)
            doc['_id'] = _id
            dump.write(json.dumps(doc, indent=None, sort_keys=True) + '\n')

            # Fold this record's sequence into every collection-wide sketch.
            for estimator in estimators.values():
                estimator.add_sequence(doc['sequence'], force=True)

            bar.update(counter)

    # Wrap each estimator in a signature named after the cell and save all
    # of them next to the dump.
    signatures = {
        k: signature.SourmashSignature(
            estimator=estimator, name=cell, email='', filename='')
        for k, estimator in estimators.items()
    }

    with open(file + '.zoo', 'w+') as sigfile:
        signature_json.save_signatures_json(
            signatures.values(), fp=sigfile, indent=4, sort_keys=True)
    click.echo('\nDone.')
コード例 #6
0
ファイル: __main__.py プロジェクト: lgautier/sourmash
    def watch(self, args):
        """Build a signature from raw FASTA/FASTQ coming in on stdin, search.

        Command-line arguments (parsed from ``args``): the SBT to search,
        ``-o/--output`` (where to save the computed signature),
        ``-k/--ksize``, ``--threshold``, ``--input-is-protein``, the
        molecule-type flags added by ``sourmash_args.add_moltype_args``,
        ``-n/--num-hashes`` and ``--name``.

        Sequences are read from /dev/stdin and added to one estimator.
        Every ``WATERMARK_SIZE`` records the tree is searched with the
        signature built so far, and reading stops at the first search that
        yields any match. After reading, a final search is run and the
        best match (or the absence of one) is reported via ``notify``.
        The stream signature is saved when an output file was given.

        Exits with status -1 when both DNA and protein are requested.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.05, type=float)
        parser.add_argument('--input-is-protein', action='store_true')
        sourmash_args.add_moltype_args(parser, default_dna=True)
        parser.add_argument('-n', '--num-hashes', type=int,
                            default=DEFAULT_N,
                            help='number of hashes to use in each sketch (default: %(default)i)')
        parser.add_argument('--name', type=str, default='stdin')
        args = parser.parse_args(args)

        # Protein input cannot be hashed as DNA: switch molecule type.
        if args.input_is_protein and args.dna:
            print('WARNING: input is protein, turning off DNA hash computing.',
                  file=sys.stderr)
            args.dna = False
            args.protein = True

        if args.dna and args.protein:
            notify('ERROR: cannot use "watch" with both DNA and protein.')
            # Stop here: previously the error was only reported and the
            # command silently carried on as a DNA search.
            sys.exit(-1)

        if args.dna:
            moltype = 'DNA'
            is_protein = False
        else:
            moltype = 'protein'
            is_protein = True

        E = sourmash_lib.Estimators(ksize=args.ksize, n=args.num_hashes,
                                    protein=is_protein)
        streamsig = sig.SourmashSignature('', E, filename='stdin',
                                          name=args.name)

        notify('Computing signature for k={}, {} from stdin',
               args.ksize, moltype)

        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)

        def do_search():
            """Search the tree with the current stream signature.

            Returns a list of (similarity, signature) pairs; a fresh
            search function object is created for every call.
            """
            search_fn = SearchMinHashesFindBest().search

            results = []
            for leaf in tree.find(search_fn, streamsig, args.threshold):
                results.append((streamsig.similarity(leaf.data),
                                leaf.data))

            return results

        notify('reading sequences from stdin')
        screed_iter = screed.open('/dev/stdin')
        watermark = WATERMARK_SIZE

        # iterate over input records
        n = 0   # keeps `n` defined for the final report on empty input
        for n, record in enumerate(screed_iter):
            # at each watermark, print status & check for a match
            if n >= watermark:
                notify('... read {} sequences', n)
                watermark += WATERMARK_SIZE

                # any match is good enough to stop reading early
                if do_search():
                    break

            if args.input_is_protein:
                E.mh.add_protein(record.sequence)
            else:
                E.add_sequence(record.sequence, False)

        results = do_search()
        if not results:
            notify('... read {} sequences, no matches found.', n)
        else:
            results.sort(key=lambda x: -x[0])   # take best
            similarity, found_sig = results[0]
            notify('FOUND: {}, at {:.3f}', found_sig.name(),
                   similarity)

        if args.output:
            sig.save_signatures([streamsig], args.output)
コード例 #7
0
ファイル: __main__.py プロジェクト: lgautier/sourmash
    def sbt_gather(self, args):
        """Iteratively decompose a query signature against an SBT.

        Command-line arguments (parsed from ``args``): the SBT name, the
        query signature file, ``-k/--ksize``, ``--threshold``,
        ``-o/--output`` (text report), ``--csv`` (CSV report) and the
        molecule-type flags added by ``sourmash_args.add_moltype_args``.

        Exactly one signature matching the k-mer size and molecule type
        must be loadable from the query file; otherwise the command exits
        with status -1. The tree is then searched repeatedly: the best
        match of each round is recorded, its hashes are removed from the
        query, and the search continues with the reduced query until
        nothing matches (or the very first match is below the threshold).
        The composition is printed and optionally written to the output
        and CSV files. Exits with status 0 when nothing was found.

        Raises:
            Exception: when both --dna and --protein are specified.
        """
        from sourmash_lib.sbt import SBT, GraphFactory
        from sourmash_lib.sbtmh import search_minhashes, SigLeaf
        from sourmash_lib.sbtmh import SearchMinHashesFindBest

        parser = argparse.ArgumentParser()
        parser.add_argument('sbt_name')
        parser.add_argument('query')
        parser.add_argument('-k', '--ksize', type=int, default=DEFAULT_K)
        parser.add_argument('--threshold', default=0.05, type=float)
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('--csv', type=argparse.FileType('wt'))

        sourmash_args.add_moltype_args(parser)

        args = parser.parse_args(args)

        if args.protein:
            if args.dna is True:
                raise Exception('cannot specify both --dna and --protein!')
            args.dna = False

        moltype = None
        if args.protein:
            moltype = 'protein'
        elif args.dna:
            moltype = 'dna'

        # Load the tree once; it is reused for every search round below.
        tree = SBT.load(args.sbt_name, leaf_loader=SigLeaf.load)
        sl = sig.load_signatures(args.query, select_ksize=args.ksize,
                                 select_moltype=moltype)
        sl = list(sl)
        if len(sl) != 1:
            # Both lines of the error report go to stderr.
            print('When loading query from "{}",'.format(args.query),
                  file=sys.stderr)
            print('{} query signatures matching ksize and molecule type; need exactly one.'.format(len(sl)),
                  file=sys.stderr)
            sys.exit(-1)

        query = sl[0]

        query_moltype = 'UNKNOWN'
        if query.estimator.is_molecule_type('dna'):
            query_moltype = 'DNA'
        elif query.estimator.is_molecule_type('protein'):
            query_moltype = 'protein'
        query_ksize = query.estimator.ksize
        print('loaded query: {}... (k={}, {})'.format(query.name()[:30],
                                                      query_ksize,
                                                      query_moltype))

        orig_query = query

        sum_found = 0.
        found = []
        while True:
            # A fresh search object per round, as in every other search here.
            search_fn = SearchMinHashesFindBest().search

            results = []
            # use super low threshold for this part of the search
            for leaf in tree.find(search_fn, query, 0.00001):
                results.append((query.similarity(leaf.data), leaf.data))

            if not results:          # no matches at all!
                break

            # take the best result
            results.sort(key=lambda x: -x[0])   # reverse sort on similarity
            best_sim, best_ss = results[0]
            sim = best_ss.similarity(orig_query)

            # adjust by size of leaf (kmer cardinality of original genome);
            # fall back to 0 when either side has no HLL counter or the
            # query cardinality is zero, instead of crashing.
            f_of_total = 0
            leaf_hll = best_ss.estimator.hll
            query_hll = orig_query.estimator.hll
            if leaf_hll and query_hll:
                leaf_kmers = leaf_hll.estimate_cardinality()
                query_kmers = query_hll.estimate_cardinality()
                if query_kmers:
                    f_of_total = leaf_kmers / query_kmers * sim

            if not found and sim < args.threshold:
                print('best match: {}'.format(best_ss.name()))
                print('similarity is {:.5f} of db signature;'.format(sim))
                print('this is below specified threshold => exiting.')
                break

            # subtract found hashes from search hashes, construct new search
            new_mins = set(query.estimator.mh.get_mins())
            found_mins = best_ss.estimator.mh.get_mins()

            # print interim & save
            print('found: {:.2f} {} {}'.format(f_of_total,
                                               len(new_mins),
                                               best_ss.name()))
            found.append((f_of_total, best_ss, sim))
            sum_found += f_of_total

            new_mins -= set(found_mins)
            # NOTE(review): the reduced query is built with the estimator's
            # default molecule type -- confirm this is right for protein
            # queries.
            e = sourmash_lib.Estimators(ksize=args.ksize, n=len(new_mins))
            for m in new_mins:
                e.mh.add_hash(m)
            new_ss = sig.SourmashSignature('foo', e)
            query = new_ss

        print('found {}, total fraction {:.3f}'.format(len(found), sum_found))
        print('')

        if not found:
            sys.exit(0)

        # Largest fraction first. Sort on the fraction only: comparing whole
        # tuples would fall through to comparing signature objects whenever
        # two fractions tie (e.g. all 0 when no HLL is available).
        found.sort(key=lambda x: x[0], reverse=True)

        print('Composition:')
        for (frac, leaf_sketch, sim) in found:
            print('{:.2f} {}'.format(frac, leaf_sketch.name()))

        if args.output:
            print('Composition:', file=args.output)
            for (frac, leaf_sketch, sim) in found:
                print('{:.2f} {}'.format(frac, leaf_sketch.name()),
                      file=args.output)

        if args.csv:
            fieldnames = ['fraction', 'name', 'similarity', 'sketch_kmers']
            w = csv.DictWriter(args.csv, fieldnames=fieldnames)

            w.writeheader()
            for (frac, leaf_sketch, sim) in found:
                # Same fallback as for the fraction: report 0 k-mers when
                # the sketch carries no HLL counter.
                hll = leaf_sketch.estimator.hll
                cardinality = hll.estimate_cardinality() if hll else 0
                w.writerow(dict(fraction=frac, name=leaf_sketch.name(),
                                similarity=sim,
                                sketch_kmers=cardinality))
コード例 #8
0
ファイル: __main__.py プロジェクト: lgautier/sourmash
 def build_siglist(email, Elist, filename, name=None):
     """Return one ``sig.SourmashSignature`` per estimator in ``Elist``.

     Every signature is created with the same ``email``, ``filename`` and
     ``name``; the result keeps the order of ``Elist``.
     """
     def _sign(estimator):
         return sig.SourmashSignature(email, estimator,
                                      filename=filename, name=name)

     return [_sign(estimator) for estimator in Elist]
コード例 #9
0
ファイル: __main__.py プロジェクト: camillescott/sourmash
    def compute(self, args):
        """Compute the signature for one or more files.

        Command-line arguments (parsed from ``args``): one or more input
        filenames, ``--protein``, ``--input-is-protein``, ``-k/--ksizes``
        (comma-separated k-mer sizes), ``-n/--num-hashes``, ``-f/--force``,
        ``-o/--output`` and ``--email``.

        For each input file one estimator per k-mer size is filled with
        all of the file's sequences, and the resulting signatures are
        saved -- to the ``--output`` file when given, otherwise to
        '<basename>.sig' in the current directory. Without ``--output``, a
        file whose '.sig' already exists is skipped unless ``--force`` is
        set. Progress messages go to stderr.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('filenames', nargs='+')
        parser.add_argument('--protein', action='store_true')
        parser.add_argument('--input-is-protein', action='store_true')
        parser.add_argument('-k', '--ksizes', default=str(DEFAULT_K),
                            help='comma-separated list of k-mer sizes')
        parser.add_argument('-n', '--num-hashes', type=int,
                            default=DEFAULT_N,
                            help='number of hashes to use in each sketch')
        parser.add_argument('-f', '--force', action='store_true')
        parser.add_argument('-o', '--output', type=argparse.FileType('wt'))
        parser.add_argument('--email', type=str, default='')
        args = parser.parse_args(args)

        print('computing signatures for files:', args.filenames,
              file=sys.stderr)

        # get list of k-mer sizes for which to compute sketches; a value
        # without a comma simply splits into a single item
        ksizes = [int(k) for k in args.ksizes.split(',')]

        print('Computing signature for ksizes: %s' % str(ksizes),
              file=sys.stderr)

        # for each file, load & compute sketch.
        for filename in args.filenames:
            sigfile = os.path.basename(filename) + '.sig'
            if not args.output and os.path.exists(sigfile) and not args.force:
                print('skipping', filename, '- already done', file=sys.stderr)
                continue

            # one estimator for each ksize
            Elist = [
                sourmash_lib.Estimators(ksize=k,
                                        n=args.num_hashes,
                                        protein=args.protein)
                for k in ksizes
            ]

            # consume & calculate signatures
            print('... reading sequences from', filename, file=sys.stderr)
            for index, record in enumerate(screed.open(filename)):
                # progress report every 10000 records (but not at zero)
                if index and index % 10000 == 0:
                    print('...', filename, index, file=sys.stderr)

                sequence = record.sequence
                for E in Elist:
                    if args.input_is_protein:
                        E.mh.add_protein(sequence)
                    else:
                        E.add_sequence(sequence, args.force)

            # convert into a signature
            siglist = [
                sig.SourmashSignature(args.email, E, filename=filename)
                for E in Elist
            ]

            # save!
            if args.output:
                sig.save_signatures(siglist, args.output)
            else:
                with open(sigfile, 'w') as fp:
                    sig.save_signatures(siglist, fp)
コード例 #10
0
ファイル: survey.py プロジェクト: luizirber/zoo
        record['sequence'] = str(i)
        record['_id'] = _id
        db.survey.insert_one(record)

# db.survey.count()
# 33
json_dump('survey.json', db.survey.find())

# Now sketch all sequences of the survey collection so that it can be
# searched: one estimator per k-mer size, both saved to the same file.
fn = 'survey.sig'
e16 = Estimators(ksize=16, n=1000)
e31 = Estimators(ksize=31, n=1000)

# The projection returns only the 'sequence' field of every record.
cursor = db.survey.find({}, {'sequence': 1, '_id': 0})
for record in cursor:
    for v in record.values():
        # force bc/ ValueError: invalid DNA character in sequence: Y
        for estimator in (e16, e31):
            estimator.add_sequence(v, force=True)

# Wrap both estimators in identically labelled signatures.
s16, s31 = (
    signature.SourmashSignature(email='',
                                estimator=estimator,
                                name='survey',
                                filename=fn)
    for estimator in (e16, e31)
)

with open(fn, 'w+') as outsig:
    signature.save_signatures([s16, s31], fp=outsig)