def sbt_index(client, db, cell, query, ksize, nsketch, key, file):
    '''Create a sequence Bloom tree from a cell/ database cursor.

    1. select seqs for tree
    2. assign common id (field derivative.minhash.sbt.ids)
    3. minhash seqs, name == UUID, md5? (think about SBT reuse)
    4. query a different collection/ metagenome against this

    --index {raw, minhash}
    input: all of cell or cursor

    \b
    $ zoo sbt_index --db ref --cell ref --ksize 16 --nsketch 1000 reference
    Initialize SBT.
    Compute minhash signatures for selected documents.
    k-mer size: 16, sketch size: 1000
    \ 9158 Elapsed Time: 0:01:45
    Save SBT.
    Done.

    \b
    $ sourmash sbt_search --ksize 16 reference survey.fa.sig
    # running sourmash subcommand: sbt_search
    loaded query: survey.fa... (k=16, DNA)
    0.11 0ef85591-d464-4953-915f-f673907b7e8e (Zika reference genome)

    TODO: add query
    TODO: --key arg not working?
    '''
    collection = MongoClient(client)[db][cell]

    print('Initialize SBT.')
    # init SBT
    factory = GraphFactory(ksize=ksize, starting_size=1e5, n_tables=4)  # 4 .. nt?
    tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

    print('Compute minhash signatures for selected documents.')
    print('{}{}{}{}'.format('k-mer size: ', ksize, ', sketch size: ', nsketch))

    bar = ProgressBar(max_value=UnknownLength)
    # One leaf per document: minhash its sequence and label the leaf with
    # the value found at the dotted key path (e.g. a UUID).
    for count, document in enumerate(collection.find(), start=1):
        sketch = Estimators(ksize=ksize, n=nsketch)
        sketch.add_sequence(document['sequence'], force=True)
        label = deep_get(document, key)
        sig = SourmashSignature(email='', estimator=sketch, name=label)
        tree.add_node(node=SigLeaf(metadata=label, data=sig))
        bar.update(count)

    print('\nSave SBT.')
    tree.save(file)
    print('Done.')
def test_deep_set():
    # Replacing an existing, truthy value is refused unless replace=True.
    with pytest.raises(TypeError):
        deep_set(d, 'a.b', [1, 2])
        # Key exists, item assignment not allowed w/ replace=False
    deep_set(d, 'a.b', [1, 2], replace=True)

    # Keys whose value evaluates to False, i.e. [], {}, "", False, None,
    # may be overwritten without replace=True.
    for path in ('a.c', 'a.d', 'a.e', 'a.f'):
        deep_set(d, path, 42)
    deep_set(d, 'a.g', {'foo': 'bar'})

    # We can access and modify the objects in place with deep_get()
    deep_get(d, 'a.b').append(3)
    assert deep_get(d, 'a.b')[2] == 3
    deep_get(d, 'a.g').update({'bar': 'foo'})
    assert deep_get(d, 'a.g')['bar'] == 'foo'

    # We can create new, nested keys, but only with force=True.
    with pytest.raises(KeyError):
        deep_set(d, 'a.new.nested.path', 5)
        # 'Key not present. Use "force=True" to create key.'
    deep_set(d, 'a.new.nested.path', 5, force=True)
    assert deep_get(d, 'a.new') == {'nested': {'path': 5}}

    # A new (nested) key can only be created if the "root" is a dict.
    with pytest.raises(AttributeError):
        deep_set(d, 'a.new.nested.path.below', 5, force=True)
        # 'int' object has no attribute 'setdefault'
    deep_set(d, 'a.new.nested.path', {}, replace=True)
    deep_set(d, 'a.new.nested.path.below', 5, force=True)
    assert deep_get(d, 'a.new.nested.path.below') == 5
def add(file, client, db, cell, primkey):
    '''Load a data cell.

    An alternative primary key can be specified to insert documents. This
    is useful in the case where the data cell comes from a collaborator
    who uses a different set of UUIDs as we do. In this case, these
    identifiers do not reflect, whether an entry is a duplicate.

    Example:

    \b
    $ zoo add --client localhost:27017 --db zika --cell t5 zoo/data/cell_a.json
    Loading data cell.
    3 documents inserted in cell "t5".
    0 duplicates skipped.
    Done.

    \b
    $ zoo add --db zika --cell t5 --primkey genbank.a zoo/data/cell_b.json
    Loading data cell.
    Index created on field "genbank.a".
    1 documents inserted in cell "t5".
    3 duplicates skipped.
    Done.
    '''
    click.echo('Loading data cell.')
    c = MongoClient(client)[db][cell]
    inserted = 0
    duplicates = 0

    if primkey == '_id':
        # The default primary key is indexed by MongoDB itself; rely on
        # the unique index to reject duplicates.
        for line in file:
            try:
                c.insert_one(json.loads(line.strip()))
                inserted += 1
            except DuplicateKeyError:
                duplicates += 1
    else:
        # Index primkey if it does not exist yet.
        if primkey not in c.index_information():
            c.create_index(primkey, unique=True, name=primkey)
            print('Index created on field', '"' + primkey + '".')
        for line in file:
            d = json.loads(line.strip())
            # A hit on the primary key means this document is a duplicate.
            if c.find_one({primkey: deep_get(d, primkey)}):
                duplicates += 1
            else:
                c.insert_one(d)
                inserted += 1

    print(inserted, 'documents inserted in cell', '"' + cell + '".')
    # Always report duplicates (the docstring examples show the count even
    # when it is 0) so "Done." is printed unconditionally as documented.
    print(duplicates, 'duplicates skipped.\nDone.')
def minhash(client, db, cell, query, ksize, nsketch, key, file):
    '''Minhash a cell/ database cursor.

    just plain old sigs for collection
    '''
    collection = MongoClient(client)[db][cell]
    bar = ProgressBar(max_value=UnknownLength)
    signatures = []

    print('Compute minhash signatures for selected documents.')
    print('{}{}{}{}'.format('k-mer size: ', ksize, ', sketch size: ', nsketch))

    # One signature per document, named after the value at the dotted key.
    for count, document in enumerate(collection.find(), start=1):
        sketch = Estimators(ksize=ksize, n=nsketch)
        sketch.add_sequence(document['sequence'], force=True)
        signatures.append(
            SourmashSignature(
                email='', estimator=sketch, name=deep_get(document, key)))
        bar.update(count)

    print('\nSave signatures.')
    signature.save_signatures(signatures, fp=file)
    print('Done.')
# Enrich document `d` with fields taken from the current record `j`.
# No host information is available here, so it is explicitly cleared.
host = None
entries = {
    '_id': str(uuid4()),
    'metadata.location': j.country,
    'metadata.date': date,
    'metadata.host': host,
}
for k, v in entries.items():
    try:  # NaN in host, date
        deep_set(d, k, v, replace=True)
    except AttributeError:
        pass

# Alternative identifiers are accumulated in place.
deep_get(d, 'metadata.alt_id').append({'genbank': j.genbank})
deep_get(d, 'metadata.grp_id').append({'segments': j.id})

deep_set(d, 'relative.taxonomy.subtype', j.subtype, force=True)
deep_set(d, 'derivative.segment_number', j.segment_number, force=True)
deep_set(d, 'derivative.length', j.seqlen, force=True)
deep_set(d, 'metadata.age', j.age, force=True)
deep_set(d, 'metadata.gender', j.gender, force=True)
# Raw string: '\(' in a plain literal is an invalid escape sequence
# (deprecated since Python 3.6). The pattern extracts the text between
# parentheses in the isolate name.
deep_set(
    d, 'relative.taxonomy.nomenclature',
    re.search(r'\((.*)\)', j.isolate).group(1))
def test_deep_get():
    # Dotted-path lookup into the nested test fixture `d`.
    result = deep_get(d, 'a.b')
    assert result == 5
from sourmash_lib.sbtmh import SigLeaf, search_minhashes
from sourmash_lib.signature import SourmashSignature

KSIZE = 16
N = 1000

# init SBT
factory = GraphFactory(ksize=KSIZE, starting_size=1e5, n_tables=4)  # 4 .. nt?
tree = SBT(factory, d=2)  # d .. see "n-ary " in notebook

bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
count = 0
for record in db.ref.find():
    # Label each leaf with the GenBank alternative id.
    name = deep_get(record, 'metadata.alt_id.gb')
    e = Estimators(ksize=KSIZE, n=N)
    # db.ref.find_one()['sequence'] .. 'ACTG...'
    e.add_sequence(record['sequence'], force=True)  # e.get_hashes()
    s = SourmashSignature(email='', estimator=e, name=name)
    tree.add_node(node=SigLeaf(metadata=name, data=s))
    count += 1
    bar.update(count)
# \ 9158 Elapsed Time: 0:01:49

# search the last fasta entry against the SBT (">0.95")
# filtered = tree.find(search_minhashes, s, 0.1)
# matches = [(str(i.metadata), i.data.similarity(s)) for i in filtered]
# [('0.95', 1.0)]  # fasta header, similarity