def test_sbt_gather_threshold_1():
    """Check SBT.gather() on queries built from a few of sig2's hashes.

    Covers an empty query, a 1-hash query and a 4-hash query, each with and
    without ``threshold_bp=5000``.
    """
    tree = SBT(GraphFactory(31, 1e5, 4), d=2)

    sig2, sig47, sig63 = (
        load_one_signature(utils.get_test_data(fname), ksize=31)
        for fname in ('2.fa.sig', '47.fa.sig', '63.fa.sig')
    )
    for leaf_sig in (sig47, sig63, sig2):
        tree.insert(leaf_sig)

    # Query signatures are built up hash by hash from sig2's own hashes
    # (the original note says these signatures all have scaled=1000).
    mins = sorted(sig2.minhash.get_mins())
    new_mh = sig2.minhash.copy_and_clear()

    def gather(**kwargs):
        # Wrap the current state of new_mh in a fresh signature per query.
        return tree.gather(SourmashSignature(new_mh), **kwargs)

    def check_single_full_match(found):
        # Exactly one hit: sig2, fully contained, with no name attached.
        assert len(found) == 1
        containment, match_sig, name = found[0]
        assert containment == 1.0
        assert match_sig == sig2
        assert name is None

    # Empty query: falsy minhash, nothing gathered.
    assert not new_mh
    assert not gather()

    # One hash.
    new_mh.add_hash(mins.pop())
    assert len(new_mh) == 1
    check_single_full_match(gather())

    # Same query with a threshold -> should be no results.
    assert not gather(threshold_bp=5000)

    # Three more hashes => length of 4.
    for _ in range(3):
        new_mh.add_hash(mins.pop())
    assert len(new_mh) == 4
    check_single_full_match(gather())

    # Still a too-high threshold -> should be no results.
    print('len mh', len(new_mh))
    assert not gather(threshold_bp=5000)
def test_build_hashCounter():
    """Check the Counter that build_hashCounter() returns for two signatures."""
    signatures = []
    for hashes in ((1, 2, 3, 4), (1, 2, 5)):
        mh = MinHash(0, 21, scaled=1, track_abundance=True)
        mh.add_many(hashes)
        signatures.append(SourmashSignature(mh))

    # Hashes 1 and 2 occur in both signatures; 3, 4 and 5 in only one.
    expected = Counter({1: 2, 2: 2, 3: 1, 4: 1, 5: 1})

    hc = build_hashCounter(signatures, Counter())
    print("Hash Counter: ", hc)
    assert hc == expected
def test_drop_below_mincount_threshold():
    """Check that drop_below_mincount(hc, 3) leaves only hash 1 (count 3)."""
    signatures = []
    for hashes in ((1, 2, 3, 4), (1, 1, 2, 5)):
        mh = MinHash(0, 21, scaled=1, track_abundance=True)
        mh.add_many(hashes)
        signatures.append(SourmashSignature(mh))

    hc = build_hashCounter(signatures, Counter())
    kept_hashes = drop_below_mincount(hc, 3)

    expected = Counter({1: 3})
    print("kept hashes: ", kept_hashes)
    assert kept_hashes == expected
def get_target_sig(sample_name):
    """Sketch one sequence file and save its signature next to it.

    Reads every record of ``sample_name`` with screed, adds each sequence to
    a MinHash (``n=1000``, ``ksize=31``) and writes the resulting signature,
    named after the input path, to ``sample_name + '.sig'``.

    :param sample_name: path of the sequence file to sketch; also used as the
        signature name and as the stem of the output file.
    :returns: None; the signature is written to disk.
    """
    mh = sourmash.MinHash(n=1000, ksize=31)

    # Open through the context manager so the sequence file handle is closed
    # as soon as sketching finishes (or fails), not left to the GC.
    with screed.open(sample_name) as records:
        for record in records:
            # Second positional argument is passed as True, as in the
            # original call (presumably sourmash's `force` flag -- confirm).
            mh.add_sequence(record.sequence, True)

    sig = SourmashSignature(mh, name=sample_name)
    with open(sample_name + '.sig', 'wt') as fp:
        save_signatures([sig], fp)
def test_sbt_gather_threshold_5():
    """Check SBT.gather() with a query big enough to pass threshold_bp=5000."""
    tree = SBT(GraphFactory(31, 1e5, 4), d=2)

    sig2, sig47, sig63 = (
        load_one_signature(utils.get_test_data(fname), ksize=31)
        for fname in ('2.fa.sig', '47.fa.sig', '63.fa.sig')
    )
    for leaf_sig in (sig47, sig63, sig2):
        tree.insert(leaf_sig)

    # Build the query from sig2's own hashes (the original note says these
    # signatures all have scaled=1000).
    mins = sorted(sig2.minhash.get_mins())
    new_mh = sig2.minhash.copy_and_clear()

    # NOTE(review): the original comment reads "add five hashes", but the
    # loop as reconstructed runs five add_hash() calls on each of its five
    # iterations, i.e. 25 hashes. Behaviour is kept; confirm the intent.
    for _round in range(5):
        for _ in range(5):
            new_mh.add_hash(mins.pop())

    def check_single_full_match(found):
        # Exactly one hit: sig2, fully contained, with no name attached.
        assert len(found) == 1
        containment, match_sig, name = found[0]
        assert containment == 1.0
        assert match_sig == sig2
        assert name is None

    # No threshold: any match at all is returned.
    check_single_full_match(tree.gather(SourmashSignature(new_mh)))

    # A threshold_bp that this query should be able to meet.
    check_single_full_match(
        tree.gather(SourmashSignature(new_mh), threshold_bp=5000)
    )
def _signatures(self):
    """Build and return a ``{idx: SourmashSignature}`` dictionary.

    Inverts ``self.hashval_to_idx`` so each idx gets a MinHash holding all
    of its hash values, then wraps each MinHash in a signature named via
    ``self.idx_to_ident`` and ``self.ident_to_name``.
    """
    from sourmash import MinHash, SourmashSignature

    moltype = self.moltype
    template = MinHash(n=0, ksize=self.ksize, scaled=self.scaled,
                       is_protein=(moltype == 'protein'),
                       hp=(moltype == 'hp'),
                       dayhoff=(moltype == 'dayhoff'))

    debug('creating signatures for LCA DB...')

    # Each new idx starts from an empty copy of the template MinHash.
    mhd = defaultdict(template.copy_and_clear)
    pending = defaultdict(list)

    # Invert hashval -> [idx, ...] into per-idx batches of hash values.
    for hashval, idx_list in self.hashval_to_idx.items():
        for idx in idx_list:
            batch = pending[idx]
            batch.append(hashval)

            # 50 is an arbitrary batch size; keep accumulating until the
            # batch exceeds it.
            if len(batch) <= 50:
                continue

            mhd[idx].add_many(batch)
            # Drop the flushed batch so a fresh list is started next time.
            del pending[idx]

    # Flush whatever is left over (batches of at most 50 hashes).
    for idx, leftover in pending.items():
        mhd[idx].add_many(leftover)

    sigd = {
        idx: SourmashSignature(
            mh, name=self.ident_to_name[self.idx_to_ident[idx]])
        for idx, mh in mhd.items()
    }

    debug('=> {} signatures!', len(sigd))
    return sigd
def load_sourmash_stream(fp):
    """Iteratively parse a JSON file of sourmash signatures.

    Each top-level item is re-serialised on its own, handed to
    ``lib.signatures_load_buffer`` through ``rustcall``, and the first
    signature object loaded from it is yielded.

    Args:
        fp (file): File handle to parse from.
    """
    items = ijson.get_backend('yajl2').items(fp, 'item')
    for signature in items:
        payload = json.dumps([signature], cls=DecimalEncoder).encode('utf-8')

        # Out-parameter that receives the number of loaded signatures.
        count_ptr = ffi.new("uintptr_t *")
        ptr = rustcall(lib.signatures_load_buffer, payload, len(payload),
                       False, 0, ffi.NULL, count_ptr)
        count = ffi.unpack(count_ptr, 1)[0]

        # Wrap every returned pointer, then hand back only the first one.
        loaded = [SourmashSignature._from_objptr(ptr[i]) for i in range(count)]
        yield loaded[0]
def sketch(args):
    """Sketch the assembled genomes in ``./fasta`` into ``<args.name>.sig``.

    Requires ``<args.name>.db`` and a ``fasta`` folder in the current
    working directory; if either is missing a message is printed and the
    process exits. Every file matching ``fasta/*_skeasa.fasta`` is sketched
    into a MinHash (``n=1000``, ``ksize=31``) and all resulting signatures
    are saved together in ``<args.name>.sig``.

    :param args: object with a ``name`` attribute (the database name).
    :returns: None; the signature file is written to the working directory.
    :raises SystemExit: when the database file or the fasta folder is absent.
    """
    import sys

    cwd = os.getcwd()
    db_path = os.path.join(cwd, args.name + '.db')

    # check for the existence of the database and tables
    if not os.path.exists(db_path):
        print(
            "Database file not found. Please make sure the name is correct or run mashpit build."
        )
        # sys.exit replaces the site-provided exit() helper, which is not
        # guaranteed to exist. NOTE(review): status 0 is kept from the
        # original although this is an error path -- confirm.
        sys.exit(0)

    fasta_folder = os.path.join(cwd, 'fasta')
    if not os.path.exists(fasta_folder):
        print("Fasta folder not found.")
        sys.exit(0)

    sig_file_name = args.name + '.sig'

    # One suffix drives both the glob and the name derivation so the two
    # cannot drift apart. NOTE(review): the original stripped
    # '_skesa.fasta' while globbing '*_skeasa.fasta'; the glob spelling is
    # kept because it decides which files are picked up -- confirm.
    fasta_suffix = "_skeasa.fasta"
    genomes_list = glob.glob(os.path.join(fasta_folder, "*" + fasta_suffix))

    siglist = []
    for genome in genomes_list:
        mh = MinHash(n=1000, ksize=31)
        # Context manager closes the sequence file once it has been read.
        with screed.open(genome) as records:
            for record in records:
                mh.add_sequence(record.sequence, True)

        # str.strip() removes a *set of characters* from both ends, which
        # can eat into the sample name itself. Take the base name and cut
        # the known suffix off by length instead.
        signame = os.path.basename(genome)[:-len(fasta_suffix)]
        siglist.append(SourmashSignature(mh, name=signame))

    with open(sig_file_name, 'w') as f:
        save_signatures(siglist, fp=f)
def sketch_database(dict_files, folder, Debug, ksize_n, num_sketch):
    """Sketch sequence files

    This function generates a sourmash index, also called sketch, of the
    sequences provided, and saves one ``.sig`` file per sample in the folder
    specified.

    For speed reasons, we set force=True in add_sequence step to skip over
    k-mers containing characters other than ACTG, rather than raising an
    exception.

    :param dict_files: keys are the names of the files and values are the path to the fasta file
    :param folder: Folder where the signature files are written; it is passed to HCGB_files.create_folder before writing.
    :param Debug: True/False to print developing messages.
    :param ksize_n: Kmer size value.
    :param num_sketch: Number of sketches to include in the hash signature.

    :type dict_files: Dictionary
    :type folder: string
    :type Debug: bool
    :type ksize_n: integer
    :type num_sketch: integer

    :returns: Tuple ``(siglist_file, siglist)``: the paths of the signature
        files generated (``folder + '/' + name + '.sig'``) and the matching
        list of SourmashSignature objects, in the same order.

    .. attention:: The code to implement this API function was taken and adapted from:

        - https://sourmash.readthedocs.io/en/latest/api-example.html

        - https://github.com/dib-lab/sourmash/blob/master/sourmash/commands.py

    .. seealso:: This function depends on sourmash python module (https://sourmash.readthedocs.io/en/latest/). Some functions employed are:

        - :func:`sourmash.MinHash`

        - :func:`sourmash.SourmashSignature`

        - :func:`sourmash.MinHash.add_sequence`

    .. include:: ../../links.inc
    """
    ### Default: set as option
    ## num_sketch=5000
    ## ksize_n=31

    minhashes = {}
    for name, g in dict_files.items():
        print ('\t+ Skecthing sample: ', name)

        ## generate hash according to number of sketches and kmer size
        E = sourmash.MinHash(n=num_sketch, ksize=ksize_n)

        ## the context manager closes the sequence file once it is read
        with screed.open(g) as records:
            for record in records:
                ## force=True: skip k-mers containing characters other than
                ## ACTG, rather than raising an exception
                E.add_sequence(record.sequence, True)

        minhashes[name] = E

    ## Debug messages
    if Debug:
        print (colored("\n*** DEBUG: minhashes *****\n", 'red'))
        print (type(minhashes))
        print (minhashes)

    siglist = []
    siglist_file = []

    ### save as signature
    HCGB_files.create_folder(folder)
    for names, hashes in minhashes.items():
        sig1 = SourmashSignature(hashes, name=names)
        outfile_name = folder + '/' + str(names) + '.sig'
        with open(outfile_name, 'wt') as fp:
            save_signatures([sig1], fp)

        siglist_file.append(outfile_name)
        siglist.append(sig1)

    return (siglist_file, siglist)
def _convert_signature(sig, msg):
    """Wrap ``sig`` in a SourmashSignature labelled from ``msg``.

    The signature name is ``<msg.sample_name>:<msg.t>`` and the filename is
    whatever ``format_filenames`` returns for ``msg.file_names``.
    """
    minhash = sig.to_sourmash()
    label = f'{msg.sample_name}:{msg.t}'
    source_files = format_filenames(msg.file_names)
    return SourmashSignature(minhash, name=label, filename=source_files)