Exemple #1
0
def test_sbt_gather_threshold_1():
    """Check SBT.gather() on tiny queries, with and without threshold_bp."""
    tree = SBT(GraphFactory(31, 1e5, 4), d=2)

    sig2, sig47, sig63 = (
        load_one_signature(utils.get_test_data(filename), ksize=31)
        for filename in ('2.fa.sig', '47.fa.sig', '63.fa.sig')
    )

    for sig in (sig47, sig63, sig2):
        tree.insert(sig)

    def assert_only_sig2(found):
        # exactly one hit: sig2, fully contained, with no name attached
        assert len(found) == 1
        containment, match_sig, name = found[0]
        assert containment == 1.0
        assert match_sig == sig2
        assert name is None

    # Queries are built from sig2's own hashes, a few at a time.
    # (per the original note, these signatures all have scaled=1000.)
    remaining = sorted(sig2.minhash.get_mins())
    query_mh = sig2.minhash.copy_and_clear()

    # an empty query must not match anything
    assert not query_mh
    assert not tree.gather(SourmashSignature(query_mh))

    # a single hash is enough for an unthresholded match
    query_mh.add_hash(remaining.pop())
    assert len(query_mh) == 1
    assert_only_sig2(tree.gather(SourmashSignature(query_mh)))

    # ...but not once a threshold is applied
    assert not tree.gather(SourmashSignature(query_mh), threshold_bp=5000)

    # grow the query by three hashes => four in total
    for _ in range(3):
        query_mh.add_hash(remaining.pop())
    assert len(query_mh) == 4
    assert_only_sig2(tree.gather(SourmashSignature(query_mh)))

    # four hashes are still below the threshold -> no results
    print('len mh', len(query_mh))
    assert not tree.gather(SourmashSignature(query_mh), threshold_bp=5000)
Exemple #2
0
def test_build_hashCounter():
    """build_hashCounter() should tally the hashes across all signatures given."""
    hash_sets = ((1, 2, 3, 4), (1, 2, 5))

    signatures = []
    for hashes in hash_sets:
        mh = MinHash(0, 21, scaled=1, track_abundance=True)
        mh.add_many(hashes)
        signatures.append(mh)
    # wrap only after every sketch has been filled, as the original did
    signatures = [SourmashSignature(mh) for mh in signatures]

    # hashes 1 and 2 occur in both sketches, the rest in only one
    expected = Counter({1: 2, 2: 2, 3: 1, 4: 1, 5: 1})

    hc = build_hashCounter(signatures, Counter())
    print("Hash Counter: ", hc)
    assert hc == expected
Exemple #3
0
def test_drop_below_mincount_threshold():
    """drop_below_mincount() should keep only hashes counted at least 3 times."""
    first = MinHash(0, 21, scaled=1, track_abundance=True)
    second = MinHash(0, 21, scaled=1, track_abundance=True)
    first.add_many((1, 2, 3, 4))
    # hash 1 is added twice here
    second.add_many((1, 1, 2, 5))

    signatures = [SourmashSignature(first), SourmashSignature(second)]
    tallied = build_hashCounter(signatures, Counter())

    kept_hashes = drop_below_mincount(tallied, 3)
    print("kept hashes: ", kept_hashes)

    # only hash 1 is expected to survive, with a count of 3
    assert kept_hashes == Counter({1: 3})
Exemple #4
0
def get_target_sig(sample_name):
    """Sketch the sequence file ``sample_name`` and save it as a signature.

    A MinHash (n=1000, ksize=31) is filled from every record in the file,
    wrapped in a signature named after the input path, and written to
    ``sample_name + '.sig'``.  Nothing is returned.
    """
    sketch_mh = sourmash.MinHash(n=1000, ksize=31)
    for rec in screed.open(sample_name):
        # second positional argument kept as in the original call
        # (presumably sourmash's ``force`` flag -- confirm against the API)
        sketch_mh.add_sequence(rec.sequence, True)

    signature = SourmashSignature(sketch_mh, name=sample_name)

    out_path = sample_name + '.sig'
    with open(out_path, 'wt') as out:
        save_signatures([signature], out)
Exemple #5
0
def test_sbt_gather_threshold_5():
    """Check that SBT.gather() returns a match when threshold_bp is met."""
    tree = SBT(GraphFactory(31, 1e5, 4), d=2)

    sig2, sig47, sig63 = (
        load_one_signature(utils.get_test_data(filename), ksize=31)
        for filename in ('2.fa.sig', '47.fa.sig', '63.fa.sig')
    )

    for sig in (sig47, sig63, sig2):
        tree.insert(sig)

    def assert_only_sig2(found):
        # exactly one hit: sig2, fully contained, with no name attached
        assert len(found) == 1
        containment, match_sig, name = found[0]
        assert containment == 1.0
        assert match_sig == sig2
        assert name is None

    # The query is built from sig2's own hashes.
    # (per the original note, these signatures all have scaled=1000.)
    remaining = sorted(sig2.minhash.get_mins())
    query_mh = sig2.minhash.copy_and_clear()

    # NOTE: the original comment said "add five hashes", but its loop popped
    # five hashes on each of five iterations -- 25 in total.  The count is
    # kept at 25 here so the test exercises exactly the same query.
    for _ in range(25):
        query_mh.add_hash(remaining.pop())

    # without a threshold, any match at all is returned
    assert_only_sig2(tree.gather(SourmashSignature(query_mh)))

    # with a threshold_bp that this query should be able to meet
    assert_only_sig2(tree.gather(SourmashSignature(query_mh),
                                 threshold_bp=5000))
Exemple #6
0
    def _signatures(self):
        """Build and return a ``{idx: SourmashSignature}`` dictionary.

        The ``hashval_to_idx`` mapping is inverted so that every idx gets a
        MinHash holding all of its hash values; each MinHash is then wrapped
        in a signature named via ``idx_to_ident`` / ``ident_to_name``.
        """
        from sourmash import MinHash, SourmashSignature

        # template sketch; every per-idx sketch is an emptied copy of it
        moltype = self.moltype
        template = MinHash(n=0,
                           ksize=self.ksize,
                           scaled=self.scaled,
                           is_protein=(moltype == 'protein'),
                           hp=(moltype == 'hp'),
                           dayhoff=(moltype == 'dayhoff'))

        debug('creating signatures for LCA DB...')
        sketches = defaultdict(template.copy_and_clear)
        buffered = defaultdict(list)

        # Invert hashval -> [idx, ...] into idx -> hashes, pushing the hashes
        # into the sketches in batches rather than one at a time.
        for hashval, idlist in self.hashval_to_idx.items():
            for idx in idlist:
                pending = buffered[idx]
                pending.append(hashval)

                # The batch size of 50 is arbitrary (see the CPython list
                # growth pattern in Objects/listobject.c for finer tuning).
                if len(pending) <= 50:
                    continue

                sketches[idx].add_many(pending)
                # Drop the flushed buffer; a fresh list is created on the
                # next access.  (`pending.clear()` is the Python-3-only
                # alternative mentioned by the original author.)
                del buffered[idx]

        # Flush whatever is left over in the per-idx buffers.
        for idx, leftovers in buffered.items():
            sketches[idx].add_many(leftovers)

        sigd = {
            idx: SourmashSignature(
                mh, name=self.ident_to_name[self.idx_to_ident[idx]])
            for idx, mh in sketches.items()
        }

        debug('=> {} signatures!', len(sigd))
        return sigd
Exemple #7
0
def load_sourmash_stream(fp):
    '''Lazily yield sourmash signatures parsed from a JSON stream.

    Each top-level array item is read incrementally with ijson,
    re-serialised as a one-element JSON list, and handed to the native
    ``signatures_load_buffer`` routine.  The first signature loaded from
    each item is yielded.

    Args:
        fp (file): File handle to parse from.
    '''
    parser = ijson.get_backend('yajl2')

    for record in parser.items(fp, 'item'):
        payload = json.dumps([record], cls=DecimalEncoder).encode('utf-8')

        # out-parameter that receives the number of signatures loaded
        count_ptr = ffi.new("uintptr_t *")
        sig_ptrs = rustcall(lib.signatures_load_buffer, payload, len(payload),
                            False, 0, ffi.NULL, count_ptr)
        n_loaded = ffi.unpack(count_ptr, 1)[0]

        # wrap every returned pointer, even though only the first is yielded
        loaded = [SourmashSignature._from_objptr(sig_ptrs[i])
                  for i in range(n_loaded)]
        yield loaded[0]
Exemple #8
0
def sketch(args):
    """Sketch every assembled genome in ``./fasta`` and save the signatures.

    Requires ``<args.name>.db`` and a ``fasta`` folder in the current
    working directory; if either is missing a message is printed and the
    process exits.  Every file matching ``*_skeasa.fasta`` in the folder is
    turned into a MinHash (n=1000, ksize=31), and all resulting signatures
    are written to ``<args.name>.sig``.

    Each signature is named after its file: the base name with the
    ``_skeasa.fasta`` suffix removed.

    Args:
        args: object with a ``name`` attribute (the database name).
    """
    cwd = os.getcwd()

    # check for the existence of the database
    db_path = os.path.join(cwd, args.name + '.db')
    if not os.path.exists(db_path):
        print(
            "Database file not found. Please make sure the name is correct or run mashpit build."
        )
        exit(0)

    fasta_folder = os.path.join(cwd, 'fasta')
    if not os.path.exists(fasta_folder):
        print("Fasta folder not found.")
        exit(0)

    # The suffix spelling is the one the glob pattern has always used; it is
    # shared with the name clean-up below so the two cannot drift apart.
    fasta_suffix = "_skeasa.fasta"
    genomes_list = glob.glob(os.path.join(fasta_folder, "*" + fasta_suffix))

    siglist = []
    for genome in genomes_list:
        mh = MinHash(n=1000, ksize=31)
        for record in screed.open(genome):
            mh.add_sequence(record.sequence, True)

        # str.strip() removes a *set of characters* from both ends, not a
        # prefix/suffix, so it could eat leading or trailing characters of
        # the sample name.  Drop the directory and the exact suffix instead.
        signame = os.path.basename(genome)
        if signame.endswith(fasta_suffix):
            signame = signame[:-len(fasta_suffix)]

        siglist.append(SourmashSignature(mh, name=signame))

    sig_file_name = args.name + '.sig'
    with open(sig_file_name, 'w') as f:
        save_signatures(siglist, fp=f)
def sketch_database(dict_files, folder, Debug, ksize_n, num_sketch):
	"""Sketch sequence files and save one sourmash signature per sample.

	A MinHash sketch is generated for every entry of ``dict_files``; once all
	sketches exist, each one is wrapped in a signature and written to
	``<folder>/<name>.sig``.

	As noted by the original author, ``add_sequence`` is called with
	force=True so that k-mers containing characters other than ACTG are
	skipped instead of raising an exception.

	:param dict_files: keys are the sample names and values are the paths to the fasta files
	:param folder: output folder for the signature files (created via HCGB_files.create_folder)
	:param Debug: True/False to print developing messages.
	:param ksize_n: Kmer size value.
	:param num_sketch: Number of sketches to include in the hash signature.

	:type dict_files: Dictionary
	:type folder: string
	:type Debug: bool
	:type ksize_n: integer
	:type num_sketch: integer

	:returns: Tuple ``(siglist_file, siglist)``: the paths of the signature files written, and the corresponding SourmashSignature objects.


	.. attention:: The code to implement this API function was taken and adapted from:

		- https://sourmash.readthedocs.io/en/latest/api-example.html

		- https://github.com/dib-lab/sourmash/blob/master/sourmash/commands.py


	.. seealso:: This function depends on sourmash python module (https://sourmash.readthedocs.io/en/latest/). Some functions employed are:

		- :func:`sourmash.MinHash`

		- :func:`sourmash.SourmashSignature`

		- :func:`sourmash.MinHash.add_sequence`


	.. include:: ../../links.inc

	"""
	## Phase 1: build one sketch per sample (nothing is written yet)
	minhashes = {}
	for sample, fasta_path in dict_files.items():
		print ('\t+ Skecthing sample: ', sample)
		sample_mh = sourmash.MinHash(n=num_sketch, ksize=ksize_n)
		for record in screed.open(fasta_path):
			## force=True: skip k-mers with non-ACTG characters
			sample_mh.add_sequence(record.sequence, True)
		minhashes[sample] = sample_mh

	## Debug messages
	if Debug:
		print (colored("\n*** DEBUG: minhashes *****\n", 'red'))
		print (type(minhashes))
		print (minhashes)

	## Phase 2: wrap each sketch in a signature and save it to disk
	HCGB_files.create_folder(folder)
	siglist_file = []
	siglist = []
	for sample, sample_mh in minhashes.items():
		signature = SourmashSignature(sample_mh, name=sample)
		sig_path = folder + '/' + str(sample) + '.sig'
		with open(sig_path, 'wt') as handle:
			save_signatures([signature], handle)

		siglist_file.append(sig_path)
		siglist.append(signature)

	return siglist_file, siglist
Exemple #10
0
 def _convert_signature(sig, msg):
     """Wrap ``sig`` in a SourmashSignature labelled from ``msg``.

     The name is ``<sample_name>:<t>`` and the filename is produced by
     ``format_filenames`` from ``msg.file_names``.
     """
     # converted first so evaluation order matches the original call
     converted = sig.to_sourmash()
     label = f'{msg.sample_name}:{msg.t}'
     source_files = format_filenames(msg.file_names)
     return SourmashSignature(converted, name=label, filename=source_files)