def test_dna_mh(track_abundance):
    # building a MinHash from a whole sequence and from its constituent
    # k-mers one at a time should produce identical sketches.
    e1 = MinHash(n=5, ksize=4, track_abundance=track_abundance)
    e2 = MinHash(n=5, ksize=4, track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCAG'
    e1.add_sequence(seq)
    for i in range(len(seq) - 3):
        e2.add(seq[i:i + 4])

    assert e1.get_mins() == e2.get_mins()
    print(e1.get_mins())
    assert 726311917625663847 in e1.get_mins()
    assert 3697418565283905118 in e1.get_mins()
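# These tests take a `track_abundance` argument; in the original repo this
# presumably comes from a parametrized pytest fixture in conftest.py (an
# assumption -- the conftest is not shown in this section). A minimal sketch
# of such a fixture:

import pytest

@pytest.fixture(params=[False, True])
def track_abundance(request):
    # run each test once with abundance tracking off and once with it on.
    return request.param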
def test_protein_mh(track_abundance):
    e1 = MinHash(n=5, ksize=6, is_protein=True,
                 track_abundance=track_abundance)
    e2 = MinHash(n=5, ksize=6, is_protein=True,
                 track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCG'
    e1.add_sequence(seq)

    for i in range(len(seq) - 5):
        kmer = seq[i:i + 6]
        e2.add(kmer)

    assert e1.get_mins() == e2.get_mins()
    assert 901193879228338100 in e1.get_mins()
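# A quick companion check, using only the API exercised above: n=5 caps the
# number of retained hashes, so no matter how many k-mers get added, the
# sketch holds at most five values. (A sketch of the bottom-sketch invariant,
# not a test from the original suite.)

def test_num_caps_sketch_size():
    e = MinHash(n=5, ksize=4, track_abundance=False)
    e.add_sequence('ATGGCAGTGACGATGCCAG')
    assert len(e.get_mins()) <= 5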
def test_pickle(track_abundance):
    import pickle
    from io import BytesIO

    e1 = MinHash(n=5, ksize=6, is_protein=False,
                 track_abundance=track_abundance)

    seq = 'ATGGCAGTGACGATGCCG'
    e1.add_sequence(seq)
    e1.add_sequence(seq)

    fp = BytesIO()
    pickle.dump(e1, fp)

    fp2 = BytesIO(fp.getvalue())
    e2 = pickle.load(fp2)

    assert e1.get_mins(with_abundance=track_abundance) == \
        e2.get_mins(with_abundance=track_abundance)
    assert e1.num == e2.num
    assert e1.ksize == e2.ksize
    assert e1.is_protein == e2.is_protein
    assert e1.max_hash == e2.max_hash
    assert e1.seed == e2.seed
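# An in-memory round trip through pickle.dumps()/loads() is equivalent to the
# BytesIO dance above; a compact variant of the same check (a sketch, not part
# of the original suite):

def test_pickle_roundtrip_bytes(track_abundance):
    import pickle

    e1 = MinHash(n=5, ksize=6, is_protein=False,
                 track_abundance=track_abundance)
    e1.add_sequence('ATGGCAGTGACGATGCCG')

    e2 = pickle.loads(pickle.dumps(e1))
    assert e1.get_mins(with_abundance=track_abundance) == \
        e2.get_mins(with_abundance=track_abundance)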
def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size):
    a = MinHash(sketch_size, 10, track_abundance=True)
    oracle = dict(zip(hashes, abundances))

    a.set_abundances(oracle)

    mins = a.get_mins(with_abundance=True)
    # the sketch keeps at most sketch_size hashes, and drops any hash whose
    # abundance is zero.
    size = min(sum(1 for v in oracle.values() if v > 0), sketch_size)
    assert len(mins) == size

    for k, v in mins.items():
        assert oracle[k] == v
def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled):
    a = MinHash(0, 10, track_abundance=True, scaled=scaled)
    oracle = dict(zip(hashes, abundances))

    a.set_abundances(oracle)

    # a scaled sketch keeps only hashes at or below max_hash, again dropping
    # any hash whose abundance is zero.
    max_hash = get_max_hash_for_scaled(scaled)
    below_max_hash = sum(1 for (k, v) in oracle.items()
                         if k <= max_hash and v > 0)

    mins = a.get_mins(with_abundance=True)
    assert len(mins) == below_max_hash

    for k, v in mins.items():
        assert oracle[k] == v
        assert k <= max_hash
        assert v > 0
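# The two *_hypothesis tests above are property-based tests; the @given
# decorators that feed them `hashes`, `abundances`, and the sketch parameters
# are not shown in this section. A sketch of plausible strategies for the
# num-based variant (the exact bounds are assumptions, not the original
# suite's):

from hypothesis import given
from hypothesis import strategies as st

@given(hashes=st.lists(st.integers(min_value=1, max_value=2**64 - 1),
                       min_size=1, max_size=20, unique=True),
       abundances=st.lists(st.integers(min_value=0, max_value=100),
                           min_size=20, max_size=20),
       sketch_size=st.integers(min_value=1, max_value=20))
def test_set_abundance_num_hypothesis_sketch(hashes, abundances, sketch_size):
    test_set_abundance_num_hypothesis(hashes, abundances, sketch_size)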
def main(argv):
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('mh_index_picklefile', help='pickled hashval index')
    p.add_argument('hashval_list', help='file with list of hashvals')
    p.add_argument('output')
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    p.add_argument('--scaled', default=1000, type=float,
                   help="scaled value for contigs minhash output")
    p.add_argument('-v', '--verbose', action='store_true')
    args = p.parse_args(argv)

    # create output directory if it doesn't exist.
    outdir = args.output
    notify('putting output in {}', outdir)
    os.makedirs(os.path.join(outdir, "contigs"), exist_ok=True)

    if not os.path.isdir(outdir):
        error("output '{}' is not a directory and cannot be made", outdir)
        sys.exit(-1)

    # load picklefile
    with open(args.mh_index_picklefile, 'rb') as fp:
        hashval_to_contig_id = pickle.load(fp)
    notify('loaded {} hash value -> cdbg_id mappings from {}',
           len(hashval_to_contig_id), args.mh_index_picklefile)

    # load list of desired hashvals
    hashvals = [int(x.strip()) for x in open(args.hashval_list, 'rt')]
    hashvals = set(hashvals)
    notify('loaded {} search hashvalues from {}',
           len(hashvals), args.hashval_list)

    if not len(hashvals):
        print('No hash values to search!', file=sys.stderr)
        sys.exit(-1)

    # load catlas DAG
    catlas = CAtlas(args.catlas_prefix)
    notify('loaded {} nodes from catlas {}', len(catlas), args.catlas_prefix)
    notify('loaded {} layer 1 catlas nodes', len(catlas.layer1_to_cdbg))

    # find the contigs filename
    contigs_file = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # get a single ksize & scaled
    ksize = int(args.ksize)
    scaled = int(args.scaled)

    # record command line
    with open(os.path.join(outdir, 'command.txt'), 'wt') as fp:
        fp.write(str(sys.argv))
        fp.write("\n")

    # output results.csv in the output directory:
    csvoutfp = open(os.path.join(outdir, 'hashval_results.csv'), 'wt')
    csv_writer = csv.writer(csvoutfp)
    csv_writer.writerow(['hashval', 'bp', 'contigs'])

    # iterate over each query, do the thing.
    n_found = 0
    for hashval in hashvals:
        notify('----')
        notify('QUERY HASHVAL: {}', hashval)

        mh = MinHash(0, ksize, scaled=scaled)
        result = execute_query(hashval, catlas, hashval_to_contig_id, mh=mh)
        notify('done searching!')
        if not result:
            notify("no result for hashval {}", hashval)
            continue

        result.retrieve_contigs(contigs_file)
        result.write(csv_writer, csvoutfp, outdir)

        assert hashval in mh.get_mins()
        n_found += 1
    # end main loop!

    notify('----')
    notify("Done! Found {} hashvals of {} in {} with k={}",
           n_found, len(hashvals), args.catlas_prefix, ksize)
    notify("Results are in directory '{}'", outdir)

    return 0
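# The script's entry point is not shown in this section; a conventional guard,
# assuming main() takes the argument list without the program name:

if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))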