# Imports needed by the code in this excerpt. NOTE: the exact import
# paths are assumptions; hash_murmur lives in sourmash_lib's minhash
# extension module in the sourmash_lib-era API, and search_utils is a
# local helper module from the surrounding project.
import argparse
import os
import pickle
import sys

import numpy
import screed
import sourmash_lib
from sourmash_lib._minhash import hash_murmur

import search_utils


def test_bytes_murmur():
    # str, bytes, and unicode input must all hash identically.
    x = hash_murmur("ACG")
    assert x == 1731421407650554201

    x = hash_murmur(b"ACG")
    assert x == 1731421407650554201

    x = hash_murmur(u"ACG")
    assert x == 1731421407650554201
def test_murmur():
    x = hash_murmur("ACG")
    assert x == 1731421407650554201

    try:
        x = hash_murmur()
        assert 0, "hash_murmur requires an argument"
    except TypeError:
        pass

    # the default seed is 42, so passing it explicitly reproduces the
    # default hash; any other seed must change the value.
    x = hash_murmur("ACG", 42)
    assert x == 1731421407650554201

    y = hash_murmur("ACG", 43)
    assert y != x
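
# The vector-building code below relies on a make_all(ksize) helper that
# is not included in this excerpt. The following is a minimal sketch
# reconstructed from its call sites (produce all 4**ksize DNA k-mers in
# a consistent order); it is an assumption, not the original.
import itertools


def make_all(ksize):
    # enumerate all 4**ksize DNA k-mers in a fixed (lexicographic) order,
    # so downstream hashing and sorting are deterministic.
    return ["".join(bases) for bases in itertools.product("ACGT", repeat=ksize)]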
def compute_matrix(group_info, group_ident, ksize, output):
    # first, make a consistently ordered list of all k-mers, and convert
    # them into hashes.
    all_kmers = make_all(ksize)
    all_kmer_hashes = sorted({hash_murmur(kmer) for kmer in all_kmers})

    # now, build a matrix of GROUP_N rows x 4**ksize columns, where each
    # row is the set of k-mer abundances associated with one group.
    print('creating', len(group_info), 4**ksize)
    V = numpy.zeros((len(group_info), 4**ksize), dtype=numpy.uint16)
    node_id_to_group_idx = {}
    for i, n in enumerate(group_info):
        if i % 1000 == 0:
            print('...', i, len(group_info))
        mh = group_info[n]
        vec = dict(mh.get_mins(with_abundance=True))
        vec = [vec.get(hashval, 0) for hashval in all_kmer_hashes]

        V[i] = numpy.array(vec)
        node_id_to_group_idx[n] = i

    # save!
    print('saving matrix of size {} to {}'.format(str(V.shape), output))
    with open(output, 'wb') as fp:
        numpy.save(fp, V)

    with open(output + '.node_ids', 'wb') as fp:
        pickle.dump(node_id_to_group_idx, fp)

    with open(output + '.node_mh', 'wb') as fp:
        pickle.dump(group_ident, fp)
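
# A downstream consumer would reverse compute_matrix's three writes. This
# hypothetical loader simply mirrors the save calls above; the function
# name is ours, not part of the original code.
def load_group_matrix(output):
    with open(output, 'rb') as fp:
        V = numpy.load(fp)                      # groups x 4**ksize abundances
    with open(output + '.node_ids', 'rb') as fp:
        node_id_to_group_idx = pickle.load(fp)  # catlas node ID -> matrix row
    with open(output + '.node_mh', 'rb') as fp:
        group_ident = pickle.load(fp)           # catlas node ID -> k=31 MinHash
    return V, node_id_to_group_idx, group_ident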
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--minsize', type=float, default=2000)
    p.add_argument('--min-abund', type=float, default=0)
    p.add_argument('-k', '--ksize', default=5, type=int,
                   help='k-mer size for vectors')
    p.add_argument('--scaled', type=int, default=1000)
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))
    print('ksize: {}'.format(args.ksize))
    print('min_abund: {}'.format(args.min_abund))

    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = \
        search_utils.load_dag(catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # find the contigs filename
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # ...and catlas node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = \
        search_utils.load_cdbg_size_info(args.catlas_prefix,
                                         min_abund=args.min_abund)
    node_kmer_sizes, node_weighted_kmer_sizes = \
        search_utils.decorate_catlas_with_kmer_sizes(
            layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes,
            cdbg_weighted_kmer_sizes)

    ### everything is loaded!

    # find the highest nodes with k-mer size less than the given maxsize
    print('finding terminal nodes for {}.'.format(args.maxsize))
    nodes = partition_catlas(dag, top_node_id, node_kmer_sizes, args.maxsize)
    nodes = {n for n in nodes if node_kmer_sizes[n] > args.minsize}

    print('{} nodes between {} and {} in k-mer size'.format(
        len(nodes), args.minsize, args.maxsize))
    print('containing {} level1 nodes of {} total'.format(
        len(find_shadow(nodes, dag)), len(layer1_to_cdbg)))

    node_kmers = sum([node_kmer_sizes[n] for n in nodes])
    print('containing {} kmers of {} total ({:.1f}%)'.format(
        node_kmers, node_kmer_sizes[top_node_id],
        node_kmers / node_kmer_sizes[top_node_id] * 100))

    ### now build cdbg -> subtree/group ID
    cdbg_to_group = {}
    for n in nodes:
        shadow = find_shadow([n], dag)
        for level1_node in shadow:
            for cdbg_id in layer1_to_cdbg[level1_node]:
                if cdbg_kmer_sizes.get(cdbg_id):
                    assert cdbg_id not in cdbg_to_group
                    cdbg_to_group[cdbg_id] = n

    # record group info - here we are using the MinHash class to track
    # k-mer abundances in group_info, as well as using group_ident to
    # track k=31 MinHashes for identification of each group.
    group_info = {}
    group_ident = {}
    for n in nodes:
        group_info[n] = sourmash_lib.MinHash(n=0, ksize=args.ksize,
                                             scaled=1, track_abundance=1)
        group_ident[n] = sourmash_lib.MinHash(n=0, ksize=31,
                                              scaled=args.scaled)

    # aaaaaand iterate over contigs, collecting abundances from all contigs
    # in a group.
    for record_n, record in enumerate(screed.open(contigs)):
        if record_n % 10000 == 0:
            print('...', record_n, end='\r')
        cdbg_id = int(record.name)
        group_id = cdbg_to_group.get(cdbg_id)

        # if this is under a node that meets the minsize criteria, track it:
        if group_id is not None:
            # keep/measure abundances!
            mh = group_info[group_id]
            mh.add_sequence(record.sequence, True)

            # update group idents.
            group_ident[group_id].add_sequence(record.sequence, True)

    # ok, now we have a pile of k-mer vectors of size 4**args.ksize;
    # output in numpy format. (this repeats the logic of compute_matrix,
    # shown earlier.)

    # first, make a consistently ordered list of all k-mers, and convert
    # them into hashes.
    all_kmers = make_all(args.ksize)
    all_kmer_hashes = sorted({hash_murmur(kmer) for kmer in all_kmers})

    # now, build a matrix of GROUP_N rows x 4**ksize columns, where each
    # row is the set of k-mer abundances associated with one group.
    print('creating', len(group_info), 4**args.ksize)
    V = numpy.zeros((len(group_info), 4**args.ksize), dtype=numpy.uint16)
    node_id_to_group_idx = {}
    for i, n in enumerate(group_info):
        if i % 1000 == 0:
            print('...', i, len(group_info))
        mh = group_info[n]
        vec = dict(mh.get_mins(with_abundance=True))
        vec = [vec.get(hashval, 0) for hashval in all_kmer_hashes]

        V[i] = numpy.array(vec)
        node_id_to_group_idx[n] = i

    # save!
    print('saving matrix of size {} to {}'.format(str(V.shape), args.output))
    with open(args.output, 'wb') as fp:
        numpy.save(fp, V)

    with open(args.output + '.node_ids', 'wb') as fp:
        pickle.dump(node_id_to_group_idx, fp)

    with open(args.output + '.node_mh', 'wb') as fp:
        pickle.dump(group_ident, fp)
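
# partition_catlas and find_shadow are used above but defined elsewhere
# (spacegraphcats' search utilities). The sketches below are hedged
# reconstructions from the call sites, assuming `dag` maps each catlas
# node ID to a list of child IDs that is empty at the leaves; they are
# not the original implementations.
def partition_catlas(dag, root, node_kmer_sizes, maxsize):
    # descend from the root, keeping the highest nodes whose k-mer size
    # fits under maxsize; leaves are kept regardless, since they cannot
    # be subdivided further.
    chosen = set()
    stack = [root]
    while stack:
        n = stack.pop()
        if node_kmer_sizes[n] <= maxsize or not dag[n]:
            chosen.add(n)
        else:
            stack.extend(dag[n])
    return chosen


def find_shadow(nodes, dag):
    # collect the level-1 (leaf) catlas nodes beneath the given nodes.
    shadow = set()
    seen = set()
    stack = list(nodes)
    while stack:
        n = stack.pop()
        if n in seen:
            continue
        seen.add(n)
        if dag[n]:
            stack.extend(dag[n])
        else:
            shadow.add(n)
    return shadow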
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('genomes', nargs='+')
    parser.add_argument('-o', '--output')
    parser.add_argument('-k', '--ksize', default=5, type=int,
                        help='k-mer size for vectors')
    args = parser.parse_args()

    assert args.output, "please specify -o"

    n = 0
    genome_n = 0
    group_info = {}
    group_ident = {}
    labels = {}
    node_id_to_group_idx = {}
    for genome in args.genomes:
        print(genome)
        genome_n += 1
        for record in screed.open(genome):
            # chunk each sequence into SIZE-bp windows; SIZE is a
            # module-level constant defined outside this excerpt.
            for start in range(0, len(record.sequence), SIZE):
                mh = sourmash_lib.MinHash(n=0, ksize=args.ksize,
                                          scaled=1, track_abundance=1)
                mh.add_sequence(record.sequence[start:start + SIZE], True)
                group_info[n] = mh

                mh = sourmash_lib.MinHash(n=0, ksize=31, scaled=1000)
                mh.add_sequence(record.sequence[start:start + SIZE], True)
                group_ident[n] = mh

                labels[n] = genome_n
                node_id_to_group_idx[n] = n

                n += 1

    # ok, now we have a pile of k-mer vectors of size 4**args.ksize;
    # output in numpy format.

    # first, make a consistently ordered list of all k-mers, and convert
    # them into hashes.
    all_kmers = make_all(args.ksize)
    all_kmer_hashes = sorted({hash_murmur(kmer) for kmer in all_kmers})

    # now, build a matrix of GROUP_N rows x 4**ksize columns, where each
    # row is the set of k-mer abundances associated with one group.
    V = numpy.zeros((len(group_info), 4**args.ksize), dtype=numpy.uint16)
    for i, n in enumerate(group_info):
        mh = group_info[n]
        vec = dict(mh.get_mins(with_abundance=True))
        vec = [vec.get(hashval, 0) for hashval in all_kmer_hashes]
        V[i] = numpy.array(vec)

    # save!
    print('saving matrix of size {} to {}'.format(str(V.shape), args.output))
    with open(args.output, 'wb') as fp:
        numpy.save(fp, V)

    with open(args.output + '.labels', 'wb') as fp:
        pickle.dump(labels, fp)

    with open(args.output + '.node_ids', 'wb') as fp:
        pickle.dump(node_id_to_group_idx, fp)

    with open(args.output + '.node_mh', 'wb') as fp:
        pickle.dump(group_ident, fp)
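
# The genome-chunk script above additionally writes a '.labels' file
# mapping each matrix row to the ordinal of its source genome. A
# hypothetical loader for that pair of outputs (the function name is
# ours, not part of the original code):
def load_genome_matrix(output):
    with open(output, 'rb') as fp:
        V = numpy.load(fp)           # chunks x 4**ksize abundance matrix
    with open(output + '.labels', 'rb') as fp:
        labels = pickle.load(fp)     # row index -> genome ordinal (1-based)
    return V, labels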