def test_load_graph_1():
    in1 = utils.get_test_data('test-abund-read-2.fa')
    out1 = utils.get_temp_filename('out.ct')

    cmd = """
       cat {in1} |
       {scripts}/load-graph.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """
    cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
    print(cmd)

    run_shell_cmd(cmd)

    assert os.path.exists(out1)
    khmer.load_nodegraph(out1)

def data(self):
    if self._data is None:
        if self._filename is None:
            self._data = self._factory()
        else:
            self._data = khmer.load_nodegraph(self._filename)
    return self._data

def load(sbt_fn):
    with open(sbt_fn) as fp:
        nodes = json.load(fp)

    if nodes[0] is None:
        # TODO error!
        raise ValueError("Empty tree!")

    sbt_nodes = []

    ksize, tablesize, ntables, _, _, _ = khmer.extract_nodegraph_info(
        nodes[0]["filename"])
    factory = GraphFactory(ksize, tablesize, ntables)

    for node in nodes:
        if node is None:
            sbt_nodes.append(None)
            continue

        graph = khmer.load_nodegraph(node["filename"])

        if "metadata" in node:
            # only Leaf nodes have metadata
            l = Leaf(node["metadata"], graph)
            sbt_nodes.append(l)
        else:
            n = Node(factory, name=node["name"])
            n.graph = graph
            sbt_nodes.append(n)

    tree = SBT(factory)
    tree.nodes = sbt_nodes

    return tree

def test_save_load(tabletype):
    kh = tabletype(5, PRIMES_1m)
    savefile = utils.get_temp_filename('tablesave.out')

    # test add(dna)
    x = kh.add("ATGGC")
    z = kh.get("ATGGC")
    assert z == 1

    kh.save(savefile)

    # should we provide a single load function here? yes, probably. @CTB
    if tabletype == _Countgraph:
        loaded = khmer.load_countgraph(savefile)
    elif tabletype == _Counttable:
        loaded = khmer.load_counttable(savefile)
    elif tabletype == _SmallCountgraph:
        loaded = khmer.load_countgraph(savefile, small=True)
    elif tabletype == _SmallCounttable:
        loaded = khmer.load_counttable(savefile, small=True)
    elif tabletype == _Nodegraph:
        loaded = khmer.load_nodegraph(savefile)
    elif tabletype == _Nodetable:
        loaded = khmer.load_nodetable(savefile)
    else:
        raise Exception("unknown tabletype")

    z = loaded.get('ATGGC')
    assert z == 1

def test_load_graph_1():
    in1 = utils.get_test_data("test-abund-read-2.fa")
    out1 = utils.get_temp_filename("out.ct")

    cmd = """
       cat {in1} |
       {scripts}/load-graph.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """
    cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
    print(cmd)

    (status, out, err) = run_shell_cmd(cmd)

    assert os.path.exists(out1)
    khmer.load_nodegraph(out1)

def test_hashbits_file_version_check():
    inpath = utils.get_test_data('badversion-k12.htable')

    try:
        nodegraph = khmer.load_nodegraph(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))

def test_nodegraph_file_type_check():
    kh = khmer._Countgraph(12, [1])
    savepath = utils.get_temp_filename('tempcountingsave0.ct')
    kh.save(savepath)

    try:
        nodegraph = khmer.load_nodegraph(savepath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))

def data(self):
    if self._data is None:
        data = self.storage.load(self._path)
        # We need to do this tempfile dance because khmer can only load
        # data from files.
        with NamedTemporaryFile(suffix=".gz") as f:
            f.write(data)
            f.file.flush()
            self._data = khmer.load_nodegraph(f.name)
    return self._data

def main():
    info('make-initial-stoptags.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    print('loading nodegraph %s.pt' % graphbase, file=sys.stderr)
    nodegraph = khmer.load_nodegraph(graphbase)

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    nodegraph.load_tagset(graphbase + '.tagset')

    counting = khmer_args.create_countgraph(args)

    # divide up into SUBSET_SIZE fragments
    divvy = nodegraph.divide_tags_into_subsets(args.subset_size)
    divvy = list(divvy)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
    subset = nodegraph.do_subset_partition(start, end)

    # now, repartition...
    print('repartitioning to find HCKs.', file=sys.stderr)
    nodegraph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

    print('saving stop tags', file=sys.stderr)
    nodegraph.save_stop_tags(graphbase + '.stoptags')

    print('wrote to:', graphbase + '.stoptags', file=sys.stderr)

def main():
    info('make-initial-stoptags.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    print('loading nodegraph %s.pt' % graphbase, file=sys.stderr)
    nodegraph = khmer.load_nodegraph(graphbase)

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    nodegraph.load_tagset(graphbase + '.tagset')

    counting = khmer_args.create_countgraph(args)

    # divide up into SUBSET_SIZE fragments
    divvy = nodegraph.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
    subset = nodegraph.do_subset_partition(start, end)

    # now, repartition...
    print('repartitioning to find HCKs.', file=sys.stderr)
    nodegraph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

    print('saving stop tags', file=sys.stderr)
    nodegraph.save_stop_tags(graphbase + '.stoptags')

    print('wrote to:', graphbase + '.stoptags', file=sys.stderr)

def test_n_occupied_save_load():
    filename = utils.get_test_data('random-20-a.fa')

    nodegraph = khmer.Nodegraph(20, 100000, 3)

    for _, record in enumerate(screed.open(filename)):
        nodegraph.consume(record.sequence)

    assert nodegraph.n_occupied() == 3884
    assert nodegraph.n_unique_kmers() == 3960

    savefile = utils.get_temp_filename('out')
    nodegraph.save(savefile)

    ng2 = khmer.load_nodegraph(savefile)
    assert ng2.n_occupied() == 3884, ng2.n_occupied()
    assert ng2.n_unique_kmers() == 0  # this is intended behavior, sigh.

def load_node(node_dict, factory):
    graph = khmer.load_nodegraph(node_dict['filename'])

    if 'metadata' in node_dict:
        # must be a leaf
        return Leaf(node_dict['metadata'], node_dict['name'], graph)
    else:
        node = Node(factory)
        node.graph = graph

        left = node_dict['left']
        node.subnodes.append(load_node(left, factory))

        right = node_dict['right']
        node.subnodes.append(load_node(right, factory))

        node.children = node_dict['children']
        node.name = node_dict['name']

        return node

def main():
    parser = argparse.ArgumentParser(
        description="This script creates a CSV file of similarity indices between the"
                    " input file and each of the sketches in the training/reference file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t', '--threads', type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-f', '--force', action="store_true",
                        help="Force creation of new NodeGraph.")
    parser.add_argument('-fp', '--fp_rate', type=restricted_float,
                        help="False positive rate.", default=0.0001)
    parser.add_argument('-ct', '--containment_threshold', type=restricted_float,
                        help="Only return results with containment index above this value",
                        default=0.02)
    parser.add_argument('-c', '--confidence', type=restricted_float,
                        help="Desired probability that all results were returned with "
                             "containment index above threshold [-ct]",
                        default=0.95)
    parser.add_argument('-ng', '--node_graph',
                        help="NodeGraph/bloom filter location. Used if it exists; if not, one "
                             "will be created and put in the same directory as the specified "
                             "output CSV file.",
                        default=None)
    parser.add_argument('-b', '--base_name', action="store_true",
                        help="Flag to indicate that only the base names (not the full path) "
                             "should be saved in the output CSV file")
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true",
                        help="Option to only insert query k-mers in bloom filter if they "
                             "appear anywhere in the training database. Note that the "
                             "Jaccard estimates will now be "
                             "J(query intersect union_i training_i, training_i) instead of "
                             "J(query, training_i), but will use significantly less space.")
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('training_data',
                        help="Training/reference data (HDF5 file created by "
                             "MakeTrainingDatabase.py)")
    parser.add_argument('out_csv', help='Output CSV file')

    # Parse and check args
    args = parser.parse_args()
    base_name = args.base_name
    training_data = os.path.abspath(args.training_data)
    if not os.path.exists(training_data):
        raise Exception("Training/reference file %s does not exist." % training_data)

    # Let's get the k-mer sizes in the training database
    ksizes = set()

    # Import all the training data
    sketches = MH.import_multiple_from_single_hdf5(training_data)

    # Check for issues with the sketches (can also check if all the k-mers make sense
    # (i.e. no '' or non-ACTG characters))
    if sketches[0]._kmers is None:
        raise Exception(
            "For some reason, the k-mers were not saved when the database was created. "
            "Try running MakeDNADatabase.py again.")
    num_hashes = len(sketches[0]._kmers)
    for i in range(len(sketches)):
        sketch = sketches[i]
        if sketch._kmers is None:
            raise Exception(
                "For some reason, the k-mers were not saved when the database was created. "
                "Try running MakeDNADatabase.py again.")
        if len(sketch._kmers) != num_hashes:
            raise Exception("Unequal number of hashes for sketch of %s" %
                            sketch.input_file_name)
        ksizes.add(sketch.ksize)
        if len(ksizes) > 1:
            raise Exception(
                "Training/reference data uses different k-mer sizes. Culprit was %s." %
                (sketch.input_file_name))

    # Get the appropriate k-mer size
    ksize = ksizes.pop()

    # Get number of threads to use
    num_threads = args.threads

    # Check and parse the query file
    query_file = os.path.abspath(args.in_file)
    if not os.path.exists(query_file):
        raise Exception("Query file %s does not exist." % query_file)

    # Node graph is stored in the output folder with name <InputFASTQ/A>.NodeGraph.K<k_size>
    if args.node_graph is None:  # If no node graph is specified, create one
        node_graph_out = os.path.join(
            os.path.dirname(os.path.abspath(args.out_csv)),
            os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
        if not os.path.exists(node_graph_out):  # Don't complain if the default location works
            print("Node graph not provided (via -ng). Creating one at: %s" % node_graph_out)
    elif os.path.exists(args.node_graph):  # If one is specified and it exists, use it
        node_graph_out = args.node_graph
    else:  # Otherwise, the specified one doesn't exist
        raise Exception("Provided NodeGraph %s does not exist." % args.node_graph)

    # import and check the intersect nodegraph
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(training_data)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. "
                "Please re-run MakeDNADatabase.py with the -i flag.")
        try:
            intersect_nodegraph = khmer.load_nodegraph(intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has k-mer size %d while the database "
                    "k-mer size is %d" %
                    (intersect_nodegraph_file, intersect_nodegraph.ksize(), ksize))
        except:
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file)

    results_file = os.path.abspath(args.out_csv)
    force = args.force
    fprate = args.fp_rate
    coverage_threshold = args.containment_threshold  # desired coverage cutoff
    confidence = args.confidence  # desired confidence that you got all the organisms with coverage >= desired coverage

    # Get names of training files for use as rows in returned tabular data
    training_file_names = []
    for i in range(len(sketches)):
        training_file_names.append(sketches[i].input_file_name)

    # Only form the Nodegraph if we need to
    global sample_kmers
    if not os.path.exists(node_graph_out) or force is True:
        hll = khmer.HLLCounter(0.01, ksize)
        hll.consume_seqfile(query_file)
        full_kmer_count_estimate = hll.estimate_cardinality()
        res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
        if intersect_nodegraph is None:
            # If no intersect list was given, just populate the bloom filter
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            # sample_kmers.consume_seqfile(query_file)
            rparser = khmer.ReadParser(query_file)
            threads = []
            for _ in range(num_threads):
                cur_thrd = threading.Thread(
                    target=sample_kmers.consume_seqfile_with_reads_parser,
                    args=(rparser,))
                threads.append(cur_thrd)
                cur_thrd.start()
            for thread in threads:
                thread.join()
        else:
            # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list.
            # (WARNING: this will cause the Jaccard index to be calculated in terms of
            # J(query intersect hash_list, training) instead of J(query, training))
            # (TODO: fix this after khmer is updated)
            # intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()
            #   # Doesn't work due to khmer bug
            intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()
            # Not technically correct, but I need to wait until khmer is updated
            if intersect_nodegraph_kmer_count < full_kmer_count_estimate:
                # At max, we have as many k-mers as in the union of the training database
                # (But makes this always return 0)
                res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            else:
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            for record in screed.open(query_file):
                seq = record.sequence
                for i in range(len(seq) - ksize + 1):
                    kmer = seq[i:i + ksize]
                    if intersect_nodegraph.get(kmer) > 0:
                        sample_kmers.add(kmer)
        # Save the sample_kmers
        sample_kmers.save(node_graph_out)
        true_fprate = khmer.calc_expected_collisions(sample_kmers, max_false_pos=0.99)
    else:
        sample_kmers = khmer.load_nodegraph(node_graph_out)
        node_ksize = sample_kmers.ksize()
        if node_ksize != ksize:
            raise Exception(
                "Node graph %s has wrong k-mer size of %d (input was %d). "
                "Try --force or change -k." % (node_graph_out, node_ksize, ksize))
        true_fprate = khmer.calc_expected_collisions(sample_kmers, max_false_pos=0.99)

    # num_sample_kmers = sample_kmers.n_unique_kmers()
    #   # For some reason this only works when creating a new node graph,
    #   # use the following instead
    num_sample_kmers = sample_kmers.n_occupied()

    # Compute all the indices for all the training data
    pool = Pool(processes=num_threads)
    res = pool.map(
        unwrap_compute_indicies,
        zip(sketches, repeat(num_sample_kmers), repeat(true_fprate)))

    # Gather up the results in a nice form
    intersection_cardinalities = np.zeros(len(sketches))
    containment_indexes = np.zeros(len(sketches))
    jaccard_indexes = np.zeros(len(sketches))
    for i in range(len(res)):
        (intersection_cardinality, containment_index, jaccard_index) = res[i]
        intersection_cardinalities[i] = intersection_cardinality
        containment_indexes[i] = containment_index
        jaccard_indexes[i] = jaccard_index

    d = {'intersection': intersection_cardinalities,
         'containment index': containment_indexes,
         'jaccard index': jaccard_indexes}

    # Use only the basenames to label the rows (if requested)
    if base_name is True:
        df = pd.DataFrame(d, map(os.path.basename, training_file_names))
    else:
        df = pd.DataFrame(d, training_file_names)

    # Only get the rows above a certain threshold
    if coverage_threshold <= 0:
        est_threshold = 0
    else:
        est_threshold = threshold_calc(num_hashes, coverage_threshold, fprate, confidence)
    filtered_results = df[df['containment index'] > est_threshold].sort_values(
        'containment index', ascending=False)

    # Export the results
    filtered_results.to_csv(results_file, index=True, encoding='utf-8')

def load(info, dirname):
    new_node = Node(info['factory'], name=info['name'])
    filename = os.path.join(dirname, info['filename'])
    new_node.data = khmer.load_nodegraph(filename)
    return new_node

def main():
    info('find-knots.py', ['graph'])

    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        ":doc:`partitioning-big-data`",
        "http://khmer.readthedocs.org/en/stable/user/"
        "partitioning-big-data.html")
    args = sanitize_help(parser).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
    graph = khmer.load_nodegraph(graphbase)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    graph.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
        graph.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)
    print('---', file=sys.stderr)
    print('output stoptags will be in', graphbase + '.stoptags',
          file=sys.stderr)
    if initial_stoptags:
        print('(these output stoptags will include the already-loaded set)',
              file=sys.stderr)
    print('---', file=sys.stderr)

    # create countgraph
    ksize = graph.ksize()
    counting = khmer_args.create_countgraph(args, ksize=ksize)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print('<-', subset_file, file=sys.stderr)
        subset = graph.load_subset_partitionmap(subset_file)

        print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
        graph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** merging subset... %s' % subset_file, file=sys.stderr)
        graph.merge_subset(subset)

        print('** repartitioning, round 2... %s' % subset_file,
              file=sys.stderr)
        size = graph.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size:', size, file=sys.stderr)

        print('saving stoptags binary', file=sys.stderr)
        graph.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr)

    print('done!', file=sys.stderr)

def load(info):
    data = khmer.load_nodegraph(info['filename'])
    return Leaf(info['metadata'], data, name=info['name'])

def load(info):
    new_node = Node(info['factory'], name=info['name'])
    new_node.data = khmer.load_nodegraph(info['filename'])
    return new_node

def load(info, dirname):
    filepath = os.path.join(dirname, info['filename'])
    data = khmer.load_nodegraph(filepath)
    return Leaf(info['metadata'], data, name=info['name'])

def main():
    info('partition-graph.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()
    basename = args.basename

    filenames = [basename, basename + '.tagset']
    for _ in filenames:
        check_input_files(_, args.force)

    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    if args.stoptags:
        print('stoptag file:', args.stoptags, file=sys.stderr)
    print('--', file=sys.stderr)

    print('loading nodegraph %s' % basename, file=sys.stderr)
    nodegraph = load_nodegraph(basename)
    nodegraph.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps:',
              ' stop_big_traversals is true.', file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.', file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print('starting %d threads' % n_threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker,
                                    args=(worker_q, basename,
                                          stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' % (basename,),
          file=sys.stderr)

def data(self):
    if self._data is None:
        # TODO: what if self._filename is None?
        self._data = khmer.load_nodegraph(self._filename)
    return self._data

def main():
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        ":doc:`partitioning-big-data`",
        "http://khmer.readthedocs.io/en/stable/user/"
        "partitioning-big-data.html")
    args = sanitize_help(parser).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
    graph = khmer.load_nodegraph(graphbase)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    graph.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
        graph.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)
    print('---', file=sys.stderr)
    print('output stoptags will be in', graphbase + '.stoptags',
          file=sys.stderr)
    if initial_stoptags:
        print('(these output stoptags will include the already-loaded set)',
              file=sys.stderr)
    print('---', file=sys.stderr)

    # create countgraph
    ksize = graph.ksize()
    counting = khmer_args.create_countgraph(args, ksize=ksize)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print('<-', subset_file, file=sys.stderr)
        subset = graph.load_subset_partitionmap(subset_file)

        print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
        graph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** merging subset... %s' % subset_file, file=sys.stderr)
        graph.merge_subset(subset)

        print('** repartitioning, round 2... %s' % subset_file,
              file=sys.stderr)
        size = graph.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size:', size, file=sys.stderr)

        print('saving stoptags binary', file=sys.stderr)
        graph.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr)

    print('done!', file=sys.stderr)

def main():
    parser = argparse.ArgumentParser(
        description="This script will create a node graph for a given k-mer size and "
                    "query file (can be used as input to QueryDNADatabase.py)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-fp', '--fp_rate', type=restricted_float,
                        help="False positive rate.", default=0.0001)
    parser.add_argument('-i', '--intersect_nodegraph',
                        help="Location of Node Graph. Will only insert query k-mers in bloom "
                             "filter if they appear anywhere in the training database. "
                             "Note that the Jaccard estimates will now be "
                             "J(query intersect union_i training_i, training_i) instead of "
                             "J(query, training_i), but will use significantly less space "
                             "(unfortunately will also disable threading).")
    parser.add_argument('-k', '--k_size', type=int, help="K-mer size", default=21)
    parser.add_argument('-t', '--threads', type=int, help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('in_file', help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('out_dir', help='Output directory')

    # Parse and check args
    args = parser.parse_args()
    query_file = os.path.abspath(args.in_file)
    ksize = args.k_size
    num_threads = args.threads
    node_graph_out = os.path.join(
        os.path.abspath(args.out_dir),
        os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
    if args.intersect_nodegraph is not None:
        intersect_nodegraph_file = args.intersect_nodegraph
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. "
                "Please re-run MakeDNADatabase.py with the -i flag.")
        try:
            intersect_nodegraph = khmer.load_nodegraph(intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has k-mer size %d while the database "
                    "k-mer size is %d" %
                    (intersect_nodegraph_file, intersect_nodegraph.ksize(), ksize))
        except:
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file)

    fprate = args.fp_rate
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(query_file)
    full_kmer_count_estimate = hll.estimate_cardinality()
    res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
    if intersect_nodegraph is None:
        # If no intersect list was given, just populate the bloom filter
        sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        # sample_kmers.consume_seqfile(query_file)
        rparser = khmer.ReadParser(query_file)
        threads = []
        for _ in range(num_threads):
            cur_thrd = threading.Thread(
                target=sample_kmers.consume_seqfile_with_reads_parser,
                args=(rparser,))
            threads.append(cur_thrd)
            cur_thrd.start()
        for thread in threads:
            thread.join()
    else:
        # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list.
        # (WARNING: this will cause the Jaccard index to be calculated in terms of
        # J(query intersect hash_list, training) instead of J(query, training))
        # (TODO: fix this after khmer is updated)
        # intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()
        #   # Doesn't work due to khmer bug
        intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()  # Doesn't work due to khmer bug
        if intersect_nodegraph_kmer_count < full_kmer_count_estimate:
            # At max, we have as many k-mers as in the union of the training database
            # (But makes this always return 0)
            res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        else:
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        for record in screed.open(query_file):
            seq = record.sequence
            for i in range(len(seq) - ksize + 1):
                kmer = seq[i:i + ksize]
                if intersect_nodegraph.get(kmer) > 0:
                    sample_kmers.add(kmer)

    # Save the sample_kmers
    sample_kmers.save(node_graph_out)