def test_assemble_left_double_fork(self, left_double_fork_structure):
    """Assemble from the end of the contig back through a left double fork.

    Unlabeled, the assembler yields a single path running from the
    high-degree node to the end of the contig.  Once both the contig and
    the branch are labeled across the high-degree nodes, it should yield
    two full-length paths: the contig and the branch.
    """
    graph, contig, _left, HDN, _right, branch = left_double_fork_structure
    labeller = khmer.GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labeller)

    # Seed with the last k-mer of the contig so assembly starts from the end.
    seed = contig[-K:]

    # First pass: no labels, so only HDN-through-end is recovered.
    unlabeled_paths = assembler.assemble(seed)
    assert len(unlabeled_paths) == 1
    assert unlabeled_paths[0] == contig[HDN.pos:]

    # Second pass: label both sequences, then expect both full paths.
    nodes = graph.find_high_degree_nodes(contig)
    nodes += graph.find_high_degree_nodes(branch)
    print(list(nodes))
    for label, sequence in enumerate((contig, branch), start=1):
        labeller.label_across_high_degree_nodes(sequence, nodes, label)
    print(labeller.get_tag_labels(list(nodes)[0]))

    labeled_paths = assembler.assemble(seed)
    assert len(labeled_paths) == 2
    for expected in (contig, branch):
        assert any(utils._equals_rc(found, expected)
                   for found in labeled_paths)
def test_hash_as_seed(self, linear_structure):
    """A k-mer hash (rather than a string) is accepted as the seed."""
    graph, contig = linear_structure
    labeller = khmer.GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labeller)

    # Hash the first k-mer of the contig and assemble from that hash.
    seed_hash = graph.hash(contig[:K])
    assembled = assembler.assemble(seed_hash).pop()

    assert utils._equals_rc(assembled, contig)
def test_assemble_tandem_repeats(self, tandem_repeat_structure):
    """Assembling inside a tandem repeat yields exactly one repeat copy."""
    graph, repeat, _tandem_repeats = tandem_repeat_structure
    labeller = khmer.GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labeller)

    paths = assembler.assemble(repeat[:K])
    assert len(paths) == 1

    # K-1 k-mers span the junction between the end of the repeat and its
    # beginning, so the path is that much longer than one repeat unit.
    expected_length = len(repeat) + K - 1
    assert len(paths[0]) == expected_length
def test_assemble_snp_bubble_single(self, snp_bubble_structure):
    """With only the wildtype labeled, only the wildtype path is assembled."""
    graph, wildtype, _mutant, _hdn_l, _hdn_r = snp_bubble_structure
    labeller = khmer.GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labeller)

    # The bubble has one high-degree node at each end.
    nodes = graph.find_high_degree_nodes(wildtype)
    assert len(nodes) == 2
    labeller.label_across_high_degree_nodes(wildtype, nodes, 1)

    paths = assembler.assemble(wildtype[:K])
    assert len(paths) == 1

    only_path = paths[0]
    assert utils._equals_rc(only_path, wildtype)
def test_beginning_to_end_across_tip(self, right_tip_structure):
    """Labels carry assembly of the whole contig past a tip's branch point."""
    graph, contig, _left, _hdn, _right, _tip = right_tip_structure
    labeller = khmer.GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labeller)

    # L, HDN, and R will all be labeled with 1.
    nodes = graph.find_high_degree_nodes(contig)
    labeller.label_across_high_degree_nodes(contig, nodes, 1)

    paths = assembler.assemble(contig[:K])
    assert len(paths) == 1, "there should only be one path"

    assembled = paths[0]  # @CTB
    assert len(assembled) == len(contig)
    assert utils._equals_rc(assembled, contig)
def test_assemble_snp_bubble_both(self, snp_bubble_structure):
    """With both alleles labeled, both sides of the bubble are assembled."""
    graph, wildtype, mutant, _hdn_l, _hdn_r = snp_bubble_structure
    labeller = khmer.GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labeller)

    # Both alleles share the same two high-degree nodes.
    nodes = graph.find_high_degree_nodes(wildtype)
    nodes += graph.find_high_degree_nodes(mutant)
    assert len(nodes) == 2

    for label, allele in enumerate((wildtype, mutant), start=1):
        labeller.label_across_high_degree_nodes(allele, nodes, label)

    paths = assembler.assemble(wildtype[:K])
    assert len(paths) == 2
    for allele in (wildtype, mutant):
        assert any(utils._contains_rc(allele, found) for found in paths)
def test_assemble_right_double_fork(self, right_double_fork_structure):
    """Two labeled sequences through a right double fork give two contigs."""
    graph, contig, _left, _hdn, _right, branch = right_double_fork_structure
    labeller = khmer.GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labeller)

    # Collect the high-degree nodes seen along either sequence.
    nodes = graph.find_high_degree_nodes(contig)
    nodes += graph.find_high_degree_nodes(branch)
    print(list(nodes))

    for label, sequence in enumerate((contig, branch), start=1):
        labeller.label_across_high_degree_nodes(sequence, nodes, label)
    print(labeller.get_tag_labels(list(nodes)[0]))

    paths = assembler.assemble(contig[:K])
    print('Path lengths', [len(x) for x in paths])

    assert len(paths) == 2
    for expected in (contig, branch):
        assert any(utils._equals_rc(found, expected) for found in paths)
def test_assemble_snp_bubble_stopbf(self, snp_bubble_structure):
    """A stop filter blocks one labeled side of a bubble.

    Both branches are labeled, but the mutant side is entered into the
    stop filter; the resulting filter failure should negate the label
    spanning so that only the wildtype path is assembled.
    """
    graph, wildtype, mutant, HDN_L, _hdn_r = snp_bubble_structure
    stop_filter = khmer.Nodegraph(K, 1e5, 4)
    labeller = khmer.GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labeller,
                                             stop_filter=stop_filter)

    nodes = graph.find_high_degree_nodes(wildtype)
    nodes += graph.find_high_degree_nodes(mutant)
    assert len(nodes) == 2

    # Do the labeling on both alleles...
    for label, allele in enumerate((wildtype, mutant), start=1):
        labeller.label_across_high_degree_nodes(allele, nodes, label)

    # ...but block the mutant with the stop filter: count the k-mer that
    # starts one base past the left high-degree node's position.
    blocked_start = HDN_L.pos + 1
    stop_filter.count(mutant[blocked_start:blocked_start + K])

    paths = assembler.assemble(wildtype[:K])
    assert len(paths) == 1
    assert any(utils._equals_rc(found, wildtype) for found in paths)
def test_assemble_right_triple_fork(self, right_triple_fork_structure):
    """Three labeled sequences through a triple fork give three contigs."""
    (graph, contig, _left, _hdn, _right,
     top_sequence, bottom_sequence) = right_triple_fork_structure
    labeller = khmer.GraphLabels(graph)
    assembler = khmer.SimpleLabeledAssembler(labeller)

    # Label order matters: contig -> 1, top -> 2, bottom -> 3.
    sequences = (contig, top_sequence, bottom_sequence)

    nodes = graph.find_high_degree_nodes(contig)
    for extra in sequences[1:]:
        nodes += graph.find_high_degree_nodes(extra)
    print(list(nodes))

    for label, sequence in enumerate(sequences, start=1):
        labeller.label_across_high_degree_nodes(sequence, nodes, label)
    print(labeller.get_tag_labels(list(nodes)[0]))

    paths = assembler.assemble(contig[:K])
    print([len(x) for x in paths])

    assert len(paths) == 3
    for expected in sequences:
        assert any(utils._equals_rc(found, expected) for found in paths)
def main():
    """Sort query reads into one '.sweep' output file per database file.

    Each file in ``args.db`` is consumed into a labeled graph under its own
    integer label (its index in ``args.db``) and gets an output file named
    ``<outdir>/<output_prefix><basename>.sweep`` fronted by an ``IODeque``.
    Every read in ``args.query`` is then swept against the graph and appended
    to the queue of every label it intersects.  Progress and summary counts
    are written to stderr.

    Exits with status 1 if the outputs or database inputs cannot be set up
    (the original printed "exiting..." but kept running).  All output files
    are closed before returning.
    """
    # info('sweep-files.py', ['sweep'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    # Raise the table size and k-mer size to the script's minimums.
    if args.max_tablesize < MIN_HSIZE:
        args.max_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, graphtype='nodegraph')

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range

    outputs = {}    # label -> IODeque feeding that label's output file
    out_files = []  # every output handle we opened, so we can close them

    # Consume the database files and assign each a unique label in the
    # de Bruijn graph; open a file and output queue for each file as well.
    ht = khmer.GraphLabels(K, HT_SIZE, N_HT)
    try:
        try:
            print('consuming and labeling input sequences...',
                  file=sys.stderr)

            for i, dbfile in enumerate(args.db):
                name = args.output_prefix + os.path.basename(dbfile)
                outfp = open(os.path.join(args.outdir, name) + '.sweep', 'wb')
                out_files.append(outfp)
                outputs[i] = IODeque(args.max_queue_size, outfp)

                db_fp = screed.open(dbfile)
                try:
                    for n, record in enumerate(db_fp):
                        if n % 50000 == 0:
                            print('...consumed {n} sequences...'.format(n=n),
                                  file=sys.stderr)
                        ht.consume_sequence_and_tag_with_labels(
                            record.sequence, i)
                finally:
                    db_fp.close()

        except (IOError, OSError) as e:
            print('!! ERROR: !!', e, file=sys.stderr)
            print('...error setting up outputs. exiting...', file=sys.stderr)
            # Actually exit, as the message says; sweeping against a
            # partially built graph/output set would give misleading results.
            sys.exit(1)

        print('done consuming input sequence. '
              'added {t} tags and {l} labels...'.format(
                  t=ht.n_tags(), l=ht.n_labels()), file=sys.stderr)

        n_orphaned = 0
        n_labeled = 0
        n_mlabeled = 0

        # Iterate through all the reads and check for the labels with which
        # they intersect. Queue to the corresponding label when found.
        for read_file in args.query:
            print('** sweeping {read_file} for labels...'.format(
                read_file=read_file), file=sys.stderr)
            try:
                read_fp = screed.open(read_file)
            except IOError as error:
                print('!! ERROR: !!', error, file=sys.stderr)
                print('*** Could not open {fn}, skipping...'.format(
                    fn=read_file), file=sys.stderr)
                continue

            try:
                for n, record in enumerate(read_fp):
                    if n % 50000 == 0 and n > 0:
                        print('\tswept {n} reads [{nc} labeled, '
                              '{no} orphaned]'.format(
                                  n=n, nc=n_labeled, no=n_orphaned),
                              file=sys.stderr)
                    seq = record.sequence
                    try:
                        labels = ht.sweep_label_neighborhood(
                            seq, traversal_range)
                    except ValueError:
                        # sweep_label_neighborhood throws a ValueError when
                        # len(seq) < K. Deliberately skip such reads; they
                        # count as neither labeled nor orphaned.
                        continue

                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            n_mlabeled += 1
                        for label in labels:
                            outputs[label].append(record)
                    else:
                        n_orphaned += 1
                print('** End of file {fn}...'.format(fn=read_file),
                      file=sys.stderr)
            finally:
                # Close the reader even if sweeping this file failed.
                read_fp.close()

        # gotta output anything left in the buffers at the end!
        print('** End of run...', file=sys.stderr)
        for q in outputs.values():
            q.clear()

        print('swept {n_reads}...'.format(n_reads=n_labeled + n_orphaned),
              file=sys.stderr)
        print('...with {nc} labeled and {no} orphaned'.format(
            nc=n_labeled, no=n_orphaned), file=sys.stderr)
        print('...and {nmc} multilabeled'.format(nmc=n_mlabeled),
              file=sys.stderr)
    finally:
        # Runs on normal completion, on sys.exit() and on errors alike.
        for fp in out_files:
            fp.close()
def main():
    """Label a graph from ``args.input_fastp`` and sort reads by label.

    The graph is labeled in one of three ways: by partition id
    (``args.label_by_pid``), one label per sequence (``args.label_by_seq``),
    or, by default, one label per ``args.group_size`` consecutive sequences,
    in which case each group is also written to ``<prefix>_base_<g>.<ext>``.
    Every read in ``args.input_files`` is then swept against the graph and
    queued on the ``ReadBufferManager`` under its single label, under
    ``'multi'`` when it hits several labels, or under ``'orphaned'`` when it
    hits none.  Finally the per-read label counts are written to
    ``<outdir>/<prefix>.dist.txt`` and the per-label read counts to
    ``<outdir>/<prefix>.counts.csv``.

    Exits with status 1 if the labeling input cannot be consumed or split
    (the original printed "exiting..." but kept running).
    """
    info('sweep-reads-buffered.py', ['sweep'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    # NOTE(review): MAX_HSIZE is applied here as a lower bound on the table
    # size, which its name does not suggest -- confirm against its definition.
    if args.max_tablesize < MAX_HSIZE:
        args.max_tablesize = MAX_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, graphtype='nodegraph')

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    # Default the output directory to wherever the labeling input lives.
    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_input_files(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first record of the
    # first file.  An empty file falls back to FASTA instead of raising
    # StopIteration, and the reader is closed once we have peeked.
    first_fp = screed.open(args.input_files[0])
    try:
        record = next(iter(first_fp), None)
    finally:
        first_fp.close()
    extension = 'fq' if hasattr(record, 'quality') else 'fa'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.GraphLabels(K, HT_SIZE, N_HT)
    try:
        print('consuming input sequences...', file=sys.stderr)
        if args.label_by_pid:
            print('...labeling by partition id (pid)', file=sys.stderr)
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print('...labeling by sequence', file=sys.stderr)
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print('...consumed {n} sequences...'.format(n=n),
                          file=sys.stderr)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print('...labeling to create groups of size {s}'.format(
                s=args.group_size), file=sys.stderr)
            label = -1
            g = 0
            outfp = None
            try:
                # Group files are named from the prefix only; outdir is not
                # applied to them.
                outfp = open('{pref}_base_{g}.{ext}'.format(
                    pref=output_pref, g=g, ext=extension), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        # Start of a new group: bump the label and, for
                        # every group after the first, rotate output files.
                        label += 1
                        if label > g:
                            g = label
                            outfp.close()
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g, ext=extension), 'wb')
                    if n % 50000 == 0:
                        print('...consumed {n} sequences...'.format(n=n),
                              file=sys.stderr)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)
                    write_record(record, outfp)
            except (IOError, OSError) as e:
                print('!! ERROR !!', e, file=sys.stderr)
                print('...error splitting input. exiting...',
                      file=sys.stderr)
                sys.exit(1)
            finally:
                # Closing an already-closed file is a no-op, so this is safe
                # even if a rotation failed between close() and open().
                if outfp is not None:
                    outfp.close()

    except (IOError, OSError) as e:
        print('!! ERROR: !!', e, file=sys.stderr)
        print('...error consuming {i}. exiting...'.format(i=input_fastp),
              file=sys.stderr)
        sys.exit(1)

    # Status output belongs on stderr like every other progress message.
    print('done consuming input sequence. '
          'added {t} tags and {l} labels...'.format(
              t=ht.graph.n_tags(), l=ht.n_labels()), file=sys.stderr)

    label_dict = defaultdict(int)  # label (or 'multi'/'orphaned') -> reads
    label_number_dist = []         # number of labels hit by each swept read

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    start_t = time.perf_counter()
    for read_file in args.input_files:
        print('** sweeping {read_file} for labels...'.format(
            read_file=read_file), file=sys.stderr)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except (IOError, OSError) as error:
            print('!! ERROR: !!', error, file=sys.stderr)
            print('*** Could not open {fn}, skipping...'.format(
                fn=read_file), file=sys.stderr)
            continue

        try:
            for n, record in enumerate(read_fp):
                if n % 50000 == 0:
                    end_t = time.perf_counter()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print('\tswept {n} reads [{nc} labeled, {no} orphaned] '
                          '** {sec}s ({sect}s total)'.format(
                              n=n, nc=n_labeled, no=n_orphaned,
                              sec=batch_t, sect=file_t), file=sys.stderr)
                    start_t = time.perf_counter()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError:
                    # Deliberate best-effort: reads the sweep rejects are
                    # skipped and counted nowhere.
                    continue

                if hasattr(record, 'quality'):
                    seq_str = fmt_fastq(name, seq, record.quality, labels)
                else:
                    seq_str = fmt_fasta(name, seq, labels)
                label_number_dist.append(len(labels))

                if labels:
                    n_labeled += 1
                    if len(labels) > 1:
                        output_buffer.queue(seq_str, 'multi')
                        n_mlabeled += 1
                        label_dict['multi'] += 1
                    else:
                        output_buffer.queue(seq_str, labels[0])
                        label_dict[labels[0]] += 1
                else:
                    n_orphaned += 1
                    output_buffer.queue(seq_str, 'orphaned')
                    label_dict['orphaned'] += 1

            print('** End of file {fn}...'.format(fn=read_file),
                  file=sys.stderr)
            output_buffer.flush_all()
        finally:
            # Close the reader even if sweeping this file failed.
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print('** End of run...', file=sys.stderr)
    output_buffer.flush_all()

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print('! WARNING: Sweep finished with errors !', file=sys.stderr)
        print('** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors), file=sys.stderr)
        print('** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors), file=sys.stderr)

    print('swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned), file=sys.stderr)
    print('...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned), file=sys.stderr)
    print('...and {nmc} multilabeled'.format(nmc=n_mlabeled), file=sys.stderr)

    print('** outputting label number distribution...', file=sys.stderr)
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'w', encoding='utf-8') as outfp:
        outfp.writelines('{nc}\n'.format(nc=nc) for nc in label_number_dist)

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print('** outputting label read counts...', file=sys.stderr)
    with open(fn, 'w', encoding='utf-8') as outfp:
        for label, count in label_dict.items():
            outfp.write('{l},{c}\n'.format(l=label, c=count))