def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()
    report_on_config(args)

    report_fp = args.report

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0
    input_filename = None

    for index, input_filename in enumerate(args.input_filenames):
        if args.single_output_filename != '':
            output_name = args.single_output_filename
            outfp = open(args.single_output_filename, 'a')
        else:
            output_name = os.path.basename(input_filename) + '.keep'
            outfp = open(output_name, 'w')

        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(input_filename,
                                                           outfp, htable,
                                                           args, report_fp)
        except IOError as err:
            handle_error(err, output_name, input_filename, args.fail_save,
                         htable)
            if not args.force:
                print >> sys.stderr, '** Exiting!'
                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%' \
                    .format(inp=input_filename, kept=total - discarded,
                            total=total,
                            perc=int(100. - discarded / float(total) * 100.))
                print 'output in', output_name

        if (args.dump_frequency > 0 and
                index > 0 and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"

        if not args.force:
            sys.exit(1)
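# Editor's sketch (not part of the original script): the normalize_by_median()
# helper called above is defined elsewhere in this module. Below is a minimal
# sketch of the per-read decision it presumably makes, assuming khmer's
# CountingHash API as of khmer 1.x (get_median_count() returning a
# (median, average, stddev) tuple, and consume()). The function name and the
# cutoff default of 20 are illustrative only.

def _keep_read_sketch(htable, record, cutoff=20):
    seq = record.sequence.replace('N', 'A')
    med, _, _ = htable.get_median_count(seq)
    if med < cutoff:
        htable.consume(seq)  # only retained reads update the counts
        return True
    return False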
def main():
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = ix.next()
    del ix

    extension = 'fa'
    if hasattr(record, 'accuracy'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >>sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            print >>sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >>sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print >>sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            label = -1
            g = 0

            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(
                    pref=output_pref, g=g, ext=extension), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g, ext=extension), 'wb')
                    if n % 50000 == 0:
                        print >>sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)

                    if hasattr(record, 'accuracy'):
                        # note: a valid FASTQ record needs the separator on
                        # its own line: @name / seq / + / quality
                        outfp.write('@{name}\n{seq}\n+\n{accuracy}\n'.format(
                            name=record.name,
                            seq=record.sequence,
                            accuracy=record.accuracy))
                    else:
                        outfp.write('>{name}\n{seq}\n'.format(
                            name=record.name,
                            seq=record.sequence))
            except IOError as e:
                print >>sys.stderr, '!! ERROR !!', e
                print >>sys.stderr, '...error splitting input. exiting...'

    except IOError as e:
        print >>sys.stderr, '!! ERROR: !!', e
        print >>sys.stderr, \
            '...error consuming {i}. exiting...'.format(i=input_fastp)

    print >>sys.stderr, 'done consuming input sequence. ' \
        'added {t} tags and {l} labels...'.format(t=ht.n_tags(),
                                                  l=ht.n_labels())

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >>sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >>sys.stderr, '!! ERROR: !!', error
            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, \
                        '\tswept {n} reads [{nc} labeled, {no} orphaned] ' \
                        '** {sec}s ({sect}s total)'.format(
                            n=_, nc=n_labeled, no=n_orphaned,
                            sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError:
                    # read too short to sweep (e.g. shorter than K)
                    pass
                else:
                    if hasattr(record, 'accuracy'):
                        seq_str = fmt_fastq(name, seq, record.accuracy,
                                            labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >>sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or \
       output_buffer.num_file_errors > 0:
        print >>sys.stderr, '! WARNING: Sweep finished with errors !'
        print >>sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >>sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >>sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)

    print >>sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >>sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
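# Editor's sketch (not part of the original script): the fmt_fasta() and
# fmt_fastq() helpers used in the sweep loop above are defined elsewhere in
# this script. Hypothetical definitions consistent with their call sites;
# the tab-separated label annotation appended to the read name is an
# assumption about the output format, not a confirmed detail.

def fmt_fasta(name, seq, labels=()):
    return '>{name}\t{labels}\n{seq}\n'.format(
        name=name, labels='\t'.join([str(l) for l in labels]), seq=seq)


def fmt_fastq(name, seq, accuracy, labels=()):
    return '@{name}\t{labels}\n{seq}\n+\n{acc}\n'.format(
        name=name, labels='\t'.join([str(l) for l in labels]),
        seq=seq, acc=accuracy)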
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('saturate-by-median.py', ['diginorm'])
    args = get_parser().parse_args()
    report_on_config(args)

    report_fp = args.report
    report_frequency = args.report_frequency

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0

    for index, input_filename in enumerate(args.input_filenames):
        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(
                input_filename, htable, args, report_fp, report_frequency)
        except IOError as err:
            handle_error(err, input_filename)
            if not args.force:
                print >> sys.stderr, '** Exiting!'
                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%' \
                    .format(inp=input_filename, kept=total - discarded,
                            total=total,
                            perc=int(100. - discarded / float(total) * 100.))

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"

        sys.exit(1)
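# Editor's sketch (not part of the original script): the
# MAX_FALSE_POSITIVE_RATE check above relies on
# khmer.calc_expected_collisions(). A back-of-the-envelope version of the
# same estimate, assuming Bloom-filter-style behavior: with n_tables
# independent tables of roughly min_tablesize slots each, a novel k-mer is a
# false positive only if it collides in every table, so the rate is about
# the per-table occupancy raised to the number of tables. The function name
# and the linear-occupancy approximation are illustrative only.

def _approx_fp_rate(n_unique_kmers, tablesize, n_tables):
    occupancy = min(1.0, float(n_unique_kmers) / tablesize)
    return occupancy ** n_tables

# e.g. 5e8 unique k-mers in 4 tables of 1e9 slots each -> 0.5 ** 4 = 0.0625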
def main(): info("sweep-reads-buffered.py", ["sweep"]) parser = get_parser() args = parser.parse_args() if args.min_tablesize < MIN_HSIZE: args.min_tablesize = MIN_HSIZE if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE report_on_config(args, hashtype="hashbits") K = args.ksize HT_SIZE = args.min_tablesize N_HT = args.n_tables traversal_range = args.traversal_range input_fastp = args.input_fastp if not args.outdir: outdir = os.path.dirname(input_fastp) else: outdir = args.outdir max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size max_reads = args.max_reads check_file_status(args.input_fastp) check_valid_file_exists(args.input_files) all_input_files = [input_fastp] all_input_files.extend(args.input_files) # Check disk space availability check_space(all_input_files) output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir) # consume the partitioned fasta with which to label the graph ht = khmer.LabelHash(K, HT_SIZE, N_HT) try: print >>sys.stderr, "consuming input sequences..." if args.label_by_pid: print >>sys.stderr, "...labeling by partition id (pid)" ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) elif args.label_by_seq: print >>sys.stderr, "...labeling by sequence" for n, record in enumerate(screed.open(input_fastp)): if n % 50000 == 0: print >>sys.stderr, "...consumed {n} sequences...".format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, n) else: print >>sys.stderr, "...labeling to create groups of size {s}".format(s=args.group_size) label = -1 g = 0 try: outfp = open("{pref}_base_{g}.fa".format(pref=output_pref, g=g), "wb") for n, record in enumerate(screed.open(input_fastp)): if n % args.group_size == 0: label += 1 if label > g: g = label outfp = open("{pref}_base_{g}.fa".format(pref=output_pref, g=g), "wb") if n % 50000 == 0: print >>sys.stderr, "...consumed {n} sequences...".format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, label) outfp.write(">{name}\n{seq}\n".format(name=record.name, seq=record.sequence)) except IOError as e: print >>sys.stderr, "!! ERROR !!", e print >>sys.stderr, "...error splitting input. exiting..." except IOError as e: print >>sys.stderr, "!! ERROR: !!", e print >>sys.stderr, "...error consuming \ {i}. exiting...".format( i=input_fastp ) print >>sys.stderr, "done consuming input sequence. \ added {t} tags and {l} \ labels...".format( t=ht.n_tags(), l=ht.n_labels() ) label_dict = defaultdict(int) label_number_dist = [] n_orphaned = 0 n_labeled = 0 n_mlabeled = 0 total_t = time.clock() start_t = time.clock() for read_file in args.input_files: print >>sys.stderr, "** sweeping {read_file} for labels...".format(read_file=read_file) file_t = 0.0 try: read_fp = screed.open(read_file) except IOError as error: print >>sys.stderr, "!! 
ERROR: !!", error print >>sys.stderr, "*** Could not open {fn}, skipping...".format(fn=read_file) else: for _, record in enumerate(read_fp): if _ % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t print >>sys.stderr, "\tswept {n} reads [{nc} labeled, \ {no} orphaned] \ ** {sec}s ({sect}s total)".format( n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t ) start_t = time.clock() seq = record.sequence name = record.name try: labels = ht.sweep_label_neighborhood(seq, traversal_range) except ValueError as e: pass else: seq_str = fmt_fasta(name, seq, labels) label_number_dist.append(len(labels)) if labels: n_labeled += 1 if len(labels) > 1: output_buffer.queue(seq_str, "multi") n_mlabeled += 1 label_dict["multi"] += 1 else: output_buffer.queue(seq_str, labels[0]) label_dict[labels[0]] += 1 else: n_orphaned += 1 output_buffer.queue(seq_str, "orphaned") label_dict["orphaned"] += 1 print >>sys.stderr, "** End of file {fn}...".format(fn=read_file) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! print >>sys.stderr, "** End of run..." output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print >>sys.stderr, "! WARNING: Sweep finished with errors !" print >>sys.stderr, "** {writee} reads not written".format(writee=output_buffer.num_write_errors) print >>sys.stderr, "** {filee} errors opening files".format(filee=output_buffer.num_file_errors) print >>sys.stderr, "swept {n_reads} for labels...".format(n_reads=n_labeled + n_orphaned) print >>sys.stderr, "...with {nc} labeled and {no} orphaned".format(nc=n_labeled, no=n_orphaned) print >>sys.stderr, "...and {nmc} multilabeled".format(nmc=n_mlabeled) print >>sys.stderr, "** outputting label number distribution..." fn = os.path.join(outdir, "{pref}.dist.txt".format(pref=output_pref)) with open(fn, "wb") as outfp: for nc in label_number_dist: outfp.write("{nc}\n".format(nc=nc)) fn = os.path.join(outdir, "{pref}.counts.csv".format(pref=output_pref)) print >>sys.stderr, "** outputting label read counts..." with open(fn, "wb") as outfp: for k in label_dict: outfp.write("{l},{c}\n".format(l=k, c=label_dict[k]))