def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_file_status(infile)

    check_space([args.ptfile, args.fafile])

    print 'loading k-mer presence table from', args.ptfile
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        to_print = str(list_curve[100 + i]) + ' ' + str(list_curve[i]) + '\n'
        f_curve_obj.write(to_print)
def main():
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])

    ksize = args.ksize
    htable = khmer.new_hashbits(ksize, 1, 1)

    for _ in pmap_files:
        check_file_status(_)

    check_space(pmap_files)

    for pmap_file in pmap_files:
        print 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    print 'saving merged to', output_file
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    htable = khmer.new_hashbits(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_file_status(partitionmap_file)
    for _ in filenames:
        check_file_status(_)

    check_space(filenames)

    print 'loading partition map from:', partitionmap_file
    htable.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (part_count, infile)
        print 'partitions are in', outfile
def main():
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    htfile = args.ctfile
    input_filename = args.input
    output_filename = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print 'loading k-mer counting table from', htfile
    htable = khmer.load_counting_hash(htfile)
    ksize = htable.ksize()

    print 'writing to', output_filename
    output = open(output_filename, 'w')

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        if ksize <= len(seq):
            medn, ave, stdev = htable.get_median_count(seq)
            print >> output, record.name, medn, ave, stdev, len(seq)
def main():
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    infiles = [args.input_counting_table_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print('hashtable from', args.input_counting_table_filename)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size)
    print('HT sizes:', hashsizes)
    print('outputting to', args.output_histogram_filename)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename, file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' %
              args.output_histogram_filename)

    print('preparing hist...')
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    hash_fp = open(args.output_histogram_filename, 'w')

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print(_, i, sofar, round(frac, 3), file=hash_fp)

        if sofar == total:
            break
def main():
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])

    ksize = args.ksize
    htable = khmer.new_hashbits(ksize, 1, 1)

    for _ in pmap_files:
        check_file_status(_)

    check_space(pmap_files)

    for pmap_file in pmap_files:
        print >> sys.stderr, 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    print >> sys.stderr, 'saving merged to', output_file
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print >> sys.stderr, 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def main(): info("filter-abund-single.py", ["counting"]) args = get_parser().parse_args() check_file_status(args.datafile) check_space([args.datafile]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) report_on_config(args) config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) print "making k-mer counting table" htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile, args.threads) threads = [] print "consuming input, round 1 --", args.datafile for _ in xrange(args.threads): cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,)) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() fp_rate = khmer.calc_expected_collisions(htable) print "fp rate estimated to be %1.3f" % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop print "filtering", args.datafile outfile = os.path.basename(args.datafile) + ".abundfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) print "output in", outfile if args.savetable: print "Saving k-mer counting table filename", args.savetable print "...saving to", args.savetable htable.save(args.savetable)
def main():
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile
    check_file_status(infile, args.force)
    filenames = [infile]
    check_space(filenames, args.force)

    out1 = os.path.basename(infile) + '.1'
    out2 = os.path.basename(infile) + '.2'
    fp_out1 = open(out1, 'w')
    fp_out2 = open(out2, 'w')

    # is input file FASTQ or FASTA? Determine.
    is_fastq = False
    record = iter(screed.open(infile)).next()

    if hasattr(record, 'accuracy'):
        is_fastq = True

    counter1 = 0
    counter2 = 0
    index = None

    for index, record in enumerate(screed.open(infile)):
        if index % 100000 == 0:
            print >> sys.stderr, '...', index

        name = record.name
        if name.endswith('/1'):
            if is_fastq:
                print >> fp_out1, '@%s\n%s\n+\n%s' % (
                    record.name, record.sequence, record.accuracy)
            else:
                print >> fp_out1, '>%s\n%s' % (
                    record.name, record.sequence,)
            counter1 += 1
        elif name.endswith('/2'):
            if is_fastq:
                print >> fp_out2, '@%s\n%s\n+\n%s' % (
                    record.name, record.sequence, record.accuracy)
            else:
                print >> fp_out2, '>%s\n%s' % (
                    record.name, record.sequence,)
            counter2 += 1

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (index + 1, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
def main():
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for _ in args.infiles:
        check_file_status(_, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    fail = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True

    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    counter = 0
    for read1, read2 in itertools.izip(screed.open(s1_file),
                                       screed.open(s2_file)):
        if counter % 100000 == 0:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        name1 = read1.name
        if not name1.endswith('/1'):
            name1 += '/1'

        name2 = read2.name
        if not name2.endswith('/2'):
            name2 += '/2'

        assert name1[:-2] == name2[:-2], \
            "This doesn't look like paired data! %s %s" % (name1, name2)

        read1.name = name1
        read2.name = name2

        args.output.write(output_pair(read1, read2))

    print >> sys.stderr, 'final: interleaved %d pairs' % counter
    print >> sys.stderr, 'output written to', args.output
def main():
    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print >>sys.stderr, 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')

    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def main():
    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')

    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def main():
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile
    check_file_status(infile)
    filenames = [infile]
    check_space(filenames)

    out1 = os.path.basename(infile) + '.1'
    out2 = os.path.basename(infile) + '.2'
    fp_out1 = open(out1, 'w')
    fp_out2 = open(out2, 'w')

    # is input file FASTQ or FASTA? Determine.
    is_fastq = False
    record = iter(screed.open(infile)).next()

    if hasattr(record, 'accuracy'):
        is_fastq = True

    counter1 = 0
    counter2 = 0
    index = None

    for index, record in enumerate(screed.open(infile)):
        if index % 100000 == 0:
            print >> sys.stderr, '...', index

        name = record.name
        if name.endswith('/1'):
            if is_fastq:
                print >> fp_out1, '@%s\n%s\n+\n%s' % (record.name,
                                                      record.sequence,
                                                      record.accuracy)
            else:
                print >> fp_out1, '>%s\n%s' % (record.name, record.sequence,)
            counter1 += 1
        elif name.endswith('/2'):
            if is_fastq:
                print >> fp_out2, '@%s\n%s\n+\n%s' % (record.name,
                                                      record.sequence,
                                                      record.accuracy)
            else:
                print >> fp_out2, '>%s\n%s' % (record.name, record.sequence,)
            counter2 += 1

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (index + 1, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print "K:", ksize

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
def main(): info("filter-stoptags.py", ["graph"]) args = get_parser().parse_args() stoptags = args.stoptags_file infiles = args.input_filenames for _ in infiles: check_file_status(_) check_space(infiles) print "loading stop tags, with K", args.ksize htable = khmer.new_hashbits(args.ksize, 1, 1) htable.load_stop_tags(stoptags) def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None trim_seq, trim_at = htable.trim_on_stoptags(seq) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".stopfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_file_status(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
def main():
    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(args.n_threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print 'mid-save', base
            htable.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'
def main(): # pylint: disable=too-many-branches,too-many-statements info('saturate-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report report_frequency = args.report_frequency check_valid_file_exists(args.input_filenames) check_space(args.input_filenames) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) # list to save error files along with throwing exceptions if args.force: corrupt_files = [] if args.loadtable: print 'loading k-mer counting table from', args.loadtable htable = khmer.load_counting_hash(args.loadtable) else: print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) total = 0 discarded = 0 for index, input_filename in enumerate(args.input_filenames): total_acc = 0 discarded_acc = 0 try: total_acc, discarded_acc = normalize_by_median( input_filename, htable, args, report_fp, report_frequency) except IOError as err: handle_error(err, input_filename) if not args.force: print >> sys.stderr, '** Exiting!' sys.exit(1) else: print >> sys.stderr, '*** Skipping error file, moving on...' corrupt_files.append(input_filename) else: if total_acc == 0 and discarded_acc == 0: print 'SKIPPED empty file', input_filename else: total += total_acc discarded += discarded_acc print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ .format(inp=input_filename, kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.)) if args.savetable: print 'Saving k-mer counting table through', input_filename print '...saving to', args.savetable htable.save(args.savetable) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) if args.force and len(corrupt_files) > 0: print >> sys.stderr, "** WARNING: Finished with errors!" print >> sys.stderr, "** IOErrors occurred in the following files:" print >> sys.stderr, "\t", " ".join(corrupt_files) if fp_rate > MAX_FALSE_POSITIVE_RATE: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " for this data set. Increase tablesize/# " "tables.") print >> sys.stderr, "**" print >> sys.stderr, "** Do not use these results!!" sys.exit(1)
def main(): info('load-into-counting.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name) check_space(args.input_sequence_filename) check_space_for_hashtable(args.n_tables * args.min_tablesize) print >> sys.stderr, 'Saving k-mer counting table to %s' % base print >> sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames) # clobber the '.info' file now, as we always open in append mode below if os.path.exists(base + '.info'): os.remove(base + '.info') print >> sys.stderr, 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) htable.set_use_bigcount(args.bigcount) config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) filename = None for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, args.threads) threads = [] print >> sys.stderr, 'consuming input', filename for _ in xrange(args.threads): cur_thrd = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thrd) cur_thrd.start() for _ in threads: _.join() if index > 0 and index % 10 == 0: check_space_for_hashtable(args.n_tables * args.min_tablesize) print >> sys.stderr, 'mid-save', base htable.save(base) with open(base + '.info', 'a') as info_fh: print >> info_fh, 'through', filename n_kmers = htable.n_unique_kmers() if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers:', n_kmers with open(base + '.info', 'a') as info_fp: print >> info_fp, 'Total number of unique k-mers:', n_kmers print >> sys.stderr, 'saving', base htable.save(base) fp_rate = khmer.calc_expected_collisions(htable) with open(base + '.info', 'a') as info_fp: print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate if args.summary_info: mr_fmt = args.summary_info.lower() mr_file = base + '.info.' + mr_fmt print >> sys.stderr, "Writing summmary info to", mr_file with open(mr_file, 'w') as mr_fh: if mr_fmt == 'json': mr_data = { "ht_name": os.path.basename(base), "fpr": fp_rate, "num_kmers": n_kmers, "files": filenames, "mrinfo_version": "0.1.0", } json.dump(mr_data, mr_fh) mr_fh.write('\n') elif mr_fmt == 'tsv': mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n") mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format( b=os.path.basename(base), fpr=fp_rate, k=n_kmers, fls=";".join(filenames))) print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate # Change 0.2 only if you really grok it. HINT: You don't. if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, "** ERROR: the k-mer counting table is too small", print >> sys.stderr, "for this data set. Increase tablesize/# tables." print >> sys.stderr, "**" sys.exit(1) print >> sys.stderr, 'DONE.' print >> sys.stderr, 'wrote to:', base + '.info'
def main():
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = ix.next()
    del ix

    extension = 'fa'
    if hasattr(record, 'accuracy'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >> sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            print >> sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >> sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >> sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print >> sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(
                    pref=output_pref, g=g, ext=extension), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g, ext=extension), 'wb')
                    if n % 50000 == 0:
                        print >> sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)

                    if hasattr(record, 'accuracy'):
                        outfp.write('@{name}\n{seq}\n+\n{accuracy}\n'.format(
                            name=record.name,
                            seq=record.sequence,
                            accuracy=record.accuracy))
                    else:
                        outfp.write('>{name}\n{seq}\n'.format(
                            name=record.name, seq=record.sequence))
            except IOError as e:
                print >> sys.stderr, '!! ERROR !!', e
                print >> sys.stderr, '...error splitting input. exiting...'
    except IOError as e:
        print >> sys.stderr, '!! ERROR: !!', e
        print >> sys.stderr, \
            '...error consuming {i}. exiting...'.format(i=input_fastp)

    print >> sys.stderr, \
        'done consuming input sequence. added {t} tags and {l} ' \
        'labels...'.format(t=ht.n_tags(), l=ht.n_labels())

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >> sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >> sys.stderr, '!! ERROR: !!', error
            print >> sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >> sys.stderr, \
                        '\tswept {n} reads [{nc} labeled, {no} orphaned] ' \
                        '** {sec}s ({sect}s total)'.format(
                            n=_, nc=n_labeled, no=n_orphaned,
                            sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    pass
                else:
                    if hasattr(record, 'accuracy'):
                        seq_str = fmt_fastq(name, seq, record.accuracy, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print >> sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >> sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or \
       output_buffer.num_file_errors > 0:
        print >> sys.stderr, '! WARNING: Sweep finished with errors !'
        print >> sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >> sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >> sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >> sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >> sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)

    print >> sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >> sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
def main(): # pylint: disable=too-many-locals,too-many-branches info('abundance-dist-single.py', ['counting']) args = get_parser().parse_args() report_on_config(args) check_file_status(args.input_sequence_filename) check_space([args.input_sequence_filename]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) if (not args.squash_output and os.path.exists(args.output_histogram_filename)): print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \ args.output_histogram_filename sys.exit(1) else: hist_fp = open(args.output_histogram_filename, 'w') print 'making k-mer counting table' counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) counting_hash.set_use_bigcount(args.bigcount) print 'building k-mer tracking table' tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize, args.n_tables) print 'kmer_size:', counting_hash.ksize() print 'k-mer counting table sizes:', counting_hash.hashsizes() print 'outputting to', args.output_histogram_filename khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024) # start loading rparser = khmer.ReadParser(args.input_sequence_filename, args.threads) threads = [] print 'consuming input, round 1 --', args.input_sequence_filename for _ in xrange(args.threads): thread = \ threading.Thread( target=counting_hash.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( counting_hash.n_occupied()) abundance_lists = [] def __do_abundance_dist__(read_parser): abundances = counting_hash.abundance_distribution_with_reads_parser( read_parser, tracking) abundance_lists.append(abundances) print 'preparing hist from %s...' % args.input_sequence_filename rparser = khmer.ReadParser(args.input_sequence_filename, args.threads) threads = [] print 'consuming input, round 2 --', args.input_sequence_filename for _ in xrange(args.threads): thread = \ threading.Thread( target=__do_abundance_dist__, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() assert len(abundance_lists) == args.threads, len(abundance_lists) abundance = {} for abundance_list in abundance_lists: for i, count in enumerate(abundance_list): abundance[i] = abundance.get(i, 0) + count total = sum(abundance.values()) if 0 == total: print >> sys.stderr, \ "ERROR: abundance distribution is uniformly zero; " \ "nothing to report." print >> sys.stderr, "\tPlease verify that the input files are valid." sys.exit(1) sofar = 0 for _, i in sorted(abundance.items()): if i == 0 and not args.output_zero: continue sofar += i frac = sofar / float(total) print >> hist_fp, _, i, sofar, round(frac, 3) if sofar == total: break if args.savetable: print 'Saving k-mer counting table ', args.savetable print '...saving to', args.savetable counting_hash.save(args.savetable)
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_file_status(_) check_space(args.filenames) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") sys.exit(-1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print 'Subsampling %d reads using reservoir sampling.' % args.num_reads print 'Subsampled reads will be placed in %s' % output_filename print '' else: # > 1 print 'Subsampling %d reads, %d times, using reservoir sampling.' % \ (args.num_reads, num_samples) print 'Subsampled reads will be placed in %s.N' % output_filename print '' reads = [] for n in range(num_samples): reads.append([]) total = 0 # read through all the sequences and load/resample the reservoir for filename in args.filenames: print 'opening', filename, 'for reading' for record in screed.open(filename): total += 1 if total % 10000 == 0: print '...', total, 'reads scanned' if total >= args.max_reads: print 'reached upper limit of %d reads (see -M); exiting' \ % args.max_reads break # collect first N reads if total <= args.num_reads: for n in range(num_samples): reads[n].append(record) else: # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, total) if guess <= args.num_reads: reads[n][guess - 1] = record # output all the subsampled reads: if len(reads) == 1: print 'Writing %d sequences to %s' % (len(reads[0]), output_filename) if not output_file: output_file = open(output_filename, 'w') for record in reads[0]: output_file.write(output_single(record)) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print 'Writing %d sequences to %s' % (len(reads[n]), n_filename) output_file = open(n_filename, 'w') for record in reads[n]: output_file.write(output_single(record))
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('extract-partitions.py', ['graph'])
    args = get_parser().parse_args()

    distfilename = args.prefix + '.dist'

    n_unassigned = 0

    for infile in args.part_filenames:
        check_file_status(infile)

    check_space(args.part_filenames)

    print '---'
    print 'reading partitioned files:', repr(args.part_filenames)
    if args.output_groups:
        print 'outputting to files named "%s.groupN.fa"' % args.prefix
        print 'min reads to keep a partition:', args.min_part_size
        print 'max size of a group file:', args.max_size
    else:
        print 'NOT outputting groups! Beware!'

    if args.output_unassigned:
        print 'outputting unassigned reads to "%s.unassigned.fa"' % args.prefix

    print 'partition size distribution will go to %s' % distfilename
    print '---'

    #

    suffix = 'fa'
    is_fastq = False

    for index, read, pid in read_partition_file(args.part_filenames[0]):
        if hasattr(read, 'accuracy'):
            suffix = 'fq'
            is_fastq = True
        break

    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if is_fastq:
                assert hasattr(read, 'accuracy'), \
                    "all input files must be FASTQ if the first one is"
            else:
                assert not hasattr(read, 'accuracy'), \
                    "all input files must be FASTA if the first one is"
            break

    if args.output_unassigned:
        unassigned_fp = open('%s.unassigned.%s' % (args.prefix, suffix), 'w')

    count = {}
    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if index % 100000 == 0:
                print '...', index

            count[pid] = count.get(pid, 0) + 1

            if pid == 0:
                n_unassigned += 1
                if args.output_unassigned:
                    print >>unassigned_fp, output_single(read)

    if args.output_unassigned:
        unassigned_fp.close()

    if 0 in count:                          # eliminate unpartitioned sequences
        del count[0]

    # develop histogram of partition sizes
    dist = {}
    for pid, size in count.items():
        dist[size] = dist.get(size, 0) + 1

    # output histogram
    distfp = open(distfilename, 'w')

    total = 0
    wtotal = 0
    for counter, index in sorted(dist.items()):
        total += index
        wtotal += counter * index
        distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal))
    distfp.close()

    if not args.output_groups:
        sys.exit(0)

    # sort groups by size
    divvy = sorted(count.items(), key=lambda y: y[1])
    divvy = [y for y in divvy if y[1] > args.min_part_size]

    # divvy up into different groups, based on having max_size sequences
    # in each group.
    total = 0
    group = set()
    group_n = 0
    group_d = {}
    for partition_id, n_reads in divvy:
        group.add(partition_id)
        total += n_reads

        if total > args.max_size:
            for partition_id in group:
                group_d[partition_id] = group_n
                # print 'group_d', partition_id, group_n

            group_n += 1
            group = set()
            total = 0

    if group:
        for partition_id in group:
            group_d[partition_id] = group_n
            # print 'group_d', partition_id, group_n
        group_n += 1

    print '%d groups' % group_n
    if group_n == 0:
        print 'nothing to output; exiting!'
        return

    # open a bunch of output files for the different groups
    group_fps = {}
    for _ in range(group_n):
        group_fp = open('%s.group%04d.%s' % (args.prefix, _, suffix), 'w')
        group_fps[_] = group_fp

    # write 'em all out!

    total_seqs = 0
    part_seqs = 0
    toosmall_parts = 0
    for filename in args.part_filenames:
        for index, read, partition_id in read_partition_file(filename):
            total_seqs += 1
            if index % 100000 == 0:
                print '...x2', index

            if partition_id == 0:
                continue

            try:
                group_n = group_d[partition_id]
            except KeyError:
                assert count[partition_id] <= args.min_part_size
                toosmall_parts += 1
                continue

            outfp = group_fps[group_n]

            outfp.write(output_single(read))
            part_seqs += 1

    print '---'
    print 'Of %d total seqs,' % total_seqs
    print 'extracted %d partitioned seqs into group files,' % part_seqs
    print 'discarded %d sequences from small partitions (see -m),' % \
        toosmall_parts
    print 'and found %d unpartitioned sequences (see -U).' % n_unassigned
    print ''
    print 'Created %d group files named %s.groupXXXX.%s' % (len(group_fps),
                                                            args.prefix,
                                                            suffix)
def main(): info('filter-abund-single.py', ['counting']) args = get_parser().parse_args() check_file_status(args.datafile) check_space([args.datafile]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) report_on_config(args) config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) print >> sys.stderr, 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile, args.threads) threads = [] print >> sys.stderr, 'consuming input, round 1 --', args.datafile for _ in xrange(args.threads): cur_thread = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable) print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop print >> sys.stderr, 'filtering', args.datafile outfile = os.path.basename(args.datafile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) print >> sys.stderr, 'output in', outfile if args.savetable: print >>sys.stderr, 'Saving k-mer counting table filename', \ args.savetable print >> sys.stderr, '...saving to', args.savetable htable.save(args.savetable) print >> sys.stderr, 'wrote to: ', outfile
def main(): info("find-knots.py", ["graph"]) args = get_parser().parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase + ".pt", graphbase + ".tagset"] if os.path.exists(graphbase + ".stoptags"): infiles.append(graphbase + ".stoptags") for _ in infiles: check_file_status(_) check_space(infiles) print >>sys.stderr, "loading k-mer presence table %s.pt" % graphbase htable = khmer.load_hashbits(graphbase + ".pt") print >>sys.stderr, "loading tagset %s.tagset..." % graphbase htable.load_tagset(graphbase + ".tagset") initial_stoptags = False # @CTB regularize with make-initial if os.path.exists(graphbase + ".stoptags"): print >>sys.stderr, "loading stoptags %s.stoptags" % graphbase htable.load_stop_tags(graphbase + ".stoptags") initial_stoptags = True pmap_files = glob.glob(args.graphbase + ".subset.*.pmap") print >>sys.stderr, "loading %d pmap files (first one: %s)" % (len(pmap_files), pmap_files[0]) print >>sys.stderr, "---" print >>sys.stderr, "output stoptags will be in", graphbase + ".stoptags" if initial_stoptags: print >>sys.stderr, "(these output stoptags will include the already-loaded set)" print >>sys.stderr, "---" # create counting hash ksize = htable.ksize() counting = khmer.new_counting_hash(ksize, args.min_tablesize, args.n_tables) # load & merge for index, subset_file in enumerate(pmap_files): print >>sys.stderr, "<-", subset_file subset = htable.load_subset_partitionmap(subset_file) print >>sys.stderr, "** repartitioning subset... %s" % subset_file htable.repartition_largest_partition( subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD ) print >>sys.stderr, "** merging subset... %s" % subset_file htable.merge_subset(subset) print >>sys.stderr, "** repartitioning, round 2... %s" % subset_file size = htable.repartition_largest_partition( None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD ) print >>sys.stderr, "** repartitioned size:", size print >>sys.stderr, "saving stoptags binary" htable.save_stop_tags(graphbase + ".stoptags") os.rename(subset_file, subset_file + ".processed") print >>sys.stderr, "(%d of %d)\n" % (index, len(pmap_files)) print >>sys.stderr, "done!"
def main(): info('find-knots.py', ['graph']) args = get_parser().parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase + '.pt', graphbase + '.tagset'] if os.path.exists(graphbase + '.stoptags'): infiles.append(graphbase + '.stoptags') for _ in infiles: check_file_status(_) check_space(infiles) print >> sys.stderr, 'loading k-mer presence table %s.pt' % graphbase htable = khmer.load_hashbits(graphbase + '.pt') print >> sys.stderr, 'loading tagset %s.tagset...' % graphbase htable.load_tagset(graphbase + '.tagset') initial_stoptags = False # @CTB regularize with make-initial if os.path.exists(graphbase + '.stoptags'): print >> sys.stderr, 'loading stoptags %s.stoptags' % graphbase htable.load_stop_tags(graphbase + '.stoptags') initial_stoptags = True pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \ (len(pmap_files), pmap_files[0]) print >> sys.stderr, '---' print >> sys.stderr, 'output stoptags will be in', graphbase + '.stoptags' if initial_stoptags: print >>sys.stderr, \ '(these output stoptags will include the already-loaded set)' print >> sys.stderr, '---' # create counting hash ksize = htable.ksize() counting = khmer.new_counting_hash(ksize, args.min_tablesize, args.n_tables) # load & merge for index, subset_file in enumerate(pmap_files): print >> sys.stderr, '<-', subset_file subset = htable.load_subset_partitionmap(subset_file) print >> sys.stderr, '** repartitioning subset... %s' % subset_file htable.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print >> sys.stderr, '** merging subset... %s' % subset_file htable.merge_subset(subset) print >> sys.stderr, '** repartitioning, round 2... %s' % subset_file size = htable.repartition_largest_partition( None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print >> sys.stderr, '** repartitioned size:', size print >> sys.stderr, 'saving stoptags binary' htable.save_stop_tags(graphbase + '.stoptags') os.rename(subset_file, subset_file + '.processed') print >> sys.stderr, '(%d of %d)\n' % (index, len(pmap_files)) print >> sys.stderr, 'done!'
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable( args.n_tables * args.min_tablesize, args.force) # list to save error files along with throwing exceptions if args.force: corrupt_files = [] if args.loadtable: print 'loading k-mer counting table from', args.loadtable htable = khmer.load_counting_hash(args.loadtable) else: print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) total = 0 discarded = 0 input_filename = None for index, input_filename in enumerate(args.input_filenames): if args.single_output_filename != '': output_name = args.single_output_filename outfp = open(args.single_output_filename, 'a') else: output_name = os.path.basename(input_filename) + '.keep' outfp = open(output_name, 'w') total_acc = 0 discarded_acc = 0 try: total_acc, discarded_acc = normalize_by_median(input_filename, outfp, htable, args, report_fp) except IOError as err: handle_error(err, output_name, input_filename, args.fail_save, htable) if not args.force: print >> sys.stderr, '** Exiting!' sys.exit(1) else: print >> sys.stderr, '*** Skipping error file, moving on...' corrupt_files.append(input_filename) else: if total_acc == 0 and discarded_acc == 0: print 'SKIPPED empty file', input_filename else: total += total_acc discarded += discarded_acc print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ .format(inp=input_filename, kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.)) print 'output in', output_name if (args.dump_frequency > 0 and index > 0 and index % args.dump_frequency == 0): print 'Backup: Saving k-mer counting file through', input_filename if args.savetable: hashname = args.savetable print '...saving to', hashname else: hashname = 'backup.ct' print 'Nothing given for savetable, saving to', hashname htable.save(hashname) if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) if args.savetable: print 'Saving k-mer counting table through', input_filename print '...saving to', args.savetable htable.save(args.savetable) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) if args.force and len(corrupt_files) > 0: print >> sys.stderr, "** WARNING: Finished with errors!" print >> sys.stderr, "** IOErrors occurred in the following files:" print >> sys.stderr, "\t", " ".join(corrupt_files) if fp_rate > MAX_FALSE_POSITIVE_RATE: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " for this data set. Increase tablesize/# " "tables.") print >> sys.stderr, "**" print >> sys.stderr, "** Do not use these results!!" if not args.force: sys.exit(1)
def main():
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_file_status(args.infile, args.force)
    infiles = [args.infile]
    check_space(infiles, args.force)

    outfile = os.path.basename(args.infile)
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    single_fp = open(outfile + '.se', 'w')
    paired_fp = open(outfile + '.pe', 'w')

    print >>sys.stderr, 'reading file "%s"' % args.infile
    print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
    print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile

    last_record = None
    last_name = None

    n_pe = 0
    n_se = 0

    record = None
    index = 0
    for index, record in enumerate(screed.open(sys.argv[1])):
        if index % 100000 == 0 and index > 0:
            print '...', index

        name = record['name'].split()[0]

        if last_record:
            if is_pair(last_name, name):
                paired_fp.write(output_pair(last_record, record))
                name, record = None, None
                n_pe += 1
            else:
                single_fp.write(output_single(last_record))
                n_se += 1

        last_name = name
        last_record = record

    if last_record:
        if is_pair(last_name, name):
            paired_fp.write(output_pair(last_record, record))
            name, record = None, None
            n_pe += 1
        else:
            single_fp.write(output_single(last_record))
            name, record = None, None
            n_se += 1

    if record:
        single_fp.write(output_single(record))
        n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print >>sys.stderr, 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % \
        (index + 1, n_pe, n_se)

    print >> sys.stderr, 'wrote to: ' + outfile \
        + '.se' + ' and ' + outfile + '.pe'
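# Note: is_pair(), output_pair() and output_single() used above come from
# khmer's script utilities and are not defined in this listing. As an
# illustration only -- a hypothetical stand-in, not khmer's actual
# implementation -- a minimal check for the '/1' + '/2' naming convention
# the loop above relies on could look like this:
def _is_pair_sketch(name1, name2):
    """Return True if name1/name2 look like an Illumina-style /1, /2 pair."""
    return (name1.endswith('/1') and name2.endswith('/2') and
            name1[:-2] == name2[:-2])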
def main(): info('filter-abund-single.py', ['counting']) args = get_parser().parse_args() check_file_status(args.datafile) check_space([args.datafile]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) report_on_config(args) config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) print >>sys.stderr, 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile, args.threads) threads = [] print >>sys.stderr, 'consuming input, round 1 --', args.datafile for _ in xrange(args.threads): cur_thread = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable) print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop print >>sys.stderr, 'filtering', args.datafile outfile = os.path.basename(args.datafile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) print >>sys.stderr, 'output in', outfile if args.savetable: print >>sys.stderr, 'Saving k-mer counting table filename', \ args.savetable print >>sys.stderr, '...saving to', args.savetable htable.save(args.savetable) print >>sys.stderr, 'wrote to: ', outfile
def main():
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_file_status(_)

    check_space(args.filenames)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    #
    # Figure out what the output filename is going to be
    #

    output_file = args.output_file
    if output_file:
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'
        output_file = open(output_filename, 'w')

    print 'Subsampling %d reads using reservoir sampling.' % args.num_reads
    print 'Subsampled reads will be placed in %s' % output_filename
    print ''

    reads = []
    total = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print 'opening', filename, 'for reading'
        for record in screed.open(filename):
            total += 1

            if total % 10000 == 0:
                print '...', total, 'reads scanned'
                if total >= args.max_reads:
                    print 'reached upper limit of %d reads (see -M); exiting' \
                        % args.max_reads
                    break

            # collect first N reads
            if total <= args.num_reads:
                reads.append(record)
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                guess = random.randint(1, total)
                if guess <= args.num_reads:
                    reads[guess - 1] = record

    # output all the subsampled reads:
    for record in reads:
        output_file.write(output_single(record))

    print ''
    print 'wrote %d reads to %s' % (len(reads), output_filename)
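# Illustration only (not part of the scripts above): the reservoir-sampling
# update used in sample-reads-randomly.py keeps each of the records seen so
# far with equal probability num_reads / total. A minimal, self-contained
# sketch of the same update rule, with hypothetical names:
import random


def reservoir_sample(records, num_reads, seed=None):
    """Return up to `num_reads` items drawn uniformly from an iterable."""
    if seed is not None:
        random.seed(seed)
    reads = []
    total = 0
    for record in records:
        total += 1
        if total <= num_reads:          # fill the reservoir first
            reads.append(record)
        else:                           # then replace entries at random
            guess = random.randint(1, total)
            if guess <= num_reads:
                reads[guess - 1] = record
    return reads

# e.g. reservoir_sample(range(1000), 10) -> 10 uniformly chosen integers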
def main(): # pylint: disable=too-many-locals,too-many-branches info('abundance-dist-single.py', ['counting']) args = get_parser().parse_args() report_on_config(args) check_file_status(args.input_sequence_filename) check_space([args.input_sequence_filename]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) if (not args.squash_output and os.path.exists(args.output_histogram_filename)): print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \ args.output_histogram_filename sys.exit(1) else: hist_fp = open(args.output_histogram_filename, 'w') print >>sys.stderr, 'making k-mer counting table' counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) counting_hash.set_use_bigcount(args.bigcount) print >> sys.stderr, 'building k-mer tracking table' tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize, args.n_tables) print >>sys.stderr, 'kmer_size:', counting_hash.ksize() print >>sys.stderr, 'k-mer counting table sizes:', \ counting_hash.hashsizes() print >>sys.stderr, 'outputting to', args.output_histogram_filename khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024) # start loading rparser = khmer.ReadParser(args.input_sequence_filename, args.threads) threads = [] print >>sys.stderr, 'consuming input, round 1 --', \ args.input_sequence_filename for _ in xrange(args.threads): thread = \ threading.Thread( target=counting_hash.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( counting_hash.n_unique_kmers()) abundance_lists = [] def __do_abundance_dist__(read_parser): abundances = counting_hash.abundance_distribution_with_reads_parser( read_parser, tracking) abundance_lists.append(abundances) print >>sys.stderr, 'preparing hist from %s...' % \ args.input_sequence_filename rparser = khmer.ReadParser(args.input_sequence_filename, args.threads) threads = [] print >>sys.stderr, 'consuming input, round 2 --', \ args.input_sequence_filename for _ in xrange(args.threads): thread = \ threading.Thread( target=__do_abundance_dist__, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() assert len(abundance_lists) == args.threads, len(abundance_lists) abundance = {} for abundance_list in abundance_lists: for i, count in enumerate(abundance_list): abundance[i] = abundance.get(i, 0) + count total = sum(abundance.values()) if 0 == total: print >> sys.stderr, \ "ERROR: abundance distribution is uniformly zero; " \ "nothing to report." print >> sys.stderr, "\tPlease verify that the input files are valid." sys.exit(1) sofar = 0 for _, i in sorted(abundance.items()): if i == 0 and not args.output_zero: continue sofar += i frac = sofar / float(total) print >> hist_fp, _, i, sofar, round(frac, 3) if sofar == total: break if args.savetable: print >>sys.stderr, 'Saving k-mer counting table ', args.savetable print >>sys.stderr, '...saving to', args.savetable counting_hash.save(args.savetable) print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename
def main(): info('partition-graph.py', ['graph']) args = get_parser().parse_args() basename = args.basename filenames = [basename + '.pt', basename + '.tagset'] for _ in filenames: check_file_status(_) check_space(filenames) print >> sys.stderr, '--' print >> sys.stderr, 'SUBSET SIZE', args.subset_size print >> sys.stderr, 'N THREADS', args.threads if args.stoptags: print >> sys.stderr, 'stoptag file:', args.stoptags print >> sys.stderr, '--' print >> sys.stderr, 'loading ht %s.pt' % basename htable = khmer.load_hashbits(basename + '.pt') htable.load_tagset(basename + '.tagset') # do we want to load stop tags, and do they exist? if args.stoptags: print >> sys.stderr, 'loading stoptags from', args.stoptags htable.load_stop_tags(args.stoptags) # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print >>sys.stderr, '** This script brakes for lumps:', \ ' stop_big_traversals is true.' else: print >>sys.stderr, '** Traverse all the things:', \ ' stop_big_traversals is false.' # # now, partition! # # divide the tags up into subsets divvy = htable.divide_tags_into_subsets(int(args.subset_size)) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = Queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): start = divvy[_] end = divvy[_ + 1] worker_q.put((htable, _, start, end)) print >> sys.stderr, 'enqueued %d subset tasks' % n_subsets open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets)) n_threads = args.threads if n_subsets < n_threads: n_threads = n_subsets # start threads! print >> sys.stderr, 'starting %d threads' % n_threads print >> sys.stderr, '---' threads = [] for _ in range(n_threads): cur_thrd = threading.Thread(target=worker, args=(worker_q, basename, stop_big_traversals)) threads.append(cur_thrd) cur_thrd.start() print >> sys.stderr, 'done starting threads' # wait for threads for _ in threads: _.join() print >> sys.stderr, '---' print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \ (basename,)
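# The partitioning fan-out above is a plain queue-of-tasks pattern; a
# self-contained sketch follows, with a hypothetical process_subset() standing
# in for the khmer subset-partitioning worker (which partitions a tag range
# and saves a .pmap file).

import threading
try:
    import Queue as queue  # Python 2, as used by the scripts above
except ImportError:
    import queue           # Python 3

def process_subset(subset_id, start, end):
    # placeholder for the real per-subset work
    print('subset %d: tags %d..%d' % (subset_id, start, end))

def worker(work_queue):
    while True:
        try:
            subset_id, start, end = work_queue.get(block=False)
        except queue.Empty:
            return
        process_subset(subset_id, start, end)

def run(divvy, n_threads):
    # adjacent entries of divvy delimit each subset's tag range
    work_queue = queue.Queue()
    n_subsets = len(divvy) - 1
    for i in range(n_subsets):
        work_queue.put((i, divvy[i], divvy[i + 1]))
    threads = [threading.Thread(target=worker, args=(work_queue,))
               for _ in range(min(n_threads, n_subsets))]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

if __name__ == '__main__':
    run([0, 100, 200, 300], 2)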
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_file_status(_, args.force) check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.\n") if not args.force: sys.exit(1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' % \ args.num_reads print >>sys.stderr, 'Subsampled reads will be placed in %s' % \ output_filename print >>sys.stderr, '' else: # > 1 print >>sys.stderr, 'Subsampling %d reads, %d times,' \ % (args.num_reads, num_samples), ' using reservoir sampling.' print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \ % output_filename print >>sys.stderr, '' reads = [] for n in range(num_samples): reads.append([]) total = 0 # read through all the sequences and load/resample the reservoir for filename in args.filenames: print >>sys.stderr, 'opening', filename, 'for reading' for record in screed.open(filename): total += 1 if total % 10000 == 0: print >>sys.stderr, '...', total, 'reads scanned' if total >= args.max_reads: print >>sys.stderr, 'reached upper limit of %d reads (see -M); exiting' % args.max_reads break # collect first N reads if total <= args.num_reads: for n in range(num_samples): reads[n].append(record) else: # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, total) if guess <= args.num_reads: reads[n][guess - 1] = record # output all the subsampled reads: if len(reads) == 1: print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[0]), output_filename) if not output_file: output_file = open(output_filename, 'w') for record in reads[0]: output_file.write(output_single(record)) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[n]), n_filename) output_file = open(n_filename, 'w') for record in reads[n]: output_file.write(output_single(record))
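# The multi-sample variant above keeps one reservoir per requested sample and
# draws an independent replacement index for each, so every output file is an
# unbiased uniform subsample of the input. A hypothetical standalone sketch of
# just that loop:

import random

def multi_reservoir_sample(stream, num_reads, num_samples, seed=None):
    if seed is not None:
        random.seed(seed)
    reservoirs = [[] for _ in range(num_samples)]
    total = 0
    for item in stream:
        total += 1
        if total <= num_reads:
            for reservoir in reservoirs:
                reservoir.append(item)
        else:
            # an independent draw per reservoir keeps the samples independent
            for reservoir in reservoirs:
                guess = random.randint(1, total)
                if guess <= num_reads:
                    reservoir[guess - 1] = item
    return reservoirs

if __name__ == '__main__':
    for sample in multi_reservoir_sample(range(100000), 3, 2, seed=1):
        print(sample)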
def main(): info('load-graph.py', ['graph']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') base = args.output_filename filenames = args.input_filenames n_threads = int(args.n_threads) for _ in args.input_filenames: check_file_status(_) check_space(args.input_filenames) check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.) print 'Saving k-mer presence table to %s' % base print 'Loading kmers from sequences in %s' % repr(filenames) if args.no_build_tagset: print 'We WILL NOT build the tagset.' else: print 'We WILL build the tagset (for partitioning/traversal).' print 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) if args.no_build_tagset: target_method = htable.consume_fasta_with_reads_parser else: target_method = htable.consume_fasta_and_tag_with_reads_parser config = khmer.get_config() config.set_reads_input_buffer_size(n_threads * 64 * 1024) for _, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, n_threads) threads = [] print 'consuming input', filename for _ in xrange(n_threads): cur_thrd = threading.Thread(target=target_method, args=(rparser, )) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() print 'saving k-mer presence table in', base + '.pt' htable.save(base + '.pt') if not args.no_build_tagset: print 'saving tagset in', base + '.tagset' htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the graph structure is too small for " "this data set. Increase table size/# tables.") print >> sys.stderr, "**" sys.exit(1)
def main(): info('sweep-reads-buffered.py', ['sweep']) parser = get_parser() args = parser.parse_args() if args.min_tablesize < MIN_HSIZE: args.min_tablesize = MIN_HSIZE if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE report_on_config(args, hashtype='hashbits') K = args.ksize HT_SIZE = args.min_tablesize N_HT = args.n_tables traversal_range = args.traversal_range input_fastp = args.input_fastp if not args.outdir: outdir = os.path.dirname(input_fastp) else: outdir = args.outdir max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size max_reads = args.max_reads check_file_status(args.input_fastp) check_valid_file_exists(args.input_files) all_input_files = [input_fastp] all_input_files.extend(args.input_files) # Check disk space availability check_space(all_input_files) # figure out input file type (FA/FQ) -- based on first file ix = iter(screed.open(args.input_files[0])) record = ix.next() del ix extension = 'fa' if hasattr(record, 'accuracy'): # fastq! extension = 'fq' output_buffer = ReadBufferManager( max_buffers, max_reads, buf_size, output_pref, outdir, extension) # consume the partitioned fasta with which to label the graph ht = khmer.LabelHash(K, HT_SIZE, N_HT) try: print >>sys.stderr, 'consuming input sequences...' if args.label_by_pid: print >>sys.stderr, '...labeling by partition id (pid)' ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) elif args.label_by_seq: print >>sys.stderr, '...labeling by sequence' for n, record in enumerate(screed.open(input_fastp)): if n % 50000 == 0: print >>sys.stderr, \ '...consumed {n} sequences...'.format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, n) else: print >>sys.stderr, \ '...labeling to create groups of size {s}'.format( s=args.group_size) label = -1 g = 0 try: outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref, g=g, ext=extension ), 'wb') for n, record in enumerate(screed.open(input_fastp)): if n % args.group_size == 0: label += 1 if label > g: g = label outfp = open('{pref}_base_{g}.{ext}'.format( pref=output_pref, g=g, ext=extension), 'wb') if n % 50000 == 0: print >>sys.stderr, \ '...consumed {n} sequences...'.format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, label) if hasattr(record, 'accuracy'): outfp.write('@{name}\n{seq}+{accuracy}\n'.format( name=record.name, seq=record.sequence, accuracy=record.accuracy)) else: outfp.write('>{name}\n{seq}\n'.format( name=record.name, seq=record.sequence)) except IOError as e: print >>sys.stderr, '!! ERROR !!', e print >>sys.stderr, '...error splitting input. exiting...' except IOError as e: print >>sys.stderr, '!! ERROR: !!', e print >>sys.stderr, '...error consuming \ {i}. exiting...'.format(i=input_fastp) print >>sys.stderr, 'done consuming input sequence. \ added {t} tags and {l} \ labels...'.format(t=ht.n_tags(), l=ht.n_labels()) label_dict = defaultdict(int) label_number_dist = [] n_orphaned = 0 n_labeled = 0 n_mlabeled = 0 total_t = time.clock() start_t = time.clock() for read_file in args.input_files: print >>sys.stderr, '** sweeping {read_file} for labels...'.format( read_file=read_file) file_t = 0.0 try: read_fp = screed.open(read_file) except IOError as error: print >>sys.stderr, '!! 
ERROR: !!', error print >>sys.stderr, '*** Could not open {fn}, skipping...'.format( fn=read_file) else: for _, record in enumerate(read_fp): if _ % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \ {no} orphaned] \ ** {sec}s ({sect}s total)' \ .format(n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t) start_t = time.clock() seq = record.sequence name = record.name try: labels = ht.sweep_label_neighborhood(seq, traversal_range) except ValueError as e: pass else: if hasattr(record, 'accuracy'): seq_str = fmt_fastq(name, seq, record.accuracy, labels) else: seq_str = fmt_fasta(name, seq, labels) label_number_dist.append(len(labels)) if labels: n_labeled += 1 if len(labels) > 1: output_buffer.queue(seq_str, 'multi') n_mlabeled += 1 label_dict['multi'] += 1 else: output_buffer.queue(seq_str, labels[0]) label_dict[labels[0]] += 1 else: n_orphaned += 1 output_buffer.queue(seq_str, 'orphaned') label_dict['orphaned'] += 1 print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! print >>sys.stderr, '** End of run...' output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print >>sys.stderr, '! WARNING: Sweep finished with errors !' print >>sys.stderr, '** {writee} reads not written'.format( writee=output_buffer.num_write_errors) print >>sys.stderr, '** {filee} errors opening files'.format( filee=output_buffer.num_file_errors) print >>sys.stderr, 'swept {n_reads} for labels...'.format( n_reads=n_labeled + n_orphaned) print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format( nc=n_labeled, no=n_orphaned) print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled) print >>sys.stderr, '** outputting label number distribution...' fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref)) with open(fn, 'wb') as outfp: for nc in label_number_dist: outfp.write('{nc}\n'.format(nc=nc)) fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref)) print >>sys.stderr, '** outputting label read counts...' with open(fn, 'wb') as outfp: for k in label_dict: outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
def main(): info('load-graph.py', ['graph']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') base = args.output_filename filenames = args.input_filenames n_threads = int(args.n_threads) for _ in args.input_filenames: check_file_status(_) check_space(args.input_filenames) check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.) print 'Saving k-mer presence table to %s' % base print 'Loading kmers from sequences in %s' % repr(filenames) if args.no_build_tagset: print 'We WILL NOT build the tagset.' else: print 'We WILL build the tagset (for partitioning/traversal).' print 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) if args.no_build_tagset: target_method = htable.consume_fasta_with_reads_parser else: target_method = htable.consume_fasta_and_tag_with_reads_parser config = khmer.get_config() config.set_reads_input_buffer_size(n_threads * 64 * 1024) for _, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, n_threads) threads = [] print 'consuming input', filename for _ in xrange(n_threads): cur_thrd = threading.Thread(target=target_method, args=(rparser, )) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( htable.n_occupied()) print 'saving k-mer presence table in', base + '.pt' htable.save(base + '.pt') if not args.no_build_tagset: print 'saving tagset in', base + '.tagset' htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate if args.write_fp_rate: print >> info_fp, \ '\nfalse positive rate estimated to be %1.3f' % fp_rate if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the graph structure is too small for " "this data set. Increase table size/# tables.") print >> sys.stderr, "**" sys.exit(1)
def main(): info("load-graph.py", ["graph"]) args = get_parser().parse_args() report_on_config(args, hashtype="hashbits") base = args.output_filename filenames = args.input_filenames for _ in args.input_filenames: check_file_status(_) check_space(args.input_filenames) check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.0) print >>sys.stderr, "Saving k-mer presence table to %s" % base print >>sys.stderr, "Loading kmers from sequences in %s" % repr(filenames) if args.no_build_tagset: print >>sys.stderr, "We WILL NOT build the tagset." else: print >>sys.stderr, "We WILL build the tagset", " (for partitioning/traversal)." config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) print >>sys.stderr, "making k-mer presence table" htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) if args.no_build_tagset: target_method = htable.consume_fasta_with_reads_parser else: target_method = htable.consume_fasta_and_tag_with_reads_parser for _, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, 1) print >>sys.stderr, "consuming input", filename target_method(rparser) if args.report_total_kmers: print >>sys.stderr, "Total number of unique k-mers: {0}".format(htable.n_unique_kmers()) print >>sys.stderr, "saving k-mer presence table in", base + ".pt" htable.save(base + ".pt") if not args.no_build_tagset: print >>sys.stderr, "saving tagset in", base + ".tagset" htable.save_tagset(base + ".tagset") info_fp = open(base + ".info", "w") info_fp.write("%d unique k-mers" % htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable) print >>sys.stderr, "fp rate estimated to be %1.3f" % fp_rate if args.write_fp_rate: print >> info_fp, "\nfalse positive rate estimated to be %1.3f" % fp_rate if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. print >>sys.stderr, "**" print >>sys.stderr, ( "** ERROR: the graph structure is too small for " "this data set. Increase table size/# tables." ) print >>sys.stderr, "**" sys.exit(1) print >>sys.stderr, "wrote to", base + ".info and", base + ".pt" if not args.no_build_tagset: print >>sys.stderr, "and " + base + ".tagset"
def main(): info('load-into-counting.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name) check_space(args.input_sequence_filename) check_space_for_hashtable(args.n_tables * args.min_tablesize) print >>sys.stderr, 'Saving k-mer counting table to %s' % base print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames) # clobber the '.info' file now, as we always open in append mode below if os.path.exists(base + '.info'): os.remove(base + '.info') print >>sys.stderr, 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) htable.set_use_bigcount(args.bigcount) config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) filename = None for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename, args.threads) threads = [] print >>sys.stderr, 'consuming input', filename for _ in xrange(args.threads): cur_thrd = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thrd) cur_thrd.start() for _ in threads: _.join() if index > 0 and index % 10 == 0: check_space_for_hashtable(args.n_tables * args.min_tablesize) print >>sys.stderr, 'mid-save', base htable.save(base) with open(base + '.info', 'a') as info_fh: print >> info_fh, 'through', filename n_kmers = htable.n_unique_kmers() if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers:', n_kmers with open(base + '.info', 'a') as info_fp: print >>info_fp, 'Total number of unique k-mers:', n_kmers print >>sys.stderr, 'saving', base htable.save(base) fp_rate = khmer.calc_expected_collisions(htable) with open(base + '.info', 'a') as info_fp: print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate if args.summary_info: mr_fmt = args.summary_info.lower() mr_file = base + '.info.' + mr_fmt print >> sys.stderr, "Writing summmary info to", mr_file with open(mr_file, 'w') as mr_fh: if mr_fmt == 'json': mr_data = { "ht_name": os.path.basename(base), "fpr": fp_rate, "num_kmers": n_kmers, "files": filenames, "mrinfo_version": "0.1.0", } json.dump(mr_data, mr_fh) mr_fh.write('\n') elif mr_fmt == 'tsv': mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n") mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format( b=os.path.basename(base), fpr=fp_rate, k=n_kmers, fls=";".join(filenames))) print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate # Change 0.2 only if you really grok it. HINT: You don't. if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, "** ERROR: the k-mer counting table is too small", print >> sys.stderr, "for this data set. Increase tablesize/# tables." print >> sys.stderr, "**" sys.exit(1) print >>sys.stderr, 'DONE.' print >>sys.stderr, 'wrote to:', base + '.info'
def main(): info('find-knots.py', ['graph']) args = get_parser().parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase + '.pt', graphbase + '.tagset'] if os.path.exists(graphbase + '.stoptags'): infiles.append(graphbase + '.stoptags') for _ in infiles: check_file_status(_) check_space(infiles) print 'loading k-mer presence table %s.pt' % graphbase htable = khmer.load_hashbits(graphbase + '.pt') print 'loading tagset %s.tagset...' % graphbase htable.load_tagset(graphbase + '.tagset') initial_stoptags = False # @CTB regularize with make-initial if os.path.exists(graphbase + '.stoptags'): print 'loading stoptags %s.stoptags' % graphbase htable.load_stop_tags(graphbase + '.stoptags') initial_stoptags = True pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print 'loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]) print '---' print 'output stoptags will be in', graphbase + '.stoptags' if initial_stoptags: print '(these output stoptags will include the already-loaded set)' print '---' # create counting hash ksize = htable.ksize() counting = khmer.new_counting_hash(ksize, args.min_tablesize, args.n_tables) # load & merge for index, subset_file in enumerate(pmap_files): print '<-', subset_file subset = htable.load_subset_partitionmap(subset_file) print '** repartitioning subset... %s' % subset_file htable.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print '** merging subset... %s' % subset_file htable.merge_subset(subset) print '** repartitioning, round 2... %s' % subset_file size = htable.repartition_largest_partition( None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print '** repartitioned size:', size print 'saving stoptags binary' htable.save_stop_tags(graphbase + '.stoptags') os.rename(subset_file, subset_file + '.processed') print '(%d of %d)\n' % (index, len(pmap_files)) print 'done!'
def main(): # pylint: disable=too-many-locals,too-many-branches info('extract-partitions.py', ['graph']) args = get_parser().parse_args() distfilename = args.prefix + '.dist' n_unassigned = 0 for infile in args.part_filenames: check_file_status(infile) check_space(args.part_filenames) print '---' print 'reading partitioned files:', repr(args.part_filenames) if args.output_groups: print 'outputting to files named "%s.groupN.fa"' % args.prefix print 'min reads to keep a partition:', args.min_part_size print 'max size of a group file:', args.max_size else: print 'NOT outputting groups! Beware!' if args.output_unassigned: print 'outputting unassigned reads to "%s.unassigned.fa"' % args.prefix print 'partition size distribution will go to %s' % distfilename print '---' # suffix = 'fa' is_fastq = False for index, read, pid in read_partition_file(args.part_filenames[0]): if hasattr(read, 'accuracy'): suffix = 'fq' is_fastq = True break for filename in args.part_filenames: for index, read, pid in read_partition_file(filename): if is_fastq: assert hasattr(read, 'accuracy'), \ "all input files must be FASTQ if the first one is" else: assert not hasattr(read, 'accuracy'), \ "all input files must be FASTA if the first one is" break if args.output_unassigned: unassigned_fp = open('%s.unassigned.%s' % (args.prefix, suffix), 'w') count = {} for filename in args.part_filenames: for index, read, pid in read_partition_file(filename): if index % 100000 == 0: print '...', index count[pid] = count.get(pid, 0) + 1 if pid == 0: n_unassigned += 1 if args.output_unassigned: print >> unassigned_fp, output_single(read) if args.output_unassigned: unassigned_fp.close() if 0 in count: # eliminate unpartitioned sequences del count[0] # develop histogram of partition sizes dist = {} for pid, size in count.items(): dist[size] = dist.get(size, 0) + 1 # output histogram distfp = open(distfilename, 'w') total = 0 wtotal = 0 for counter, index in sorted(dist.items()): total += index wtotal += counter * index distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal)) distfp.close() if not args.output_groups: sys.exit(0) # sort groups by size divvy = sorted(count.items(), key=lambda y: y[1]) divvy = [y for y in divvy if y[1] > args.min_part_size] # divvy up into different groups, based on having max_size sequences # in each group. total = 0 group = set() group_n = 0 group_d = {} for partition_id, n_reads in divvy: group.add(partition_id) total += n_reads if total > args.max_size: for partition_id in group: group_d[partition_id] = group_n # print 'group_d', partition_id, group_n group_n += 1 group = set() total = 0 if group: for partition_id in group: group_d[partition_id] = group_n # print 'group_d', partition_id, group_n group_n += 1 print '%d groups' % group_n if group_n == 0: print 'nothing to output; exiting!' return # open a bunch of output files for the different groups group_fps = {} for _ in range(group_n): group_fp = open('%s.group%04d.%s' % (args.prefix, _, suffix), 'w') group_fps[_] = group_fp # write 'em all out! 
total_seqs = 0 part_seqs = 0 toosmall_parts = 0 for filename in args.part_filenames: for index, read, partition_id in read_partition_file(filename): total_seqs += 1 if index % 100000 == 0: print '...x2', index if partition_id == 0: continue try: group_n = group_d[partition_id] except KeyError: assert count[partition_id] <= args.min_part_size toosmall_parts += 1 continue outfp = group_fps[group_n] outfp.write(output_single(read)) part_seqs += 1 print '---' print 'Of %d total seqs,' % total_seqs print 'extracted %d partitioned seqs into group files,' % part_seqs print 'discarded %d sequences from small partitions (see -m),' % \ toosmall_parts print 'and found %d unpartitioned sequences (see -U).' % n_unassigned print '' print 'Created %d group files named %s.groupXXXX.%s' % ( len(group_fps), args.prefix, suffix)
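# The group assignment computed above packs partitions into numbered output
# groups: partitions are sorted by read count, those at or below min_part_size
# are dropped, and the rest are accumulated into a group until it exceeds
# max_size reads. A standalone sketch of that assignment (hypothetical names):

def assign_groups(counts, min_part_size, max_size):
    """Map partition id -> group number, dropping small partitions."""
    group_d = {}
    group = set()
    group_n = 0
    total = 0
    # smallest partitions first, as in the script above
    for pid, n_reads in sorted(counts.items(), key=lambda item: item[1]):
        if n_reads <= min_part_size:
            continue
        group.add(pid)
        total += n_reads
        if total > max_size:
            for member in group:
                group_d[member] = group_n
            group_n += 1
            group = set()
            total = 0
    if group:
        for member in group:
            group_d[member] = group_n
        group_n += 1
    return group_d, group_n

if __name__ == '__main__':
    # partitions of 600 and 700 reads share group 0, the 800-read partition
    # gets group 1, and the 10-read partition is dropped
    # (min_part_size=100, max_size=1000)
    print(assign_groups({1: 10, 2: 800, 3: 600, 4: 700}, 100, 1000))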
def main(): info("sweep-reads-buffered.py", ["sweep"]) parser = get_parser() args = parser.parse_args() if args.min_tablesize < MIN_HSIZE: args.min_tablesize = MIN_HSIZE if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE report_on_config(args, hashtype="hashbits") K = args.ksize HT_SIZE = args.min_tablesize N_HT = args.n_tables traversal_range = args.traversal_range input_fastp = args.input_fastp if not args.outdir: outdir = os.path.dirname(input_fastp) else: outdir = args.outdir max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size max_reads = args.max_reads check_file_status(args.input_fastp) check_valid_file_exists(args.input_files) all_input_files = [input_fastp] all_input_files.extend(args.input_files) # Check disk space availability check_space(all_input_files) output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir) # consume the partitioned fasta with which to label the graph ht = khmer.LabelHash(K, HT_SIZE, N_HT) try: print >>sys.stderr, "consuming input sequences..." if args.label_by_pid: print >>sys.stderr, "...labeling by partition id (pid)" ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) elif args.label_by_seq: print >>sys.stderr, "...labeling by sequence" for n, record in enumerate(screed.open(input_fastp)): if n % 50000 == 0: print >>sys.stderr, "...consumed {n} sequences...".format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, n) else: print >>sys.stderr, "...labeling to create groups of size {s}".format(s=args.group_size) label = -1 g = 0 try: outfp = open("{pref}_base_{g}.fa".format(pref=output_pref, g=g), "wb") for n, record in enumerate(screed.open(input_fastp)): if n % args.group_size == 0: label += 1 if label > g: g = label outfp = open("{pref}_base_{g}.fa".format(pref=output_pref, g=g), "wb") if n % 50000 == 0: print >>sys.stderr, "...consumed {n} sequences...".format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, label) outfp.write(">{name}\n{seq}\n".format(name=record.name, seq=record.sequence)) except IOError as e: print >>sys.stderr, "!! ERROR !!", e print >>sys.stderr, "...error splitting input. exiting..." except IOError as e: print >>sys.stderr, "!! ERROR: !!", e print >>sys.stderr, "...error consuming \ {i}. exiting...".format( i=input_fastp ) print >>sys.stderr, "done consuming input sequence. \ added {t} tags and {l} \ labels...".format( t=ht.n_tags(), l=ht.n_labels() ) label_dict = defaultdict(int) label_number_dist = [] n_orphaned = 0 n_labeled = 0 n_mlabeled = 0 total_t = time.clock() start_t = time.clock() for read_file in args.input_files: print >>sys.stderr, "** sweeping {read_file} for labels...".format(read_file=read_file) file_t = 0.0 try: read_fp = screed.open(read_file) except IOError as error: print >>sys.stderr, "!! 
ERROR: !!", error print >>sys.stderr, "*** Could not open {fn}, skipping...".format(fn=read_file) else: for _, record in enumerate(read_fp): if _ % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t print >>sys.stderr, "\tswept {n} reads [{nc} labeled, \ {no} orphaned] \ ** {sec}s ({sect}s total)".format( n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t ) start_t = time.clock() seq = record.sequence name = record.name try: labels = ht.sweep_label_neighborhood(seq, traversal_range) except ValueError as e: pass else: seq_str = fmt_fasta(name, seq, labels) label_number_dist.append(len(labels)) if labels: n_labeled += 1 if len(labels) > 1: output_buffer.queue(seq_str, "multi") n_mlabeled += 1 label_dict["multi"] += 1 else: output_buffer.queue(seq_str, labels[0]) label_dict[labels[0]] += 1 else: n_orphaned += 1 output_buffer.queue(seq_str, "orphaned") label_dict["orphaned"] += 1 print >>sys.stderr, "** End of file {fn}...".format(fn=read_file) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! print >>sys.stderr, "** End of run..." output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print >>sys.stderr, "! WARNING: Sweep finished with errors !" print >>sys.stderr, "** {writee} reads not written".format(writee=output_buffer.num_write_errors) print >>sys.stderr, "** {filee} errors opening files".format(filee=output_buffer.num_file_errors) print >>sys.stderr, "swept {n_reads} for labels...".format(n_reads=n_labeled + n_orphaned) print >>sys.stderr, "...with {nc} labeled and {no} orphaned".format(nc=n_labeled, no=n_orphaned) print >>sys.stderr, "...and {nmc} multilabeled".format(nmc=n_mlabeled) print >>sys.stderr, "** outputting label number distribution..." fn = os.path.join(outdir, "{pref}.dist.txt".format(pref=output_pref)) with open(fn, "wb") as outfp: for nc in label_number_dist: outfp.write("{nc}\n".format(nc=nc)) fn = os.path.join(outdir, "{pref}.counts.csv".format(pref=output_pref)) print >>sys.stderr, "** outputting label read counts..." with open(fn, "wb") as outfp: for k in label_dict: outfp.write("{l},{c}\n".format(l=k, c=label_dict[k]))
def main(): # pylint: disable=too-many-locals,too-many-statements info('do-partition.py', ['graph']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') for infile in args.input_filenames: check_file_status(infile) check_space(args.input_filenames) print 'Saving k-mer presence table to %s' % args.graphbase print 'Loading kmers from sequences in %s' % repr(args.input_filenames) print '--' print 'SUBSET SIZE', args.subset_size print 'N THREADS', args.n_threads print '--' # load-graph print 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) for _, filename in enumerate(args.input_filenames): print 'consuming input', filename htable.consume_fasta_and_tag(filename) fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.15: # 0.18 is ACTUAL MAX. Do not change. print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the graph structure is too small for" " this data set. Increase k-mer presence table " "size/num of tables.") print >> sys.stderr, "**" sys.exit(1) # partition-graph # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print '** This script brakes for lumps: stop_big_traversals is true.' else: print '** Traverse all the things: stop_big_traversals is false.' # # now, partition! # # divide the tags up into subsets divvy = htable.divide_tags_into_subsets(int(args.subset_size)) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = Queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): start = divvy[_] end = divvy[_ + 1] worker_q.put((htable, _, start, end)) print 'enqueued %d subset tasks' % n_subsets open('%s.info' % args.graphbase, 'w').write('%d subsets total\n' % (n_subsets)) if n_subsets < args.n_threads: args.n_threads = n_subsets # start threads! print 'starting %d threads' % args.n_threads print '---' threads = [] for _ in range(args.n_threads): cur_thread = threading.Thread(target=worker, args=(worker_q, args.graphbase, stop_big_traversals)) threads.append(cur_thread) cur_thread.start() print 'done starting threads' # wait for threads for _ in threads: _.join() print '---' print 'done making subsets! see %s.subset.*.pmap' % (args.graphbase,) # merge-partitions pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print 'loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]) htable = khmer.new_hashbits(args.ksize, 1, 1) for pmap_file in pmap_files: print 'merging', pmap_file htable.merge_subset_from_disk(pmap_file) if args.remove_subsets: print 'removing pmap files' for pmap_file in pmap_files: os.unlink(pmap_file) # annotate-partitions for infile in args.input_filenames: print 'outputting partitions for', infile outfile = os.path.basename(infile) + '.part' part_count = htable.output_partitions(infile, outfile) print 'output %d partitions for %s' % (part_count, infile) print 'partitions are in', outfile
def main(): info('collect-reads.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name) check_space(args.input_sequence_filename) check_space_for_hashtable(args.n_tables * args.min_tablesize) print 'Saving k-mer counting table to %s' % base print 'Loading sequences from %s' % repr(filenames) if args.output: print 'Outputting sequences to', args.output print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) htable.set_use_bigcount(args.bigcount) total_coverage = 0. n = 0 for index, filename in enumerate(filenames): for record in screed.open(filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'G') try: med, _, _ = htable.get_median_count(seq) except ValueError: continue total_coverage += med n += 1 if total_coverage / float(n) > args.coverage: print 'reached target average coverage:', \ total_coverage / float(n) break htable.consume(seq) if args.output: args.output.write(output_single(record)) if n % 100000 == 0: print '...', index, filename, n, total_coverage / float(n) if total_coverage / float(n) > args.coverage: break print 'Collected %d reads' % (n, ) if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( htable.n_occupied()) print 'saving', base htable.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " this data set. Increase tablesize/# tables.") print >> sys.stderr, "**" sys.exit(1) print 'DONE.'
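# collect-reads.py above stops once the running mean of per-read median k-mer
# counts passes the target coverage. A sketch of that stopping rule, with a
# hypothetical estimate_median() standing in for htable.get_median_count()
# (which in the script is evaluated against the table built from the reads
# kept so far):

def collect_until_coverage(reads, target_coverage, estimate_median):
    """Keep reads until the running mean of their median counts exceeds target."""
    total_coverage = 0.0
    n = 0
    kept = []
    for seq in reads:
        total_coverage += estimate_median(seq)
        n += 1
        if total_coverage / n > target_coverage:
            # target reached; the read that tipped the average is not kept,
            # mirroring the break-before-consume order in the script above
            break
        kept.append(seq)
    return kept, (total_coverage / n if n else 0.0)

if __name__ == '__main__':
    # trivial usage: a constant estimator never reaches coverage 3,
    # so all reads are kept
    kept, avg = collect_until_coverage(['ACGT'] * 10, 3, lambda seq: 1)
    print('%d reads kept, mean coverage %.1f' % (len(kept), avg))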