def main():
    """Annotate sequences with partition IDs from a merged partition map."""
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    kmer_len = args.ksize
    seqfiles = args.input_filenames
    htable = khmer.new_hashbits(kmer_len, 1, 1)

    pmap_path = args.graphbase + '.pmap.merged'

    # Verify all inputs exist (or bail) before doing any work.
    check_file_status(pmap_path, args.force)
    for seqfile in seqfiles:
        check_file_status(seqfile, args.force)
    check_space(seqfiles, args.force)

    print >>sys.stderr, 'loading partition map from:', pmap_path
    htable.load_partitionmap(pmap_path)

    for seqfile in seqfiles:
        print >>sys.stderr, 'outputting partitions for', seqfile
        partfile = os.path.basename(seqfile) + '.part'
        n_parts = htable.output_partitions(seqfile, partfile)
        print >>sys.stderr, 'output %d partitions for %s' % (
            n_parts, seqfile)
        print >>sys.stderr, 'partitions are in', partfile
def main():
    """Merge on-disk subset pmap files into a single merged partition map."""
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])

    htable = khmer.new_hashbits(args.ksize, 1, 1)

    for pmap in pmap_files:
        check_file_status(pmap, args.force)
    check_space(pmap_files, args.force)

    for pmap in pmap_files:
        print >> sys.stderr, 'merging', pmap
        htable.merge_subset_from_disk(pmap)

    print >> sys.stderr, 'saving merged to', output_file
    htable.save_partitionmap(output_file)

    # Optionally clean up the now-redundant per-subset files.
    if args.remove_subsets:
        print >> sys.stderr, 'removing pmap files'
        for pmap in pmap_files:
            os.unlink(pmap)
def main():
    """Write a .part file per input, labeling each read with its partition."""
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    htable = khmer.new_hashbits(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    # Fail fast if any input is missing or disk space is low.
    check_file_status(partitionmap_file, args.force)
    for filename in filenames:
        check_file_status(filename, args.force)
    check_space(filenames, args.force)

    print >> sys.stderr, 'loading partition map from:', partitionmap_file
    htable.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print >> sys.stderr, 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'

        part_count = htable.output_partitions(infile, outfile)

        print >> sys.stderr, 'output %d partitions for %s' % (part_count,
                                                              infile)
        print >> sys.stderr, 'partitions are in', outfile
def main():
    """Report median/average/stddev k-mer counts for each input sequence."""
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    table_path = args.ctfile
    seq_path = args.input
    out_path = args.output

    required = [table_path, seq_path]
    for needed in required:
        check_file_status(needed, args.force)
    check_space(required, args.force)

    print >>sys.stderr, 'loading k-mer counting table from', table_path
    htable = khmer.load_counting_hash(table_path)
    ksize = htable.ksize()

    print >>sys.stderr, 'writing to', out_path
    output = open(out_path, 'w')

    for record in screed.open(seq_path):
        seq = record.sequence.upper()
        # Ns cannot be counted; substitute G so the k-mers remain scorable.
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        # Sequences shorter than K have no k-mers at all; skip them.
        if ksize <= len(seq):
            medn, ave, stdev = htable.get_median_count(seq)
            print >> output, record.name, medn, ave, stdev, len(seq)
def main():
    """For every read, print its median, mean, and stddev k-mer abundance."""
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    htfile = args.ctfile
    input_filename = args.input
    output_filename = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_file_status(infile, args.force)
    check_space(infiles, args.force)

    print >> sys.stderr, 'loading k-mer counting table from', htfile
    counts = khmer.load_counting_hash(htfile)
    kmer_len = counts.ksize()

    print >> sys.stderr, 'writing to', output_filename
    outfp = open(output_filename, 'w')

    for record in screed.open(input_filename):
        sequence = record.sequence.upper()
        # The counting table has no entry for N; map it to G first.
        if 'N' in sequence:
            sequence = sequence.replace('N', 'G')

        if len(sequence) >= kmer_len:
            medn, ave, stdev = counts.get_median_count(sequence)
            print >> outfp, record.name, medn, ave, stdev, len(sequence)
def main():
    """Combine all .subset.*.pmap files into one merged partition map."""
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    merged_path = args.graphbase + '.pmap.merged'
    subset_paths = glob.glob(args.graphbase + '.subset.*.pmap')

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(subset_paths), subset_paths[0])

    htable = khmer.new_hashbits(args.ksize, 1, 1)

    for subset_path in subset_paths:
        check_file_status(subset_path, args.force)
    check_space(subset_paths, args.force)

    for subset_path in subset_paths:
        print >>sys.stderr, 'merging', subset_path
        htable.merge_subset_from_disk(subset_path)

    print >>sys.stderr, 'saving merged to', merged_path
    htable.save_partitionmap(merged_path)

    # Subset files are redundant once merged; delete them if requested.
    if args.remove_subsets:
        print >>sys.stderr, 'removing pmap files'
        for subset_path in subset_paths:
            os.unlink(subset_path)
def test_check_file_status_kfile():
    """check_file_status must SystemExit on a nonexistent file (force off)."""
    missing = utils.get_temp_filename('thisfiledoesnotexist')
    try:
        check_file_status(missing, False)
    except SystemExit:
        return  # expected: the missing file triggered an exit
    assert False
def main():
    """Write the k-mer abundance histogram of a sequence file to disk.

    Loads a pre-built counting table, computes the abundance distribution
    of the input sequences against it, and writes one line per abundance:
    abundance, count, cumulative count, cumulative fraction.
    """
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    infiles = [args.input_counting_table_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_file_status(infile, args.force)

    print('hashtable from', args.input_counting_table_filename,
          file=sys.stderr)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    # Tracking table marks k-mers already tallied so each is counted once.
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size, file=sys.stderr)
    print('HT sizes:', hashsizes, file=sys.stderr)
    print('outputting to', args.output_histogram_filename, file=sys.stderr)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename, file=sys.stderr)
            sys.exit(1)
        print('** squashing existing file %s' %
              args.output_histogram_filename, file=sys.stderr)

    print('preparing hist...', file=sys.stderr)
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    # FIX: the original bound the abundance value to `_` (a name that
    # signals "unused") while printing it, and never closed the output
    # file; use a meaningful name and a context manager.
    with open(args.output_histogram_filename, 'w') as hash_fp:
        sofar = 0
        for abundance, count in enumerate(abundances):
            if count == 0 and not args.output_zero:
                continue

            sofar += count
            frac = sofar / float(total)

            print(abundance, count, sofar, round(frac, 3), file=hash_fp)

            if sofar == total:
                break
def main():
    """Interleave two paired-read files into a single output stream."""
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for filename in args.infiles:
        check_file_status(filename, args.force)
    check_space(args.infiles, args.force)

    left_file = args.infiles[0]
    if len(args.infiles) == 2:
        right_file = args.infiles[1]
    else:
        # With a single file, derive the R2 name by convention.
        right_file = left_file.replace('_R1_', '_R2_')
        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % right_file)

    fail = False
    if not os.path.exists(left_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % left_file
        fail = True
    if not os.path.exists(right_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % right_file
        fail = True
    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (left_file, right_file)

    n_pairs = 0
    for read1, read2 in itertools.izip(screed.open(left_file),
                                       screed.open(right_file)):
        if n_pairs % 100000 == 0:
            print >> sys.stderr, '...', n_pairs, 'pairs'
        n_pairs += 1

        # Normalize names to the /1 and /2 suffix convention.
        name1 = read1.name
        if not name1.endswith('/1'):
            name1 += '/1'
        name2 = read2.name
        if not name2.endswith('/2'):
            name2 += '/2'

        assert name1[:-2] == name2[:-2], \
            "This doesn't look like paired data! %s %s" % (name1, name2)

        read1.name = name1
        read2.name = name2
        write_record(read1, args.output)
        write_record(read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % n_pairs
    print >> sys.stderr, 'output written to', args.output
def main():
    """Zip matching R1/R2 reads together into one interleaved output."""
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for infile in args.infiles:
        check_file_status(infile, args.force)
    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    # Validate both sides exist; only --force proceeds past a failure.
    fail = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True
    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True
    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    pair_count = 0
    paired_iter = itertools.izip(screed.open(s1_file), screed.open(s2_file))
    for rec1, rec2 in paired_iter:
        if pair_count % 100000 == 0:
            print >> sys.stderr, '...', pair_count, 'pairs'
        pair_count += 1

        label1 = rec1.name
        label2 = rec2.name
        if not label1.endswith('/1'):
            label1 += '/1'
        if not label2.endswith('/2'):
            label2 += '/2'

        # Names must agree once the /1 and /2 suffixes are stripped.
        assert label1[:-2] == label2[:-2], \
            "This doesn't look like paired data! %s %s" % (label1, label2)

        rec1.name = label1
        rec2.name = label2
        write_record(rec1, args.output)
        write_record(rec2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % pair_count
    print >> sys.stderr, 'output written to', args.output
def main():
    """Build an initial .stoptags file from a partitioned graph."""
    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for infile in infiles:
        check_file_status(infile, args.force)
    check_space(infiles, args.force)

    print >> sys.stderr, 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >> sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >> sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    counting = khmer.new_counting_hash(htable.ksize(), args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >> sys.stderr, 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >> sys.stderr, 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print >> sys.stderr, 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')

    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def main():
    """Trim reads at low-abundance k-mers using a saved counting table."""
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for filename in infiles:
        check_file_status(filename, args.force)
    check_space(infiles, args.force)

    print >> sys.stderr, 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print >> sys.stderr, "K:", ksize

    def process_fn(record):
        # Return (name, seq) to keep a (possibly trimmed) read,
        # or (None, None) to discard it.
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:
            # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)
        if trim_at >= ksize:
            return name, trim_seq
        return None, None

    # the filtering loop
    for infile in infiles:
        print >> sys.stderr, 'filtering', infile
        if args.single_output_filename != '':
            # All inputs append into one shared output file.
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print >> sys.stderr, 'output in', outfile
def main():
    """Filter reads by k-mer abundance, trimming at the cutoff threshold."""
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for seqfile in infiles:
        check_file_status(seqfile, args.force)
    check_space(infiles, args.force)

    print >>sys.stderr, 'loading hashtable'
    counts = khmer.load_counting_hash(counting_ht)
    kmer_len = counts.ksize()
    print >>sys.stderr, "K:", kmer_len

    def process_fn(record):
        # Decide a read's fate: drop Ns outright, keep high-coverage
        # reads untouched under --variable-coverage, otherwise trim.
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:
            med, _, _ = counts.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trimmed, cut_pos = counts.trim_on_abundance(seq, args.cutoff)
        if cut_pos >= kmer_len:
            return name, trimmed
        return None, None

    # Process every input through the threaded filter pipeline.
    for seqfile in infiles:
        print >>sys.stderr, 'filtering', seqfile

        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(seqfile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(seqfile), outfp)

        print >>sys.stderr, 'output in', outfile
def main():
    """Split an interleaved read file into separate /1 and /2 files."""
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile
    check_file_status(infile, args.force)
    check_space([infile], args.force)

    out1 = os.path.basename(infile) + '.1'
    out2 = os.path.basename(infile) + '.2'
    fp_out1 = open(out1, 'w')
    fp_out2 = open(out2, 'w')

    # Peek at the first record to decide between FASTQ and FASTA output.
    first = iter(screed.open(infile)).next()
    is_fastq = hasattr(first, 'accuracy')

    counter1 = 0
    counter2 = 0
    index = None
    for index, record in enumerate(screed.open(infile)):
        if index % 100000 == 0:
            print >> sys.stderr, '...', index

        name = record.name
        if name.endswith('/1'):
            if is_fastq:
                print >> fp_out1, '@%s\n%s\n+\n%s' % (record.name,
                                                      record.sequence,
                                                      record.accuracy)
            else:
                print >> fp_out1, '>%s\n%s' % (record.name, record.sequence,)
            counter1 += 1
        elif name.endswith('/2'):
            if is_fastq:
                print >> fp_out2, '@%s\n%s\n+\n%s' % (record.name,
                                                      record.sequence,
                                                      record.accuracy)
            else:
                print >> fp_out2, '>%s\n%s' % (record.name, record.sequence,)
            counter2 += 1

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (index + 1, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
def main():
    """Split an interleaved file into /1 and /2 outputs, honoring -o/-1/-2."""
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile
    check_file_status(infile, args.force)
    check_space([infile], args.force)

    basename = os.path.basename(infile)
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + basename + '.1'
        out2 = args.output_directory + '/' + basename + '.2'
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE defaults with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    fp_out1 = open(out1, 'w')
    fp_out2 = open(out2, 'w')

    counter1 = 0
    counter2 = 0
    index = None
    for index, record in enumerate(screed.open(infile)):
        # Progress marker every 100k records (skipping record 0).
        if index % 100000 == 0 and index:
            print >> sys.stderr, '...', index

        name = record.name
        if name.endswith('/1'):
            write_record(record, fp_out1)
            counter1 += 1
        elif name.endswith('/2'):
            write_record(record, fp_out2)
            counter2 += 1

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (index + 1, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
def main():
    """Separate a mixed read stream into paired (.pe) and orphan (.se) files."""
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_file_status(args.infile, args.force)
    check_space([args.infile], args.force)

    outfile = os.path.basename(args.infile)
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    single_fp = open(outfile + '.se', 'w')
    paired_fp = open(outfile + '.pe', 'w')

    print >> sys.stderr, 'reading file "%s"' % args.infile
    print >> sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
    print >> sys.stderr, 'outputting orphans to "%s.se"' % outfile

    n_pe = 0
    n_se = 0

    screed_iter = screed.open(args.infile, parse_description=False)
    for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index > 0:
            print >> sys.stderr, '...', index

        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    single_fp.close()
    paired_fp.close()

    # A run with zero pairs almost certainly means a format problem.
    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print >>sys.stderr, 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % \
        (n_pe * 2 + n_se, n_pe, n_se)

    print >> sys.stderr, 'wrote to: ' + outfile \
        + '.se' + ' and ' + outfile + '.pe'
def main():
    """Route paired reads to <out>.pe and orphans to <out>.se."""
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_file_status(args.infile, args.force)
    check_space([args.infile], args.force)

    out_base = os.path.basename(args.infile)
    if len(sys.argv) > 2:
        # A second positional argument overrides the output basename.
        out_base = sys.argv[2]

    orphan_fp = open(out_base + '.se', 'w')
    pair_fp = open(out_base + '.pe', 'w')

    print >>sys.stderr, 'reading file "%s"' % args.infile
    print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % out_base
    print >>sys.stderr, 'outputting orphans to "%s.se"' % out_base

    num_paired = 0
    num_single = 0

    reader = screed.open(args.infile, parse_description=False)
    for index, is_pair, read1, read2 in broken_paired_reader(reader):
        if index % 100000 == 0 and index > 0:
            print >>sys.stderr, '...', index

        if is_pair:
            write_record_pair(read1, read2, pair_fp)
            num_paired += 1
        else:
            write_record(read1, orphan_fp)
            num_single += 1

    orphan_fp.close()
    pair_fp.close()

    if num_paired == 0:
        raise Exception("no paired reads!? check file formats...")

    print >>sys.stderr, 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % \
        (num_paired * 2 + num_single, num_paired, num_single)

    print >> sys.stderr, 'wrote to: ' + out_base \
        + '.se' + ' and ' + out_base + '.pe'
def main():
    """Count unique and overlapping k-mers between a presence table and a
    sequence file, writing a report plus a .curve file."""
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_file_status(infile, args.force)
    check_space([args.ptfile, args.fafile], args.force)

    print >> sys.stderr, 'loading k-mer presence table from', args.ptfile
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')
    if args.csv:
        f_curve_obj_csv = csv.writer(f_curve_obj)
        # write headers:
        f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    # The curve array holds 100 x-values followed by 100 y-values.
    for i in range(100):
        if args.csv:
            f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
        else:
            print >> f_curve_obj, list_curve[100 + i], list_curve[i]

    print >> sys.stderr, 'wrote to: ' + args.report_filename
def main():
    """Report per-read k-mer count statistics, optionally as CSV."""
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    htfile = args.ctfile
    input_filename = args.input
    output_filename = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_file_status(infile, args.force)
    check_space(infiles, args.force)

    print >>sys.stderr, 'loading k-mer counting table from', htfile
    htable = khmer.load_counting_hash(htfile)
    ksize = htable.ksize()

    print >>sys.stderr, 'writing to', output_filename
    output = open(output_filename, 'w')
    if args.csv:
        output = csv.writer(output)
        # write headers:
        output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    parse_description = True            # @legacy behavior: split seq headers
    if args.csv:
        parse_description = False       # only enable if we're doing csv out

    reads = screed.open(input_filename,
                        parse_description=parse_description)
    for record in reads:
        seq = record.sequence.upper()
        # Ns are uncountable; substitute G so k-mers can be looked up.
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        if ksize <= len(seq):
            medn, ave, stdev = htable.get_median_count(seq)
            if args.csv:
                output.writerow([record.name, medn, ave, stdev, len(seq)])
            else:
                print >> output, record.name, medn, ave, stdev, len(seq)
def main():
    """Emit median/average/stddev k-mer counts per read (text or CSV)."""
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    table_file = args.ctfile
    seq_file = args.input
    report_file = args.output

    for needed in [table_file, seq_file]:
        check_file_status(needed, args.force)
    check_space([table_file, seq_file], args.force)

    print >> sys.stderr, 'loading k-mer counting table from', table_file
    counts = khmer.load_counting_hash(table_file)
    kmer_len = counts.ksize()

    print >> sys.stderr, 'writing to', report_file
    output = open(report_file, 'w')
    if args.csv:
        output = csv.writer(output)
        # write headers:
        output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    # @legacy behavior: split seq headers, except in CSV mode where the
    # full header must survive as the name column.
    parse_description = not args.csv

    for record in screed.open(seq_file,
                              parse_description=parse_description):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        if len(seq) >= kmer_len:
            medn, ave, stdev = counts.get_median_count(seq)
            if args.csv:
                output.writerow([record.name, medn, ave, stdev, len(seq)])
            else:
                print >> output, record.name, medn, ave, stdev, len(seq)
def main():
    """Measure k-mer overlap of a sequence file against a presence table."""
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for needed in [args.ptfile, args.fafile]:
        check_file_status(needed, args.force)
    check_space([args.ptfile, args.fafile], args.force)

    print >>sys.stderr, 'loading k-mer presence table from', args.ptfile
    presence = khmer.load_hashbits(args.ptfile)
    kmer_len = presence.ksize()

    report_fp = open(args.report_filename, 'w')
    curve_fp = open(args.report_filename + '.curve', 'w')
    if args.csv:
        curve_writer = csv.writer(curve_fp)
        # write headers:
        curve_writer.writerow(['input_seq', 'overlap_kmer'])

    scratch = khmer.new_hashbits(kmer_len, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = scratch.count_overlap(args.fafile,
                                                              presence)

    summary = """\
dataset1(pt file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    report_fp.write(summary)

    # First 100 entries are one axis of the curve, next 100 the other.
    for i in range(100):
        if args.csv:
            curve_writer.writerow([list_curve[100 + i], list_curve[i]])
        else:
            print >> curve_fp, list_curve[100 + i], list_curve[i]

    print >> sys.stderr, 'wrote to: ' + args.report_filename
def main():
    """Trim reads at stop-tag k-mers loaded from a stoptags file."""
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for infile in infiles:
        check_file_status(infile, args.force)
    check_space(infiles, args.force)

    print >> sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        # Keep a read only if trimming at stop tags leaves >= K bases.
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq
        return None, None

    # the filtering loop
    for infile in infiles:
        print >> sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print >> sys.stderr, 'output in', outfile
def main():
    """Filter input reads against a stop-tag set, writing .stopfilt files."""
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for seqfile in infiles:
        check_file_status(seqfile, args.force)
    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    tags_table = khmer.new_hashbits(args.ksize, 1, 1)
    tags_table.load_stop_tags(stoptags)

    def process_fn(record):
        # Drop N-containing reads; otherwise trim at the first stop tag
        # and keep the read if at least K bases remain.
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trimmed, cut_pos = tags_table.trim_on_stoptags(seq)
        if cut_pos >= args.ksize:
            return name, trimmed
        return None, None

    # Run every input through the threaded filter.
    for seqfile in infiles:
        print >>sys.stderr, 'filtering', seqfile
        outfile = os.path.basename(seqfile) + '.stopfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(seqfile), outfp)

        print >>sys.stderr, 'output in', outfile
def main():  # pylint: disable=too-many-locals,too-many-branches
    """Extract partitioned reads into group files plus a size histogram.

    Reads .part files (sequences annotated with partition IDs), writes a
    partition-size distribution to <prefix>.dist, optionally dumps
    unassigned (pid 0) reads, then bins partitions into group files of at
    most --max-size reads each.
    """
    info('extract-partitions.py', ['graph'])
    args = get_parser().parse_args()

    distfilename = args.prefix + '.dist'

    n_unassigned = 0

    for infile in args.part_filenames:
        check_file_status(infile, args.force)

    check_space(args.part_filenames, args.force)

    print >>sys.stderr, '---'
    print >>sys.stderr, 'reading partitioned files:', repr(
        args.part_filenames)
    if args.output_groups:
        print >>sys.stderr, 'outputting to files named "%s.groupN.fa"' % \
            args.prefix
        print >>sys.stderr, 'min reads to keep a partition:', \
            args.min_part_size
        print >>sys.stderr, 'max size of a group file:', args.max_size
    else:
        print >>sys.stderr, 'NOT outputting groups! Beware!'

    if args.output_unassigned:
        print >>sys.stderr, \
            'outputting unassigned reads to "%s.unassigned.fa"' % \
            args.prefix
    print >>sys.stderr, 'partition size distribution will go to %s' \
        % distfilename
    print >>sys.stderr, '---'

    #

    suffix = 'fa'
    is_fastq = False

    # Sniff the output format (FASTA vs FASTQ) from the first record of
    # the first input file; 'accuracy' is only present on FASTQ records.
    for index, read, pid in read_partition_file(args.part_filenames[0]):
        if hasattr(read, 'accuracy'):
            suffix = 'fq'
            is_fastq = True
        break

    # Every input must match the first file's format; check record one
    # of each file.
    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if is_fastq:
                assert hasattr(read, 'accuracy'), \
                    "all input files must be FASTQ if the first one is"
            else:
                assert not hasattr(read, 'accuracy'), \
                    "all input files must be FASTA if the first one is"
            break

    if args.output_unassigned:
        unassigned_fp = open('%s.unassigned.%s' %
                             (args.prefix, suffix), 'w')

    # First pass: tally reads per partition ID (pid 0 == unassigned),
    # writing unassigned reads out along the way if requested.
    count = {}
    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if index % 100000 == 0:
                print >>sys.stderr, '...', index

            count[pid] = count.get(pid, 0) + 1

            if pid == 0:
                n_unassigned += 1
                if args.output_unassigned:
                    write_record(read, unassigned_fp)

    if args.output_unassigned:
        unassigned_fp.close()

    if 0 in count:          # eliminate unpartitioned sequences
        del count[0]

    # develop histogram of partition sizes
    dist = {}
    for pid, size in count.items():
        dist[size] = dist.get(size, 0) + 1

    # output histogram: size, #partitions, cumulative count, weighted total
    distfp = open(distfilename, 'w')

    total = 0
    wtotal = 0
    for counter, index in sorted(dist.items()):
        total += index
        wtotal += counter * index
        distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal))
    distfp.close()

    if not args.output_groups:
        sys.exit(0)

    # sort groups by size; drop partitions at or below min_part_size
    divvy = sorted(count.items(), key=lambda y: y[1])
    divvy = [y for y in divvy if y[1] > args.min_part_size]

    # divvy up into different groups, based on having max_size sequences
    # in each group.
    total = 0
    group = set()
    group_n = 0
    group_d = {}
    for partition_id, n_reads in divvy:
        group.add(partition_id)
        total += n_reads

        if total > args.max_size:
            for partition_id in group:
                group_d[partition_id] = group_n
                # print 'group_d', partition_id, group_n

            group_n += 1
            group = set()
            total = 0

    # flush the final, partially-filled group
    if group:
        for partition_id in group:
            group_d[partition_id] = group_n
            # print 'group_d', partition_id, group_n
        group_n += 1

    print >>sys.stderr, '%d groups' % group_n
    if group_n == 0:
        print >>sys.stderr, 'nothing to output; exiting!'
        return

    # open a bunch of output files for the different groups
    group_fps = {}
    for _ in range(group_n):
        group_fp = open('%s.group%04d.%s' % (args.prefix, _, suffix), 'w')
        group_fps[_] = group_fp

    # write 'em all out!
    # Second pass: route each read to its partition's group file.
    total_seqs = 0
    part_seqs = 0
    toosmall_parts = 0
    for filename in args.part_filenames:
        for index, read, partition_id in read_partition_file(filename):
            total_seqs += 1
            if index % 100000 == 0:
                print >>sys.stderr, '...x2', index

            if partition_id == 0:
                continue

            try:
                group_n = group_d[partition_id]
            except KeyError:
                # Not in any group => partition was below min_part_size.
                assert count[partition_id] <= args.min_part_size
                toosmall_parts += 1
                continue

            outfp = group_fps[group_n]

            write_record(read, outfp)
            part_seqs += 1

    print >>sys.stderr, '---'
    print >>sys.stderr, 'Of %d total seqs,' % total_seqs
    print >>sys.stderr, 'extracted %d partitioned seqs into group files,' % \
        part_seqs
    print >>sys.stderr, \
        'discarded %d sequences from small partitions (see -m),' % \
        toosmall_parts
    print >>sys.stderr, 'and found %d unpartitioned sequences (see -U).' % \
        n_unassigned
    print >>sys.stderr, ''
    print >>sys.stderr, 'Created %d group files named %s.groupXXXX.%s' % \
        (len(group_fps), args.prefix, suffix)
def main():
    """Load sequences into a k-mer counting table and save it to disk.

    Consumes each input file with --threads worker threads, periodically
    mid-saves the table, appends progress to <base>.info, optionally
    writes a machine-readable summary (json/tsv), and aborts if the
    estimated false-positive rate exceeds 0.20.
    """
    info('load-into-counting.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name, args.force)

    check_space(args.input_sequence_filename, args.force)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)

    print >>sys.stderr, 'Saving k-mer counting table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    filename = None

    for index, filename in enumerate(filenames):

        # One shared ReadParser feeds all consumer threads for this file.
        rparser = khmer.ReadParser(filename)
        threads = []
        print >>sys.stderr, 'consuming input', filename
        for _ in xrange(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        # Checkpoint the table every 10 input files (after rechecking
        # disk space).
        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                      args.force)
            print >>sys.stderr, 'mid-save', base
            htable.save(base)
        with open(base + '.info', 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >>info_fp, 'Total number of unique k-mers:', n_kmers

    print >>sys.stderr, 'saving', base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    # Optional machine-readable summary in json or tsv format.
    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print >> sys.stderr, "Writing summmary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base), fpr=fp_rate, k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Change 0.2 only if you really grok it. HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
        print >> sys.stderr, "for this data set. Increase tablesize/# tables."
        print >> sys.stderr, "**"
        sys.exit(1)

    print >>sys.stderr, 'DONE.'
    print >>sys.stderr, 'wrote to:', base + '.info'
def test_check_file_status_kfile_force():
    """With force=True, a missing file must not raise OSError."""
    fn = utils.get_temp_filename('thisfiledoesnotexist')
    try:
        check_file_status(fn, True)
    except OSError:
        # FIX: the exception was previously bound to an unused name `e`;
        # drop the binding — only the failure itself matters here.
        assert False
def main():
    """Entry point for sweep-reads-buffered.py (screed 'accuracy' API).

    Labels a graph from a partitioned FASTA/FASTQ (by pid, by sequence,
    or by fixed-size groups), then sweeps each input read file and
    buckets reads by the labels found in their graph neighborhood.
    Writes per-label read files plus .dist.txt / .counts.csv summaries.
    """
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    # clamp user-supplied sizes to the module minimums
    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = ix.next()
    del ix

    extension = 'fa'
    if hasattr(record, 'accuracy'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >>sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            print >>sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >>sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            # default: one label per group of args.group_size sequences,
            # writing each group out to its own _base_<g> file
            print >>sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                            g=g,
                                                            ext=extension
                                                            ), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g,
                                ext=extension), 'wb')
                    if n % 50000 == 0:
                        print >>sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)
                    if hasattr(record, 'accuracy'):
                        outfp.write('@{name}\n{seq}+{accuracy}\n'.format(
                            name=record.name,
                            seq=record.sequence,
                            accuracy=record.accuracy))
                    else:
                        outfp.write('>{name}\n{seq}\n'.format(
                            name=record.name, seq=record.sequence))
            except IOError as e:
                print >>sys.stderr, '!! ERROR !!', e
                print >>sys.stderr, '...error splitting input. exiting...'
    except IOError as e:
        print >>sys.stderr, '!! ERROR: !!', e
        print >>sys.stderr, '...error consuming \
                            {i}. exiting...'.format(i=input_fastp)

    print >>sys.stderr, 'done consuming input sequence. \
                        added {t} tags and {l} \
                        labels...'.format(t=ht.n_tags(), l=ht.n_labels())

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >>sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >>sys.stderr, '!! ERROR: !!', error
            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \
                                         {no} orphaned] \
                                         ** {sec}s ({sect}s total)' \
                        .format(n=_, nc=n_labeled, no=n_orphaned,
                                sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    # read too short for the ksize (presumably) -- skip it
                    pass
                else:
                    if hasattr(record, 'accuracy'):
                        seq_str = fmt_fastq(name, seq, record.accuracy, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >>sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print >>sys.stderr, '! WARNING: Sweep finished with errors !'
        print >>sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >>sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >>sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)

    print >>sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >>sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
def main():
    """Entry point for load-graph.py: build a k-mer presence table.

    Consumes every input file with ``args.threads`` threads, saves the
    hashbits table to <base>.pt and (unless --no-build-tagset) the tagset
    to <base>.tagset, and writes a one-line summary to <base>.info.
    Exits non-zero if the estimated false-positive rate exceeds 0.15,
    unless --force is given.
    """
    info('load-graph.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames

    for _ in args.input_filenames:
        check_file_status(_, args.force)

    check_space(args.input_filenames, args.force)
    # presence tables use 1 bit per entry, hence the /8 on the byte estimate
    check_space_for_hashtable(
        (float(args.n_tables * args.min_tablesize) / 8.), args.force)

    print >>sys.stderr, 'Saving k-mer presence table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print >>sys.stderr, 'We WILL NOT build the tagset.'
    else:
        print >>sys.stderr, 'We WILL build the tagset', \
            ' (for partitioning/traversal).'

    print >>sys.stderr, 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    for _, filename in enumerate(filenames):
        # one shared ReadParser feeds all consumer threads for this file
        rparser = khmer.ReadParser(filename)
        threads = []
        print >>sys.stderr, 'consuming input', filename
        for num in xrange(args.threads):
            cur_thread = threading.Thread(
                target=target_method, args=(rparser,))
            threads.append(cur_thread)
            cur_thread.start()

        for thread in threads:
            thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    print >>sys.stderr, 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print >>sys.stderr, 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    fp_rate = khmer.calc_expected_collisions(htable)

    # BUGFIX: the .info file handle used to be opened and never closed;
    # a context manager guarantees the summary is flushed to disk.
    with open(base + '.info', 'w') as info_fp:
        info_fp.write('%d unique k-mers' % htable.n_unique_kmers())
        if args.write_fp_rate:
            print >> info_fp, \
                '\nfalse positive rate estimated to be %1.3f' % fp_rate

    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set.  Increase table size/# tables.")
        print >> sys.stderr, "**"
        if not args.force:
            sys.exit(1)

    print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt'
    if not args.no_build_tagset:
        print >> sys.stderr, 'and ' + base + '.tagset'
def main():  # pylint: disable=too-many-locals,too-many-statements
    """Entry point for do-partition.py: load-graph + partition-graph +
    merge-partitions + annotate-partitions in a single run.

    Builds the presence table, partitions the tag graph with worker
    threads, merges the resulting .pmap subsets, and writes one .part
    file per input with partition annotations.
    """
    info('do-partition.py', ['graph'])
    args = get_parser().parse_args()

    report_on_config(args, hashtype='hashbits')

    for infile in args.input_filenames:
        check_file_status(infile, args.force)

    check_space(args.input_filenames, args.force)

    print >>sys.stderr, 'Saving k-mer presence table to %s' % args.graphbase
    print >>sys.stderr, 'Loading kmers from sequences in %s' % \
        repr(args.input_filenames)
    print >>sys.stderr, '--'
    print >>sys.stderr, 'SUBSET SIZE', args.subset_size
    print >>sys.stderr, 'N THREADS', args.threads
    print >>sys.stderr, '--'

    # load-graph
    print >>sys.stderr, 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    for _, filename in enumerate(args.input_filenames):
        print >>sys.stderr, 'consuming input', filename
        htable.consume_fasta_and_tag(filename)

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for"
                              " this data set.  Increase k-mer presence table "
                              "size/num of tables.")
        print >> sys.stderr, "**"
        if not args.force:
            sys.exit(1)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print >>sys.stderr, '** This script brakes for lumps: ', \
            'stop_big_traversals is true.'
    else:
        print >>sys.stderr, '** Traverse all the things:', \
            ' stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    # sentinel end marker so divvy[_ + 1] is valid for the last subset
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
                                                % (n_subsets))

    # never start more threads than there are tasks
    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print >>sys.stderr, 'starting %d threads' % args.threads
    print >>sys.stderr, '---'

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    # workers + this main thread
    assert threading.active_count() == args.threads + 1

    print >>sys.stderr, 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print >>sys.stderr, '---'
    print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
        (args.graphbase,)

    # merge-partitions
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])

    # a minimal table is enough for merging partition maps
    htable = khmer.new_hashbits(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print >>sys.stderr, 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print >>sys.stderr, 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions
    for infile in args.input_filenames:
        print >>sys.stderr, 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print >>sys.stderr, 'output %d partitions for %s' % (
            part_count, infile)
        print >>sys.stderr, 'partitions are in', outfile
def main():  # pylint: disable=too-many-locals,too-many-branches
    """Entry point for abundance-dist-single.py: build a counting table
    from one sequence file and write its k-mer abundance histogram.

    Round 1 consumes the input into the counting table; round 2 computes
    per-thread abundance distributions which are then summed.  Output is
    'abundance count cumulative fraction' lines in the histogram file.
    """
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename, args.force)
    check_space([args.input_sequence_filename], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    # refuse to overwrite an existing histogram unless --squash
    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')

    print >>sys.stderr, 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables)
    counting_hash.set_use_bigcount(args.bigcount)

    print >> sys.stderr, 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print >>sys.stderr, 'kmer_size:', counting_hash.ksize()
    print >>sys.stderr, 'k-mer counting table sizes:', \
        counting_hash.hashsizes()
    print >>sys.stderr, 'outputting to', args.output_histogram_filename

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers())

    abundance_lists = []

    # each thread appends one distribution list; summed below
    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print >>sys.stderr, 'preparing hist from %s...' % \
        args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print >>sys.stderr, 'consuming input, round 2 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    # merge the per-thread distributions: abundance -> total count
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        # columns: abundance, count, cumulative count, cumulative fraction
        print >> hist_fp, _, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table ', args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        counting_hash.save(args.savetable)

    print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename
def main(): info('interleave-reads.py') args = get_parser().parse_args() for _ in args.infiles: check_file_status(_, args.force) check_space(args.infiles, args.force) s1_file = args.infiles[0] if len(args.infiles) == 2: s2_file = args.infiles[1] else: s2_file = s1_file.replace('_R1_', '_R2_') if s1_file == s2_file: print >> sys.stderr, ("ERROR: given only one filename, that " "doesn't contain _R1_. Exiting.") sys.exit(1) print >> sys.stderr, ("given only one file; " "guessing that R2 file is %s" % s2_file) fail = False if not os.path.exists(s1_file): print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file fail = True if not os.path.exists(s2_file): print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file fail = True if fail and not args.force: sys.exit(1) print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file) counter = 0 screed_iter_1 = screed.open(s1_file, parse_description=False) screed_iter_2 = screed.open(s2_file, parse_description=False) for read1, read2 in itertools.izip_longest(screed_iter_1, screed_iter_2): if read1 is None or read2 is None: print >> sys.stderr, ("ERROR: Input files contain different number" " of records.") sys.exit(1) if counter % 100000 == 0: print >> sys.stderr, '...', counter, 'pairs' counter += 1 name1 = read1.name if not check_is_left(name1): name1 += '/1' name2 = read2.name if not check_is_right(name2): name2 += '/2' read1.name = name1 read2.name = name2 if not check_is_pair(read1, read2): print >>sys.stderr, "ERROR: This doesn't look like paired data! " \ "%s %s" % (read1.name, read2.name) sys.exit(1) write_record_pair(read1, read2, args.output) print >> sys.stderr, 'final: interleaved %d pairs' % counter print >> sys.stderr, 'output written to', args.output.name
def main():
    """Entry point for find-knots.py: identify and save stoptags for
    highly-connected regions ("knots") in a partitioned graph.

    Loads <graphbase>.pt / .tagset (and .stoptags if present), then
    repartitions + merges each .subset.*.pmap file in turn, saving the
    accumulated stoptags after every subset and renaming processed
    pmap files to *.processed.
    """
    info('find-knots.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    # NOTE(review): check_file_status/check_space are called WITHOUT
    # args.force here, unlike every other script in this file -- confirm
    # whether this parser defines --force and whether it should be passed.
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, 'loading k-mer presence table %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print >>sys.stderr, 'loading stoptags %s.stoptags' % graphbase
        htable.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])
    print >>sys.stderr, '---'
    print >>sys.stderr, 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print >>sys.stderr, \
            '(these output stoptags will include the already-loaded set)'
    print >>sys.stderr, '---'

    # create counting hash
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print >>sys.stderr, '<-', subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >>sys.stderr, '** repartitioning subset... %s' % subset_file
        htable.repartition_largest_partition(subset, counting,
                                             EXCURSION_DISTANCE,
                                             EXCURSION_KMER_THRESHOLD,
                                             EXCURSION_KMER_COUNT_THRESHOLD)

        print >>sys.stderr, '** merging subset... %s' % subset_file
        htable.merge_subset(subset)

        # second pass over the merged map (subset=None)
        print >>sys.stderr, '** repartitioning, round 2... %s' % subset_file
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print >>sys.stderr, '** repartitioned size:', size

        # checkpoint after every subset so a crash loses little work
        print >>sys.stderr, 'saving stoptags binary'
        htable.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print >>sys.stderr, '(%d of %d)\n' % (index, len(pmap_files))

    print >>sys.stderr, 'done!'
def main():
    """Entry point for sample-reads-randomly.py (write_record variant):
    draw `num_samples` reservoir samples of args.num_reads reads each.

    The first num_reads records seed every reservoir; later records
    replace existing entries with probability num_reads/total, so each
    read is equally likely to appear in each sample.  Output goes to
    -o, or <first input>.subset(.N for multiple samples).
    """
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_file_status(_, args.force)

    check_space(args.filenames, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #

    output_file = args.output_file
    if output_file:
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                sys.exit(1)
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' %\
            args.num_reads
        print >>sys.stderr, 'Subsampled reads will be placed in %s' % \
            output_filename
        print >>sys.stderr, ''
    else:  # > 1
        print >>sys.stderr, 'Subsampling %d reads, %d times,' \
            % (args.num_reads, num_samples), ' using reservoir sampling.'
        print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \
            % output_filename
        print >>sys.stderr, ''

    reads = []
    for n in range(num_samples):
        reads.append([])

    total = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print >>sys.stderr, 'opening', filename, 'for reading'
        for record in screed.open(filename):
            total += 1

            if total % 10000 == 0:
                print >>sys.stderr, '...', total, 'reads scanned'
                if total >= args.max_reads:
                    # BUGFIX: this message was split over two string
                    # literals with '%' applied to the literal that had no
                    # format specifier, raising TypeError at runtime.
                    print >>sys.stderr, \
                        'reached upper limit of %d reads' \
                        ' (see -M); exiting' % args.max_reads
                    break

            # collect first N reads
            if total <= args.num_reads:
                for n in range(num_samples):
                    reads[n].append(record)
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                for n in range(num_samples):
                    guess = random.randint(1, total)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = record

    # output all the subsampled reads:
    if len(reads) == 1:
        print >>sys.stderr, 'Writing %d sequences to %s' % \
            (len(reads[0]), output_filename)
        if not output_file:
            output_file = open(output_filename, 'w')
        for record in reads[0]:
            write_record(record, output_file)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print >>sys.stderr, 'Writing %d sequences to %s' % \
                (len(reads[n]), n_filename)
            # BUGFIX: close each per-sample file (handles were leaked)
            with open(n_filename, 'w') as sample_fp:
                for record in reads[n]:
                    write_record(record, sample_fp)
def main(): info('extract-paired-reads.py') args = get_parser().parse_args() check_file_status(args.infile, args.force) infiles = [args.infile] check_space(infiles, args.force) outfile = os.path.basename(args.infile) if len(sys.argv) > 2: outfile = sys.argv[2] single_fp = open(outfile + '.se', 'w') paired_fp = open(outfile + '.pe', 'w') print >>sys.stderr, 'reading file "%s"' % args.infile print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile last_record = None last_name = None n_pe = 0 n_se = 0 record = None index = 0 for index, record in enumerate(screed.open(sys.argv[1])): if index % 100000 == 0 and index > 0: print '...', index name = record['name'].split()[0] if last_record: if is_pair(last_name, name): paired_fp.write(output_pair(last_record, record)) name, record = None, None n_pe += 1 else: single_fp.write(output_single(last_record)) n_se += 1 last_name = name last_record = record if last_record: if is_pair(last_name, name): paired_fp.write(output_pair(last_record, record)) name, record = None, None n_pe += 1 else: single_fp.write(output_single(last_record)) name, record = None, None n_se += 1 if record: single_fp.write(output_single(record)) n_se += 1 single_fp.close() paired_fp.close() if n_pe == 0: raise Exception("no paired reads!? check file formats...") print >>sys.stderr, 'DONE; read %d sequences,' \ ' %d pairs and %d singletons' % \ (index + 1, n_pe, n_se) print >> sys.stderr, 'wrote to: ' + outfile \ + '.se' + ' and ' + outfile + '.pe'
def main(): info('partition-graph.py', ['graph']) args = get_parser().parse_args() basename = args.basename filenames = [basename + '.pt', basename + '.tagset'] for _ in filenames: check_file_status(_, args.force) check_space(filenames, args.force) print >> sys.stderr, '--' print >> sys.stderr, 'SUBSET SIZE', args.subset_size print >> sys.stderr, 'N THREADS', args.threads if args.stoptags: print >> sys.stderr, 'stoptag file:', args.stoptags print >> sys.stderr, '--' print >> sys.stderr, 'loading ht %s.pt' % basename htable = khmer.load_hashbits(basename + '.pt') htable.load_tagset(basename + '.tagset') # do we want to load stop tags, and do they exist? if args.stoptags: print >> sys.stderr, 'loading stoptags from', args.stoptags htable.load_stop_tags(args.stoptags) # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print >>sys.stderr, '** This script brakes for lumps:', \ ' stop_big_traversals is true.' else: print >>sys.stderr, '** Traverse all the things:', \ ' stop_big_traversals is false.' # # now, partition! # # divide the tags up into subsets divvy = htable.divide_tags_into_subsets(int(args.subset_size)) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = Queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): start = divvy[_] end = divvy[_ + 1] worker_q.put((htable, _, start, end)) print >> sys.stderr, 'enqueued %d subset tasks' % n_subsets open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets)) n_threads = args.threads if n_subsets < n_threads: n_threads = n_subsets # start threads! 
print >> sys.stderr, 'starting %d threads' % n_threads print >> sys.stderr, '---' threads = [] for _ in range(n_threads): cur_thrd = threading.Thread(target=worker, args=(worker_q, basename, stop_big_traversals)) threads.append(cur_thrd) cur_thrd.start() print >> sys.stderr, 'done starting threads' # wait for threads for _ in threads: _.join() print >> sys.stderr, '---' print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \ (basename,)
def main():
    """Entry point for sample-reads-randomly.py (output_single variant):
    draw `num_samples` reservoir samples of args.num_reads reads each.

    The first num_reads records seed every reservoir; later records
    replace existing entries with probability num_reads/total.  Output
    goes to -o, or <first input>.subset(.N for multiple samples).
    """
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_file_status(_, args.force)

    check_space(args.filenames, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #

    output_file = args.output_file
    if output_file:
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                sys.exit(1)
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' %\
            args.num_reads
        print >>sys.stderr, 'Subsampled reads will be placed in %s' % \
            output_filename
        print >> sys.stderr, ''
    else:  # > 1
        print >>sys.stderr, 'Subsampling %d reads, %d times,' \
            % (args.num_reads, num_samples), ' using reservoir sampling.'
        print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \
            % output_filename
        print >> sys.stderr, ''

    reads = []
    for n in range(num_samples):
        reads.append([])

    total = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print >> sys.stderr, 'opening', filename, 'for reading'
        for record in screed.open(filename):
            total += 1

            if total % 10000 == 0:
                print >> sys.stderr, '...', total, 'reads scanned'
                if total >= args.max_reads:
                    # BUGFIX: this message was split over two string
                    # literals with '%' applied to the literal that had no
                    # format specifier, raising TypeError at runtime.
                    print >>sys.stderr, \
                        'reached upper limit of %d reads' \
                        ' (see -M); exiting' % args.max_reads
                    break

            # collect first N reads
            if total <= args.num_reads:
                for n in range(num_samples):
                    reads[n].append(record)
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                for n in range(num_samples):
                    guess = random.randint(1, total)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = record

    # output all the subsampled reads:
    if len(reads) == 1:
        print >>sys.stderr, 'Writing %d sequences to %s' % \
            (len(reads[0]), output_filename)
        if not output_file:
            output_file = open(output_filename, 'w')
        for record in reads[0]:
            output_file.write(output_single(record))
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print >>sys.stderr, 'Writing %d sequences to %s' % \
                (len(reads[n]), n_filename)
            # BUGFIX: close each per-sample file (handles were leaked)
            with open(n_filename, 'w') as sample_fp:
                for record in reads[n]:
                    sample_fp.write(output_single(record))
def main():
    """Entry point for sweep-reads-buffered.py (screed 'quality' API).

    Labels a graph from a partitioned FASTA/FASTQ (by pid, by sequence,
    or by fixed-size groups), then sweeps each input read file and
    buckets reads by the labels found in their graph neighborhood.
    Writes per-label read files plus .dist.txt / .counts.csv summaries.
    """
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    # clamp user-supplied sizes to the module minimums
    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = ix.next()
    del ix

    extension = 'fa'
    if hasattr(record, 'quality'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >>sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            print >>sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >>sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            # default: one label per group of args.group_size sequences,
            # writing each group out to its own _base_<g> file
            print >>sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                            g=g,
                                                            ext=extension
                                                            ), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g,
                                ext=extension), 'wb')
                    if n % 50000 == 0:
                        print >>sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)
                    write_record(record, outfp)
            except IOError as e:
                print >>sys.stderr, '!! ERROR !!', e
                print >>sys.stderr, '...error splitting input. exiting...'
    except IOError as e:
        print >>sys.stderr, '!! ERROR: !!', e
        print >>sys.stderr, '...error consuming \
                            {i}. exiting...'.format(i=input_fastp)

    print >>sys.stderr, 'done consuming input sequence. \
                        added {t} tags and {l} \
                        labels...'.format(t=ht.n_tags(), l=ht.n_labels())

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >>sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >>sys.stderr, '!! ERROR: !!', error
            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \
                                         {no} orphaned] \
                                         ** {sec}s ({sect}s total)' \
                        .format(n=_, nc=n_labeled, no=n_orphaned,
                                sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    # read too short for the ksize (presumably) -- skip it
                    pass
                else:
                    if hasattr(record, 'quality'):
                        seq_str = fmt_fastq(name, seq, record.quality, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >>sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print >>sys.stderr, '! WARNING: Sweep finished with errors !'
        print >>sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >>sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >>sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)

    print >>sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >>sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
def main():
    """Output the k-mer abundance distribution of a dataset.

    Loads a pre-built k-mer counting table plus a sequence file, computes
    the abundance histogram, and writes rows of
    (abundance, count, cumulative, cumulative_fraction) to the output
    file — as CSV when --csv is given, whitespace-separated otherwise.
    Exits non-zero if the output exists (without --squash) or the
    distribution is all zeroes.
    """
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    infiles = [args.input_counting_table_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_file_status(infile, args.force)

    print('hashtable from', args.input_counting_table_filename,
          file=sys.stderr)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    # The tracking table records which k-mers have already been tallied so
    # each distinct k-mer contributes to the distribution exactly once.
    tracking = khmer._Hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size, file=sys.stderr)
    print('HT sizes:', hashsizes, file=sys.stderr)
    print('outputting to', args.output_histogram_filename, file=sys.stderr)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename, file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' %
              args.output_histogram_filename, file=sys.stderr)

    print('preparing hist...', file=sys.stderr)
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    # BUGFIX: use a context manager so the histogram file is always closed
    # and flushed; the original leaked the handle (notably on the early
    # 'break' below).
    with open(args.output_histogram_filename, 'w') as hash_fp:
        if args.csv:
            hash_fp_csv = csv.writer(hash_fp)
            # write headers:
            hash_fp_csv.writerow(
                ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

        sofar = 0
        for _, i in enumerate(abundances):
            if i == 0 and not args.output_zero:
                continue

            sofar += i
            frac = sofar / float(total)

            if args.csv:
                hash_fp_csv.writerow([_, i, sofar, round(frac, 3)])
            else:
                print(_, i, sofar, round(frac, 3), file=hash_fp)

            # stop once the cumulative count has accounted for everything
            if sofar == total:
                break
def main():
    """Build and save a k-mer counting table from sequence files.

    Consumes every input file with args.threads parser threads, saves the
    table to the output filename, and appends progress/summary lines to a
    sidecar '<base>.info' file.  Optionally emits a JSON or TSV machine-
    readable summary (--summary-info).
    """
    info('load-into-counting.py', ['counting', 'SeqAn'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name, args.force)

    check_space(args.input_sequence_filename, args.force)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)

    check_file_writable(base)
    check_file_writable(base + ".info")

    print >>sys.stderr, 'Saving k-mer counting table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    filename = None

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        print >>sys.stderr, 'consuming input', filename
        # one shared ReadParser feeds all consumer threads
        for _ in xrange(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        # checkpoint the table every 10 input files
        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                      args.force)
            print >>sys.stderr, 'mid-save', base
            htable.save(base)
        with open(base + '.info', 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >>info_fp, 'Total number of unique k-mers:', n_kmers

    print >>sys.stderr, 'saving', base
    htable.save(base)

    # Change max_false_pos=0.2 only if you really grok it.  HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        # BUGFIX: 'summmary' -> 'summary' in the user-facing message
        print >> sys.stderr, "Writing summary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base), fpr=fp_rate, k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    print >>sys.stderr, 'DONE.'
    print >>sys.stderr, 'wrote to:', base + '.info'
def main():  # pylint: disable=too-many-locals,too-many-branches
    """Extract partitioned sequences into per-group files.

    Reads one or more '.part' files (sequences annotated with partition
    ids), writes a partition-size histogram, then bins non-trivial
    partitions into group files of at most args.max_size sequences each.
    Unassigned (pid 0) reads can optionally be written to a separate file.
    """
    info('extract-partitions.py', ['graph'])
    args = get_parser().parse_args()

    distfilename = args.prefix + '.dist'

    n_unassigned = 0

    for infile in args.part_filenames:
        check_file_status(infile, args.force)

    check_space(args.part_filenames, args.force)

    print >> sys.stderr, '---'
    print >> sys.stderr, 'reading partitioned files:', repr(
        args.part_filenames)
    if args.output_groups:
        print >>sys.stderr, 'outputting to files named "%s.groupN.fa"' % \
            args.prefix
        print >>sys.stderr, 'min reads to keep a partition:', \
            args.min_part_size
        print >> sys.stderr, 'max size of a group file:', args.max_size
    else:
        print >> sys.stderr, 'NOT outputting groups! Beware!'

    if args.output_unassigned:
        print >>sys.stderr, \
            'outputting unassigned reads to "%s.unassigned.fa"' % \
            args.prefix

    print >>sys.stderr, 'partition size distribution will go to %s' \
        % distfilename
    print >> sys.stderr, '---'

    #

    suffix = 'fa'
    is_fastq = False

    # peek at the first record of the first file to pick fa/fq output
    for index, read, pid in read_partition_file(args.part_filenames[0]):
        if hasattr(read, 'accuracy'):
            suffix = 'fq'
            is_fastq = True
        break

    # verify every input file matches the format of the first (first
    # record only)
    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if is_fastq:
                assert hasattr(read, 'accuracy'), \
                    "all input files must be FASTQ if the first one is"
            else:
                assert not hasattr(read, 'accuracy'), \
                    "all input files must be FASTA if the first one is"
            break

    if args.output_unassigned:
        unassigned_fp = open('%s.unassigned.%s' % (args.prefix, suffix), 'w')

    # pass 1: count reads per partition id (and dump pid-0 reads if asked)
    count = {}
    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if index % 100000 == 0:
                print >> sys.stderr, '...', index

            count[pid] = count.get(pid, 0) + 1

            if pid == 0:
                n_unassigned += 1
                if args.output_unassigned:
                    write_record(read, unassigned_fp)

    if args.output_unassigned:
        unassigned_fp.close()

    if 0 in count:          # eliminate unpartitioned sequences
        del count[0]

    # develop histogram of partition sizes
    dist = {}
    for pid, size in count.items():
        dist[size] = dist.get(size, 0) + 1

    # output histogram
    distfp = open(distfilename, 'w')

    total = 0
    wtotal = 0
    for counter, index in sorted(dist.items()):
        total += index
        wtotal += counter * index
        distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal))
    distfp.close()

    if not args.output_groups:
        sys.exit(0)

    # sort groups by size; drop partitions at or below the min size
    divvy = sorted(count.items(), key=lambda y: y[1])
    divvy = [y for y in divvy if y[1] > args.min_part_size]

    # divvy up into different groups, based on having max_size sequences
    # in each group.
    total = 0
    group = set()
    group_n = 0
    group_d = {}            # partition id -> group number
    for partition_id, n_reads in divvy:
        group.add(partition_id)
        total += n_reads

        if total > args.max_size:
            for partition_id in group:
                group_d[partition_id] = group_n
                # print 'group_d', partition_id, group_n

            group_n += 1
            group = set()
            total = 0

    # flush the final, possibly-undersized group
    if group:
        for partition_id in group:
            group_d[partition_id] = group_n
            # print 'group_d', partition_id, group_n
        group_n += 1

    print >> sys.stderr, '%d groups' % group_n
    if group_n == 0:
        print >> sys.stderr, 'nothing to output; exiting!'
        return

    # open a bunch of output files for the different groups
    group_fps = {}
    for _ in range(group_n):
        group_fp = open('%s.group%04d.%s' % (args.prefix, _, suffix), 'w')
        group_fps[_] = group_fp

    # write 'em all out!
    # pass 2: route each read to its group file

    total_seqs = 0
    part_seqs = 0
    toosmall_parts = 0
    for filename in args.part_filenames:
        for index, read, partition_id in read_partition_file(filename):
            total_seqs += 1
            if index % 100000 == 0:
                print >> sys.stderr, '...x2', index

            if partition_id == 0:
                continue

            try:
                group_n = group_d[partition_id]
            except KeyError:
                # partitions absent from group_d were filtered by -m
                assert count[partition_id] <= args.min_part_size
                toosmall_parts += 1
                continue

            outfp = group_fps[group_n]

            write_record(read, outfp)
            part_seqs += 1

    print >> sys.stderr, '---'
    print >> sys.stderr, 'Of %d total seqs,' % total_seqs
    print >>sys.stderr, 'extracted %d partitioned seqs into group files,' % \
        part_seqs
    print >>sys.stderr, \
        'discarded %d sequences from small partitions (see -m),' % \
        toosmall_parts
    print >>sys.stderr, 'and found %d unpartitioned sequences (see -U).' % \
        n_unassigned
    print >> sys.stderr, ''
    print >>sys.stderr, 'Created %d group files named %s.groupXXXX.%s' % \
        (len(group_fps), args.prefix, suffix)
def main():
    """Split an interleaved/broken-paired read file into /1 and /2 files.

    Paired records go to out1/out2 respectively; orphan reads are routed
    by their /1 or /2 name suffix.  With --force-paired, any orphan is a
    fatal error.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile
    check_file_status(infile, args.force)
    filenames = [infile]
    check_space(filenames, args.force)

    # decide where to put output files - specific directory? or just default?
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    fp_out1 = open(out1, 'w')
    fp_out2 = open(out2, 'w')

    counter1 = 0        # reads written to the /1 (left) file
    counter2 = 0        # reads written to the /2 (right) file
    index = None

    screed_iter = screed.open(infile, parse_description=False)

    # walk through all the reads in broken-paired mode.
    for index, is_pair, record1, record2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index:
            print >> sys.stderr, '...', index

        # are we requiring pairs?
        if args.force_paired and not is_pair:
            print >>sys.stderr, 'ERROR, %s is not part of a pair' % \
                record1.name
            sys.exit(1)

        if is_pair:
            write_record(record1, fp_out1)
            counter1 += 1
            write_record(record2, fp_out2)
            counter2 += 1
        else:
            # orphan read: decide left/right from its name suffix
            name = record1.name
            if check_is_left(name):
                write_record(record1, fp_out1)
                counter1 += 1
            elif check_is_right(name):
                write_record(record1, fp_out2)
                counter2 += 1
            else:
                print >>sys.stderr, \
                    "Unrecognized format for read pair information: %s" % name
                print >>sys.stderr, "Exiting."
                sys.exit(1)

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (counter1 + counter2, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
def main():
    """Find and save 'knot' stop-tags for a partitioned graph.

    Loads the presence table and tagset for args.graphbase, then for each
    on-disk subset pmap file repartitions the largest partition with
    bounded excursions, merging results and saving stop-tags after each
    subset.  Processed pmap files are renamed with a '.processed' suffix.
    """
    info('find-knots.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        # FIX: pass args.force, consistent with every other script in this
        # file — check_file_status takes (filename, force) elsewhere.
        check_file_status(_, args.force)

    check_space(infiles, args.force)

    print >> sys.stderr, 'loading k-mer presence table %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    print >> sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print >> sys.stderr, 'loading stoptags %s.stoptags' % graphbase
        htable.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])
    print >> sys.stderr, '---'
    print >> sys.stderr, 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print >>sys.stderr, \
            '(these output stoptags will include the already-loaded set)'
    print >> sys.stderr, '---'

    # create counting hash
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print >> sys.stderr, '<-', subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >> sys.stderr, '** repartitioning subset... %s' % subset_file
        htable.repartition_largest_partition(subset, counting,
                                             EXCURSION_DISTANCE,
                                             EXCURSION_KMER_THRESHOLD,
                                             EXCURSION_KMER_COUNT_THRESHOLD)

        print >> sys.stderr, '** merging subset... %s' % subset_file
        htable.merge_subset(subset)

        print >> sys.stderr, '** repartitioning, round 2... %s' % subset_file
        # None => operate on the merged (global) partition map
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print >> sys.stderr, '** repartitioned size:', size

        # checkpoint stop-tags after every subset so progress survives a crash
        print >> sys.stderr, 'saving stoptags binary'
        htable.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print >> sys.stderr, '(%d of %d)\n' % (index, len(pmap_files))

    print >> sys.stderr, 'done!'
def main(): info('collect-reads.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name) check_space(args.input_sequence_filename) check_space_for_hashtable(args.n_tables * args.min_tablesize) print 'Saving k-mer counting table to %s' % base print 'Loading sequences from %s' % repr(filenames) if args.output: print 'Outputting sequences to', args.output print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize) htable.set_use_bigcount(args.bigcount) total_coverage = 0. n = 0 for index, filename in enumerate(filenames): for record in screed.open(filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'G') try: med, _, _ = htable.get_median_count(seq) except ValueError: continue total_coverage += med n += 1 if total_coverage / float(n) > args.coverage: print 'reached target average coverage:', \ total_coverage / float(n) break htable.consume(seq) if args.output: args.output.write(output_single(record)) if n % 100000 == 0: print '...', index, filename, n, total_coverage / float(n) if total_coverage / float(n) > args.coverage: break print 'Collected %d reads' % (n,) if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( htable.n_occupied()) print 'saving', base htable.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " this data set. Increase tablesize/# tables.") print >> sys.stderr, "**" sys.exit(1) print 'DONE.'
def main(): info('collect-reads.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_file_status(name, False) check_space(args.input_sequence_filename, False) check_space_for_hashtable(args.n_tables * args.min_tablesize, False) print 'Saving k-mer counting table to %s' % base print 'Loading sequences from %s' % repr(filenames) if args.output: print 'Outputting sequences to', args.output print 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize) htable.set_use_bigcount(args.bigcount) total_coverage = 0. n = 0 for index, filename in enumerate(filenames): for record in screed.open(filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'G') try: med, _, _ = htable.get_median_count(seq) except ValueError: continue total_coverage += med n += 1 if total_coverage / float(n) > args.coverage: print 'reached target average coverage:', \ total_coverage / float(n) break htable.consume(seq) if args.output: args.output.write(output_single(record)) if n % 100000 == 0: print '...', index, filename, n, total_coverage / float(n) if total_coverage / float(n) > args.coverage: break print 'Collected %d reads' % (n, ) if args.report_total_kmers: print >> sys.stderr, 'Total number of k-mers: {0}'.format( htable.n_occupied()) print 'saving', base htable.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable) print 'fp rate estimated to be %1.3f' % fp_rate print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate if fp_rate > 0.20: print >> sys.stderr, "**" print >> sys.stderr, ("** ERROR: the k-mer counting table is too small" " this data set. Increase tablesize/# tables.") print >> sys.stderr, "**" sys.exit(1) print 'DONE.'
def main():
    """Trim reads at low-abundance k-mers, counting in a single pass.

    Round 1 consumes the input into a fresh counting table using
    args.threads parser threads; round 2 streams the same file through a
    ThreadedSequenceProcessor, trimming each read at the first k-mer
    below args.cutoff and discarding reads with 'N' or nothing left to
    keep.  Output goes to '<basename>.abundfilt'.
    """
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_file_status(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)
    report_on_config(args)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    # all threads share one ReadParser over the input file
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        # drop reads containing ambiguous bases outright
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        # keep only if at least one full k-mer survived the trim
        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to: ', outfile
def main():
    """Trim reads at low-abundance k-mers, counting in a single pass.

    First consumes the input file into a fresh counting table with
    args.threads parser threads, then re-reads the file and trims each
    read at the first k-mer whose abundance falls below args.cutoff,
    writing survivors to '<basename>.abundfilt'.
    """
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_file_status(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)
    report_on_config(args)

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # round 1: count k-mers, sharing one ReadParser across worker threads
    rparser = khmer.ReadParser(args.datafile)
    print >>sys.stderr, 'consuming input, round 1 --', args.datafile
    workers = []
    for _ in xrange(args.threads):
        thrd = threading.Thread(
            target=htable.consume_fasta_with_reads_parser,
            args=(rparser, ))
        workers.append(thrd)
        thrd.start()

    for thrd in workers:
        thrd.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        # discard any read containing an ambiguous base
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trimmed, cut_at = htable.trim_on_abundance(seq, args.cutoff)

        # keep the read only if at least one full k-mer survived the trim
        if cut_at < args.ksize:
            return None, None
        return record['name'], trimmed

    # the filtering loop
    print >>sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >>sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >>sys.stderr, 'wrote to: ', outfile
def main():
    """Interleave two paired FASTA/FASTQ files into a single output.

    Given one input file, guesses the R2 filename by replacing '_R1_'
    with '_R2_'.  Reads are zipped in lockstep; a length mismatch or a
    non-pair record aborts the run.  Names lacking a /1 or /2 suffix get
    one appended before writing.
    """
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for _ in args.infiles:
        check_file_status(_, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        # only one file given: derive the R2 filename from the R1 one
        s2_file = s1_file.replace('_R1_', '_R2_')
        if s1_file == s2_file:
            print >>sys.stderr, ("ERROR: given only one filename, that "
                                 "doesn't contain _R1_. Exiting.")
            sys.exit(1)

        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    fail = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True

    # --force lets the run proceed despite missing files
    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    counter = 0
    screed_iter_1 = screed.open(s1_file, parse_description=False)
    screed_iter_2 = screed.open(s2_file, parse_description=False)
    # izip_longest pads the shorter stream with None, which we detect below
    for read1, read2 in itertools.izip_longest(screed_iter_1, screed_iter_2):
        if read1 is None or read2 is None:
            print >>sys.stderr, ("ERROR: Input files contain different number"
                                 " of records.")
            sys.exit(1)

        if counter % 100000 == 0:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        # ensure both reads carry explicit /1 and /2 suffixes
        name1 = read1.name
        if not check_is_left(name1):
            name1 += '/1'
        name2 = read2.name
        if not check_is_right(name2):
            name2 += '/2'

        read1.name = name1
        read2.name = name2

        if not check_is_pair(read1, read2):
            print >>sys.stderr, "ERROR: This doesn't look like paired data! " \
                "%s %s" % (read1.name, read2.name)
            sys.exit(1)

        write_record_pair(read1, read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % counter
    print >> sys.stderr, 'output written to', args.output.name
def main():  # pylint: disable=too-many-locals,too-many-branches
    """Count k-mers and output their abundance distribution in one run.

    Round 1 consumes the input with args.threads parser threads to build
    the counting table; round 2 computes per-thread abundance
    distributions, which are summed and written as
    (abundance, count, cumulative, cumulative_fraction) rows.
    """
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename, args.force)
    check_space([args.input_sequence_filename], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    # fail early, before any counting work, if the output already exists
    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        if args.csv:
            hist_fp_csv = csv.writer(hist_fp)
            # write headers:
            hist_fp_csv.writerow(['abundance', 'count', 'cumulative',
                                  'cumulative_fraction'])

    print >>sys.stderr, 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables)
    counting_hash.set_use_bigcount(args.bigcount)

    print >> sys.stderr, 'building k-mer tracking table'
    # tracking marks already-tallied k-mers so each counts once
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print >>sys.stderr, 'kmer_size:', counting_hash.ksize()
    print >>sys.stderr, 'k-mer counting table sizes:', \
        counting_hash.hashsizes()
    print >>sys.stderr, 'outputting to', args.output_histogram_filename

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers())

    abundance_lists = []

    # worker body for round 2: each thread appends its own distribution
    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print >>sys.stderr, 'preparing hist from %s...' % \
        args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print >>sys.stderr, 'consuming input, round 2 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    # merge the per-thread distributions into one abundance -> count map
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        if args.csv:
            hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
        else:
            print >> hist_fp, _, i, sofar, round(frac, 3)

        # stop once everything has been accounted for
        if sofar == total:
            break

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table ', args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        counting_hash.save(args.savetable)

    print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename
def main():
    """Partition a tagged de Bruijn graph using worker threads.

    Divides the tagset into subsets of args.subset_size tags, enqueues
    one task per subset, and runs up to args.threads worker() threads
    that each write a '<basename>.subset.N.pmap' file.
    """
    info('partition-graph.py', ['graph'])
    args = get_parser().parse_args()
    basename = args.basename

    filenames = [basename + '.pt', basename + '.tagset']
    for _ in filenames:
        check_file_status(_, args.force)

    check_space(filenames, args.force)

    print >>sys.stderr, '--'
    print >>sys.stderr, 'SUBSET SIZE', args.subset_size
    print >>sys.stderr, 'N THREADS', args.threads
    if args.stoptags:
        print >>sys.stderr, 'stoptag file:', args.stoptags
    print >>sys.stderr, '--'

    print >>sys.stderr, 'loading ht %s.pt' % basename
    htable = khmer.load_hashbits(basename + '.pt')
    htable.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print >>sys.stderr, '** This script brakes for lumps:', \
            ' stop_big_traversals is true.'
    else:
        print >>sys.stderr, '** Traverse all the things:', \
            ' stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    # sentinel end marker so divvy[_ + 1] is valid for the last subset
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    # never start more threads than there are tasks
    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print >>sys.stderr, 'starting %d threads' % n_threads
    print >>sys.stderr, '---'

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker, args=(worker_q, basename,
                                                         stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print >>sys.stderr, 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print >>sys.stderr, '---'
    print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
        (basename,)
def main():
    """Split an interleaved/broken-paired read file into /1 and /2 files.

    Paired records go to the left/right outputs respectively; orphan
    reads are routed by their /1 or /2 name suffix.  With --force-paired,
    any orphan read is a fatal error.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile
    check_file_status(infile, args.force)
    check_space([infile], args.force)

    # decide where to put output files - specific directory? or just default?
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        prefix = args.output_directory + '/' + os.path.basename(infile)
    else:
        prefix = os.path.basename(infile)
    out1 = prefix + '.1'
    out2 = prefix + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    left_fp = open(out1, 'w')
    right_fp = open(out2, 'w')

    n_left = 0
    n_right = 0

    reads = screed.open(infile, parse_description=False)

    # walk through all the reads in broken-paired mode.
    for index, is_pair, record1, record2 in broken_paired_reader(reads):
        if index % 100000 == 0 and index:
            print >> sys.stderr, '...', index

        if is_pair:
            # proper pair: record1 goes left, record2 goes right
            write_record(record1, left_fp)
            n_left += 1
            write_record(record2, right_fp)
            n_right += 1
        elif args.force_paired:
            # pairs were required but this read arrived single
            print >>sys.stderr, 'ERROR, %s is not part of a pair' % \
                record1.name
            sys.exit(1)
        else:
            # orphan read: decide left/right from its name suffix
            name = record1.name
            if check_is_left(name):
                write_record(record1, left_fp)
                n_left += 1
            elif check_is_right(name):
                write_record(record1, right_fp)
                n_right += 1
            else:
                print >>sys.stderr, \
                    "Unrecognized format for read pair information: %s" % name
                print >> sys.stderr, "Exiting."
                sys.exit(1)

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (n_left + n_right, n_left, n_right)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2