Example #1
0
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_file_status(infile)

    check_space([args.ptfile, args.fafile])

    print 'loading k-mer presence table from', args.ptfile
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s

# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d

""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        to_print = str(list_curve[100 + i]) + ' ' + str(list_curve[i]) + '\n'
        f_curve_obj.write(to_print)
Example #2
0
def main():
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])

    ksize = args.ksize
    htable = khmer.new_hashbits(ksize, 1, 1)

    for _ in pmap_files:
        check_file_status(_)

    check_space(pmap_files)

    for pmap_file in pmap_files:
        print 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    print 'saving merged to', output_file
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    htable = khmer.new_hashbits(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_file_status(partitionmap_file)
    for _ in filenames:
        check_file_status(_)

    check_space(filenames)

    print 'loading partition map from:', partitionmap_file
    htable.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (part_count, infile)
        print 'partitions are in', outfile
Example #4
0
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_file_status(infile)

    check_space([args.ptfile, args.fafile])

    print 'loading k-mer presence table from', args.ptfile
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s

# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d

""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        to_print = str(list_curve[100 + i]) + ' ' + str(list_curve[i]) + '\n'
        f_curve_obj.write(to_print)
Example #5
0
def main():
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    htfile = args.ctfile
    input_filename = args.input
    output_filename = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print 'loading k-mer counting table from', htfile
    htable = khmer.load_counting_hash(htfile)
    ksize = htable.ksize()

    print 'writing to', output_filename
    output = open(output_filename, 'w')

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        if ksize <= len(seq):
            medn, ave, stdev = htable.get_median_count(seq)
            print >> output, record.name, medn, ave, stdev, len(seq)
Example #6
0
def main():
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    infiles = [
        args.input_counting_table_filename, args.input_sequence_filename
    ]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print('hashtable from', args.input_counting_table_filename)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size)
    print('HT sizes:', hashsizes)
    print('outputting to', args.output_histogram_filename)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename,
                  file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' % args.output_histogram_filename)

    print('preparing hist...')
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print(
            "ERROR: abundance distribution is uniformly zero; "
            "nothing to report.",
            file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)
    hash_fp = open(args.output_histogram_filename, 'w')

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print(_, i, sofar, round(frac, 3), file=hash_fp)

        if sofar == total:
            break
Example #7
0
def main():
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    htfile = args.ctfile
    input_filename = args.input
    output_filename = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print 'loading k-mer counting table from', htfile
    htable = khmer.load_counting_hash(htfile)
    ksize = htable.ksize()

    print 'writing to', output_filename
    output = open(output_filename, 'w')

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        if ksize <= len(seq):
            medn, ave, stdev = htable.get_median_count(seq)
            print >> output, record.name, medn, ave, stdev, len(seq)
Example #8
0
def main():
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])

    ksize = args.ksize
    htable = khmer.new_hashbits(ksize, 1, 1)

    for _ in pmap_files:
        check_file_status(_)

    check_space(pmap_files)

    for pmap_file in pmap_files:
        print >> sys.stderr, 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    print >> sys.stderr, 'saving merged to', output_file
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print >> sys.stderr, 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
Example #9
0
def main():
    info("filter-abund-single.py", ["counting"])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print "making k-mer counting table"
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print "consuming input, round 1 --", args.datafile
    for _ in xrange(args.threads):
        cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    fp_rate = khmer.calc_expected_collisions(htable)
    print "fp rate estimated to be %1.3f" % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print "filtering", args.datafile
    outfile = os.path.basename(args.datafile) + ".abundfilt"
    outfp = open(outfile, "w")

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print "output in", outfile

    if args.savetable:
        print "Saving k-mer counting table filename", args.savetable
        print "...saving to", args.savetable
        htable.save(args.savetable)
Example #10
0
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    htable = khmer.new_hashbits(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_file_status(partitionmap_file)
    for _ in filenames:
        check_file_status(_)

    check_space(filenames)

    print 'loading partition map from:', partitionmap_file
    htable.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (part_count, infile)
        print 'partitions are in', outfile
Example #11
0
def main():
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    infiles = [args.input_counting_table_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print('hashtable from', args.input_counting_table_filename)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size)
    print('HT sizes:', hashsizes)
    print('outputting to', args.output_histogram_filename)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename,
                  file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' % args.output_histogram_filename)

    print('preparing hist...')
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)
    hash_fp = open(args.output_histogram_filename, 'w')

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print(_, i, sofar, round(frac, 3), file=hash_fp)

        if sofar == total:
            break
Example #12
0
def main():
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_file_status(infile, args.force)
    filenames = [infile]
    check_space(filenames, args.force)

    out1 = os.path.basename(infile) + '.1'
    out2 = os.path.basename(infile) + '.2'
    fp_out1 = open(out1, 'w')
    fp_out2 = open(out2, 'w')

    # is input file FASTQ or FASTA? Determine.
    is_fastq = False
    record = iter(screed.open(infile)).next()

    if hasattr(record, 'accuracy'):
        is_fastq = True

    counter1 = 0
    counter2 = 0
    index = None
    for index, record in enumerate(screed.open(infile)):
        if index % 100000 == 0:
            print >> sys.stderr, '...', index

        name = record.name
        if name.endswith('/1'):
            if is_fastq:
                print >> fp_out1, '@%s\n%s\n+\n%s' % (
                    record.name, record.sequence, record.accuracy)
            else:
                print >> fp_out1, '>%s\n%s' % (
                    record.name,
                    record.sequence,
                )
            counter1 += 1
        elif name.endswith('/2'):
            if is_fastq:
                print >> fp_out2, '@%s\n%s\n+\n%s' % (
                    record.name, record.sequence, record.accuracy)
            else:
                print >> fp_out2, '>%s\n%s' % (
                    record.name,
                    record.sequence,
                )
            counter2 += 1

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (index + 1, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
Example #13
0
def main():
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for _ in args.infiles:
        check_file_status(_, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    fail = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True

    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    counter = 0
    for read1, read2 in itertools.izip(screed.open(s1_file),
                                       screed.open(s2_file)):
        if counter % 100000 == 0:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        name1 = read1.name
        if not name1.endswith('/1'):
            name1 += '/1'
        name2 = read2.name
        if not name2.endswith('/2'):
            name2 += '/2'

        assert name1[:-2] == name2[:-2], \
            "This doesn't look like paired data! %s %s" % (name1, name2)

        read1.name = name1
        read2.name = name2
        args.output.write(output_pair(read1, read2))

    print >> sys.stderr, 'final: interleaved %d pairs' % counter

    print >> sys.stderr, 'output written to', args.output
Example #14
0
def main():
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for _ in args.infiles:
        check_file_status(_, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    fail = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True

    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    counter = 0
    for read1, read2 in itertools.izip(screed.open(s1_file),
                                       screed.open(s2_file)):
        if counter % 100000 == 0:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        name1 = read1.name
        if not name1.endswith('/1'):
            name1 += '/1'
        name2 = read2.name
        if not name2.endswith('/2'):
            name2 += '/2'

        assert name1[:-2] == name2[:-2], \
            "This doesn't look like paired data! %s %s" % (name1, name2)

        read1.name = name1
        read2.name = name2
        args.output.write(output_pair(read1, read2))

    print >> sys.stderr, 'final: interleaved %d pairs' % counter

    print >> sys.stderr, 'output written to', args.output
Example #15
0
def main():

    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print >>sys.stderr, 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')
    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def main():

    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')
    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
Example #17
0
def main():
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_file_status(infile)
    filenames = [infile]
    check_space(filenames)

    out1 = os.path.basename(infile) + '.1'
    out2 = os.path.basename(infile) + '.2'
    fp_out1 = open(out1, 'w')
    fp_out2 = open(out2, 'w')

    # is input file FASTQ or FASTA? Determine.
    is_fastq = False
    record = iter(screed.open(infile)).next()

    if hasattr(record, 'accuracy'):
        is_fastq = True

    counter1 = 0
    counter2 = 0
    index = None
    for index, record in enumerate(screed.open(infile)):
        if index % 100000 == 0:
            print >> sys.stderr, '...', index

        name = record.name
        if name.endswith('/1'):
            if is_fastq:
                print >> fp_out1, '@%s\n%s\n+\n%s' % (record.name,
                                                      record.sequence,
                                                      record.accuracy)
            else:
                print >> fp_out1, '>%s\n%s' % (record.name, record.sequence,)
            counter1 += 1
        elif name.endswith('/2'):
            if is_fastq:
                print >> fp_out2, '@%s\n%s\n+\n%s' % (record.name,
                                                      record.sequence,
                                                      record.accuracy)
            else:
                print >> fp_out2, '>%s\n%s' % (record.name, record.sequence,)
            counter2 += 1

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (index + 1, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
Example #18
0
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print "K:", ksize

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #19
0
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print "K:", ksize

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #20
0
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #21
0
def main():
    info("filter-stoptags.py", ["graph"])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print "loading stop tags, with K", args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".stopfilt"

        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Example #22
0
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_file_status(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
Example #23
0
def main():

    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(args.n_threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print 'mid-save', base
            htable.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " this data set.  Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'
Example #24
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('saturate-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    report_frequency = args.report_frequency

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0

    for index, input_filename in enumerate(args.input_filenames):
        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(
                input_filename, htable, args, report_fp, report_frequency)
        except IOError as err:
            handle_error(err, input_filename)
            if not args.force:
                print >> sys.stderr, '** Exiting!'
                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename,
                            kept=total - discarded, total=total,
                            perc=int(100. - discarded / float(total) * 100.))

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set.  Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(1)
Example #25
0
def main():

    info('load-into-counting.py', ['counting'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print >> sys.stderr, 'Saving k-mer counting table to %s' % base
    print >> sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    filename = None

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.threads)
        threads = []
        print >> sys.stderr, 'consuming input', filename
        for _ in xrange(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print >> sys.stderr, 'mid-save', base
            htable.save(base)
        with open(base + '.info', 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >> info_fp, 'Total number of unique k-mers:', n_kmers

    print >> sys.stderr, 'saving', base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print >> sys.stderr, "Writing summmary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base),
                    fpr=fp_rate,
                    k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Change 0.2 only if you really grok it.  HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
        print >> sys.stderr, "for this data set. Increase tablesize/# tables."
        print >> sys.stderr, "**"
        sys.exit(1)

    print >> sys.stderr, 'DONE.'
    print >> sys.stderr, 'wrote to:', base + '.info'
Example #26
0
def main():
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = ix.next()
    del ix

    extension = 'fa'
    if hasattr(record, 'accuracy'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size,
                                      output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >> sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            print >> sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >> sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print >>sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open(
                    '{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                   g=g,
                                                   ext=extension), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open(
                                '{pref}_base_{g}.{ext}'.format(
                                    pref=output_pref, g=g, ext=extension),
                                'wb')
                    if n % 50000 == 0:
                        print >>sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(
                        record.sequence, label)

                    if hasattr(record, 'accuracy'):
                        outfp.write('@{name}\n{seq}+{accuracy}\n'.format(
                            name=record.name,
                            seq=record.sequence,
                            accuracy=record.accuracy))
                    else:
                        outfp.write('>{name}\n{seq}\n'.format(
                            name=record.name, seq=record.sequence))

            except IOError as e:
                print >> sys.stderr, '!! ERROR !!', e
                print >> sys.stderr, '...error splitting input. exiting...'

    except IOError as e:
        print >> sys.stderr, '!! ERROR: !!', e
        print >> sys.stderr, '...error consuming \
                            {i}. exiting...'.format(i=input_fastp)

    print >> sys.stderr, 'done consuming input sequence. \
                        added {t} tags and {l} \
                        labels...'.format(t=ht.n_tags(), l=ht.n_labels())

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >> sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >> sys.stderr, '!! ERROR: !!', error
            print >> sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \
                                         {no} orphaned] \
                                        ** {sec}s ({sect}s total)' \
                                        .format(n=_, nc=n_labeled,
                                                no=n_orphaned,
                                                sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    pass
                else:
                    if hasattr(record, 'accuracy'):
                        seq_str = fmt_fastq(name, seq, record.accuracy, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print >> sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >> sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print >> sys.stderr, '! WARNING: Sweep finished with errors !'
        print >> sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >> sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >> sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >> sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >> sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)

    print >> sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >> sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
Example #27
0
def main():

    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(args.n_threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print 'mid-save', base
            htable.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " this data set.  Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'
Example #28
0
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename)
    check_space([args.input_sequence_filename])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    if (not args.squash_output
            and os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')

    print 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables, args.threads)
    counting_hash.set_use_bigcount(args.bigcount)

    print 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print 'kmer_size:', counting_hash.ksize()
    print 'k-mer counting table sizes:', counting_hash.hashsizes()
    print 'outputting to', args.output_histogram_filename

    khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print 'consuming input, round 1 --', args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            counting_hash.n_occupied())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print 'preparing hist from %s...' % args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print 'consuming input, round 2 --', args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> hist_fp, _, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print 'Saving k-mer counting table ', args.savetable
        print '...saving to', args.savetable
        counting_hash.save(args.savetable)
Example #29
0
def main():
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_file_status(_)

    check_space(args.filenames)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #

    output_file = args.output_file
    if output_file:
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            sys.exit(-1)
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print 'Subsampling %d reads using reservoir sampling.' % args.num_reads
        print 'Subsampled reads will be placed in %s' % output_filename
        print ''
    else:  # > 1
        print 'Subsampling %d reads, %d times, using reservoir sampling.' % \
            (args.num_reads, num_samples)
        print 'Subsampled reads will be placed in %s.N' % output_filename
        print ''

    reads = []
    for n in range(num_samples):
        reads.append([])

    total = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print 'opening', filename, 'for reading'
        for record in screed.open(filename):
            total += 1

            if total % 10000 == 0:
                print '...', total, 'reads scanned'
                if total >= args.max_reads:
                    print 'reached upper limit of %d reads (see -M); exiting' \
                        % args.max_reads
                    break

            # collect first N reads
            if total <= args.num_reads:
                for n in range(num_samples):
                    reads[n].append(record)
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling

                for n in range(num_samples):
                    guess = random.randint(1, total)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = record

    # output all the subsampled reads:
    if len(reads) == 1:
        print 'Writing %d sequences to %s' % (len(reads[0]), output_filename)
        if not output_file:
            output_file = open(output_filename, 'w')

        for record in reads[0]:
            output_file.write(output_single(record))
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print 'Writing %d sequences to %s' % (len(reads[n]), n_filename)
            output_file = open(n_filename, 'w')
            for record in reads[n]:
                output_file.write(output_single(record))
Example #30
0
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('extract-partitions.py', ['graph'])
    args = get_parser().parse_args()

    distfilename = args.prefix + '.dist'

    n_unassigned = 0

    for infile in args.part_filenames:
        check_file_status(infile)

    check_space(args.part_filenames)

    print '---'
    print 'reading partitioned files:', repr(args.part_filenames)
    if args.output_groups:
        print 'outputting to files named "%s.groupN.fa"' % args.prefix
        print 'min reads to keep a partition:', args.min_part_size
        print 'max size of a group file:', args.max_size
    else:
        print 'NOT outputting groups! Beware!'

    if args.output_unassigned:
        print 'outputting unassigned reads to "%s.unassigned.fa"' % args.prefix

    print 'partition size distribution will go to %s' % distfilename
    print '---'

    #

    suffix = 'fa'
    is_fastq = False

    for index, read, pid in read_partition_file(args.part_filenames[0]):
        if hasattr(read, 'accuracy'):
            suffix = 'fq'
            is_fastq = True
        break

    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if is_fastq:
                assert hasattr(read, 'accuracy'), \
                    "all input files must be FASTQ if the first one is"
            else:
                assert not hasattr(read, 'accuracy'), \
                    "all input files must be FASTA if the first one is"

            break

    if args.output_unassigned:
        unassigned_fp = open('%s.unassigned.%s' % (args.prefix, suffix), 'w')

    count = {}
    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if index % 100000 == 0:
                print '...', index

            count[pid] = count.get(pid, 0) + 1

            if pid == 0:
                n_unassigned += 1
                if args.output_unassigned:
                    print >>unassigned_fp, output_single(read)

    if args.output_unassigned:
        unassigned_fp.close()

    if 0 in count:                          # eliminate unpartitioned sequences
        del count[0]

    # develop histogram of partition sizes
    dist = {}
    for pid, size in count.items():
        dist[size] = dist.get(size, 0) + 1

    # output histogram
    distfp = open(distfilename, 'w')

    total = 0
    wtotal = 0
    for counter, index in sorted(dist.items()):
        total += index
        wtotal += counter * index
        distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal))
    distfp.close()

    if not args.output_groups:
        sys.exit(0)

    # sort groups by size
    divvy = sorted(count.items(), key=lambda y: y[1])
    divvy = [y for y in divvy if y[1] > args.min_part_size]

    # divvy up into different groups, based on having max_size sequences
    # in each group.
    total = 0
    group = set()
    group_n = 0
    group_d = {}
    for partition_id, n_reads in divvy:
        group.add(partition_id)
        total += n_reads

        if total > args.max_size:
            for partition_id in group:
                group_d[partition_id] = group_n
                # print 'group_d', partition_id, group_n

            group_n += 1
            group = set()
            total = 0

    if group:
        for partition_id in group:
            group_d[partition_id] = group_n
            # print 'group_d', partition_id, group_n
        group_n += 1

    print '%d groups' % group_n
    if group_n == 0:
        print 'nothing to output; exiting!'
        return

    # open a bunch of output files for the different groups
    group_fps = {}
    for _ in range(group_n):
        group_fp = open('%s.group%04d.%s' % (args.prefix, _, suffix), 'w')
        group_fps[_] = group_fp

    # write 'em all out!

    total_seqs = 0
    part_seqs = 0
    toosmall_parts = 0
    for filename in args.part_filenames:
        for index, read, partition_id in read_partition_file(filename):
            total_seqs += 1
            if index % 100000 == 0:
                print '...x2', index

            if partition_id == 0:
                continue

            try:
                group_n = group_d[partition_id]
            except KeyError:
                assert count[partition_id] <= args.min_part_size
                toosmall_parts += 1
                continue

            outfp = group_fps[group_n]

            outfp.write(output_single(read))
            part_seqs += 1

    print '---'
    print 'Of %d total seqs,' % total_seqs
    print 'extracted %d partitioned seqs into group files,' % part_seqs
    print 'discarded %d sequences from small partitions (see -m),' % \
        toosmall_parts
    print 'and found %d unpartitioned sequences (see -U).' % n_unassigned
    print ''
    print 'Created %d group files named %s.groupXXXX.%s' % (len(group_fps),
                                                            args.prefix,
                                                            suffix)
Example #31
0
def main():
    info('filter-abund-single.py', ['counting'])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to: ', outfile
Example #32
0
def main():
    info("find-knots.py", ["graph"])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + ".pt", graphbase + ".tagset"]
    if os.path.exists(graphbase + ".stoptags"):
        infiles.append(graphbase + ".stoptags")
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >>sys.stderr, "loading k-mer presence table %s.pt" % graphbase
    htable = khmer.load_hashbits(graphbase + ".pt")

    print >>sys.stderr, "loading tagset %s.tagset..." % graphbase
    htable.load_tagset(graphbase + ".tagset")

    initial_stoptags = False  # @CTB regularize with make-initial
    if os.path.exists(graphbase + ".stoptags"):
        print >>sys.stderr, "loading stoptags %s.stoptags" % graphbase
        htable.load_stop_tags(graphbase + ".stoptags")
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + ".subset.*.pmap")

    print >>sys.stderr, "loading %d pmap files (first one: %s)" % (len(pmap_files), pmap_files[0])
    print >>sys.stderr, "---"
    print >>sys.stderr, "output stoptags will be in", graphbase + ".stoptags"
    if initial_stoptags:
        print >>sys.stderr, "(these output stoptags will include the already-loaded set)"
    print >>sys.stderr, "---"

    # create counting hash
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize, args.n_tables)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print >>sys.stderr, "<-", subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >>sys.stderr, "** repartitioning subset... %s" % subset_file
        htable.repartition_largest_partition(
            subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD
        )

        print >>sys.stderr, "** merging subset... %s" % subset_file
        htable.merge_subset(subset)

        print >>sys.stderr, "** repartitioning, round 2... %s" % subset_file
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD
        )

        print >>sys.stderr, "** repartitioned size:", size

        print >>sys.stderr, "saving stoptags binary"
        htable.save_stop_tags(graphbase + ".stoptags")
        os.rename(subset_file, subset_file + ".processed")
        print >>sys.stderr, "(%d of %d)\n" % (index, len(pmap_files))

    print >>sys.stderr, "done!"
Example #33
0
def main():
    info('find-knots.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print >> sys.stderr, 'loading k-mer presence table %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    print >> sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    initial_stoptags = False  # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print >> sys.stderr, 'loading stoptags %s.stoptags' % graphbase
        htable.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])
    print >> sys.stderr, '---'
    print >> sys.stderr, 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print >>sys.stderr, \
            '(these output stoptags will include the already-loaded set)'
    print >> sys.stderr, '---'

    # create counting hash
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print >> sys.stderr, '<-', subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print >> sys.stderr, '** repartitioning subset... %s' % subset_file
        htable.repartition_largest_partition(subset, counting,
                                             EXCURSION_DISTANCE,
                                             EXCURSION_KMER_THRESHOLD,
                                             EXCURSION_KMER_COUNT_THRESHOLD)

        print >> sys.stderr, '** merging subset... %s' % subset_file
        htable.merge_subset(subset)

        print >> sys.stderr, '** repartitioning, round 2... %s' % subset_file
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print >> sys.stderr, '** repartitioned size:', size

        print >> sys.stderr, 'saving stoptags binary'
        htable.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print >> sys.stderr, '(%d of %d)\n' % (index, len(pmap_files))

    print >> sys.stderr, 'done!'
Example #34
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0
    input_filename = None

    for index, input_filename in enumerate(args.input_filenames):
        if args.single_output_filename != '':
            output_name = args.single_output_filename
            outfp = open(args.single_output_filename, 'a')
        else:
            output_name = os.path.basename(input_filename) + '.keep'
            outfp = open(output_name, 'w')

        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(input_filename,
                                                           outfp, htable, args,
                                                           report_fp)
        except IOError as err:
            handle_error(err, output_name, input_filename, args.fail_save,
                         htable)
            if not args.force:
                print >> sys.stderr, '** Exiting!'

                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                      .format(inp=input_filename, kept=total - discarded,
                              total=total, perc=int(100. - discarded /
                                                    float(total) * 100.))
                print 'output in', output_name

        if (args.dump_frequency > 0 and
                index > 0 and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        if not args.force:
            sys.exit(1)
Example #35
0
def main():
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_file_status(args.infile, args.force)
    infiles = [args.infile]
    check_space(infiles, args.force)

    outfile = os.path.basename(args.infile)
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    single_fp = open(outfile + '.se', 'w')
    paired_fp = open(outfile + '.pe', 'w')

    print >>sys.stderr, 'reading file "%s"' % args.infile
    print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
    print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile

    last_record = None
    last_name = None

    n_pe = 0
    n_se = 0

    record = None
    index = 0
    for index, record in enumerate(screed.open(sys.argv[1])):
        if index % 100000 == 0 and index > 0:
            print '...', index
        name = record['name'].split()[0]

        if last_record:
            if is_pair(last_name, name):
                paired_fp.write(output_pair(last_record, record))
                name, record = None, None
                n_pe += 1
            else:
                single_fp.write(output_single(last_record))
                n_se += 1

        last_name = name
        last_record = record

    if last_record:
        if is_pair(last_name, name):
            paired_fp.write(output_pair(last_record, record))
            name, record = None, None
            n_pe += 1
        else:
            single_fp.write(output_single(last_record))
            name, record = None, None
            n_se += 1

    if record:
        single_fp.write(output_single(record))
        n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print >>sys.stderr, 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % \
        (index + 1, n_pe, n_se)

    print >> sys.stderr, 'wrote to: ' + outfile \
        + '.se' + ' and ' + outfile + '.pe'
Example #36
0
def main():
    info('filter-abund-single.py', ['counting'])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables,
                                     args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >>sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >>sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >>sys.stderr, 'wrote to: ', outfile
Example #37
0
def main():
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_file_status(_)

    check_space(args.filenames)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    #
    # Figure out what the output filename is going to be
    #

    output_file = args.output_file
    if output_file:
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'
        output_file = open(output_filename, 'w')

    print 'Subsampling %d reads using reservoir sampling.' % args.num_reads
    print 'Subsampled reads will be placed in %s' % output_filename
    print ''

    reads = []
    total = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print 'opening', filename, 'for reading'
        for record in screed.open(filename):
            total += 1

            if total % 10000 == 0:
                print '...', total, 'reads scanned'
                if total >= args.max_reads:
                    print 'reached upper limit of %d reads (see -M); exiting' \
                        % args.max_reads
                    break

            # collect first N reads
            if total <= args.num_reads:
                reads.append(record)
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                guess = random.randint(1, total)
                if guess <= args.num_reads:
                    reads[guess - 1] = record

    # output all the subsampled reads:
    for record in reads:
        output_file.write(output_single(record))

    print ''
    print 'wrote %d reads to %s' % (len(reads), output_filename)
Example #38
0
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename)
    check_space([args.input_sequence_filename])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')

    print >>sys.stderr, 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables,
                                            args.threads)
    counting_hash.set_use_bigcount(args.bigcount)

    print >> sys.stderr, 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print >>sys.stderr, 'kmer_size:', counting_hash.ksize()
    print >>sys.stderr, 'k-mer counting table sizes:', \
        counting_hash.hashsizes()
    print >>sys.stderr, 'outputting to', args.output_histogram_filename

    khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print >>sys.stderr, 'preparing hist from %s...' % \
        args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print >>sys.stderr, 'consuming input, round 2 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> hist_fp, _, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table ', args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        counting_hash.save(args.savetable)

    print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename
Example #39
0
def main():
    info('partition-graph.py', ['graph'])
    args = get_parser().parse_args()
    basename = args.basename

    filenames = [basename + '.pt', basename + '.tagset']
    for _ in filenames:
        check_file_status(_)

    check_space(filenames)

    print >> sys.stderr, '--'
    print >> sys.stderr, 'SUBSET SIZE', args.subset_size
    print >> sys.stderr, 'N THREADS', args.threads
    if args.stoptags:
        print >> sys.stderr, 'stoptag file:', args.stoptags
    print >> sys.stderr, '--'

    print >> sys.stderr, 'loading ht %s.pt' % basename
    htable = khmer.load_hashbits(basename + '.pt')
    htable.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >> sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print >>sys.stderr, '** This script brakes for lumps:', \
                            ' stop_big_traversals is true.'
    else:
        print >>sys.stderr, '** Traverse all the things:', \
                            ' stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print >> sys.stderr, 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print >> sys.stderr, 'starting %d threads' % n_threads
    print >> sys.stderr, '---'

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker,
                                    args=(worker_q, basename,
                                          stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print >> sys.stderr, 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print >> sys.stderr, '---'
    print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
        (basename,)
def main():
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_file_status(_, args.force)

    check_space(args.filenames, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #

    output_file = args.output_file
    if output_file:
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                sys.exit(1)
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' % \
            args.num_reads
        print >>sys.stderr, 'Subsampled reads will be placed in %s' % \
            output_filename
        print >>sys.stderr, ''
    else:  # > 1
        print >>sys.stderr, 'Subsampling %d reads, %d times,' \
            % (args.num_reads, num_samples), ' using reservoir sampling.'
        print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \
            % output_filename
        print >>sys.stderr, ''

    reads = []
    for n in range(num_samples):
        reads.append([])

    total = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print >>sys.stderr, 'opening', filename, 'for reading'
        for record in screed.open(filename):
            total += 1

            if total % 10000 == 0:
                print >>sys.stderr, '...', total, 'reads scanned'
                if total >= args.max_reads:
                    print >>sys.stderr, 'reached upper limit of %d reads',\
                        ' (see -M); exiting' \
                        % args.max_reads
                    break

            # collect first N reads
            if total <= args.num_reads:
                for n in range(num_samples):
                    reads[n].append(record)
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling

                for n in range(num_samples):
                    guess = random.randint(1, total)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = record

    # output all the subsampled reads:
    if len(reads) == 1:
        print >>sys.stderr, 'Writing %d sequences to %s' % \
            (len(reads[0]), output_filename)
        if not output_file:
            output_file = open(output_filename, 'w')

        for record in reads[0]:
            output_file.write(output_single(record))
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print >>sys.stderr, 'Writing %d sequences to %s' % \
                (len(reads[n]), n_filename)
            output_file = open(n_filename, 'w')
            for record in reads[n]:
                output_file.write(output_single(record))
Example #41
0
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for _, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(n_threads):
            cur_thrd = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set.  Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)
Example #42
0
def main():
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = ix.next()
    del ix

    extension = 'fa'
    if hasattr(record, 'accuracy'):      # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >>sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            print >>sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >>sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print >>sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                            g=g,
                                                            ext=extension
                                                            ), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g,
                                ext=extension), 'wb')
                    if n % 50000 == 0:
                        print >>sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)

                    if hasattr(record, 'accuracy'):
                        outfp.write('@{name}\n{seq}+{accuracy}\n'.format(
                            name=record.name,
                            seq=record.sequence,
                            accuracy=record.accuracy))
                    else:
                        outfp.write('>{name}\n{seq}\n'.format(
                            name=record.name,
                            seq=record.sequence))

            except IOError as e:
                print >>sys.stderr, '!! ERROR !!', e
                print >>sys.stderr, '...error splitting input. exiting...'

    except IOError as e:
        print >>sys.stderr, '!! ERROR: !!', e
        print >>sys.stderr, '...error consuming \
                            {i}. exiting...'.format(i=input_fastp)

    print >>sys.stderr, 'done consuming input sequence. \
                        added {t} tags and {l} \
                        labels...'.format(t=ht.n_tags(), l=ht.n_labels())

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >>sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >>sys.stderr, '!! ERROR: !!', error
            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \
                                         {no} orphaned] \
                                        ** {sec}s ({sect}s total)' \
                                        .format(n=_, nc=n_labeled,
                                                no=n_orphaned,
                                                sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    pass
                else:
                    if hasattr(record, 'accuracy'):
                        seq_str = fmt_fastq(name, seq, record.accuracy, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >>sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print >>sys.stderr, '! WARNING: Sweep finished with errors !'
        print >>sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >>sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >>sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)

    print >>sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >>sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
Example #43
0
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for _, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(n_threads):
            cur_thrd = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if args.write_fp_rate:
        print >> info_fp, \
            '\nfalse positive rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set. Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)
Example #44
0
def main():
    info('partition-graph.py', ['graph'])
    args = get_parser().parse_args()
    basename = args.basename

    filenames = [basename + '.pt', basename + '.tagset']
    for _ in filenames:
        check_file_status(_)

    check_space(filenames)

    print >>sys.stderr, '--'
    print >>sys.stderr, 'SUBSET SIZE', args.subset_size
    print >>sys.stderr, 'N THREADS', args.threads
    if args.stoptags:
        print >>sys.stderr, 'stoptag file:', args.stoptags
    print >>sys.stderr, '--'

    print >>sys.stderr, 'loading ht %s.pt' % basename
    htable = khmer.load_hashbits(basename + '.pt')
    htable.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print >>sys.stderr, '** This script brakes for lumps:', \
                            ' stop_big_traversals is true.'
    else:
        print >>sys.stderr, '** Traverse all the things:', \
                            ' stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print >>sys.stderr, 'starting %d threads' % n_threads
    print >>sys.stderr, '---'

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker, args=(worker_q, basename,
                                                         stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print >>sys.stderr, 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print >>sys.stderr, '---'
    print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
        (basename,)
Example #45
0
def main():
    info("load-graph.py", ["graph"])
    args = get_parser().parse_args()
    report_on_config(args, hashtype="hashbits")

    base = args.output_filename
    filenames = args.input_filenames

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.0)

    print >>sys.stderr, "Saving k-mer presence table to %s" % base
    print >>sys.stderr, "Loading kmers from sequences in %s" % repr(filenames)
    if args.no_build_tagset:
        print >>sys.stderr, "We WILL NOT build the tagset."
    else:
        print >>sys.stderr, "We WILL build the tagset", " (for partitioning/traversal)."

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print >>sys.stderr, "making k-mer presence table"
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    for _, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, 1)
        print >>sys.stderr, "consuming input", filename
        target_method(rparser)

    if args.report_total_kmers:
        print >>sys.stderr, "Total number of unique k-mers: {0}".format(htable.n_unique_kmers())

    print >>sys.stderr, "saving k-mer presence table in", base + ".pt"
    htable.save(base + ".pt")

    if not args.no_build_tagset:
        print >>sys.stderr, "saving tagset in", base + ".tagset"
        htable.save_tagset(base + ".tagset")

    info_fp = open(base + ".info", "w")
    info_fp.write("%d unique k-mers" % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, "fp rate estimated to be %1.3f" % fp_rate
    if args.write_fp_rate:
        print >> info_fp, "\nfalse positive rate estimated to be %1.3f" % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >>sys.stderr, "**"
        print >>sys.stderr, (
            "** ERROR: the graph structure is too small for " "this data set. Increase table size/# tables."
        )
        print >>sys.stderr, "**"
        sys.exit(1)

    print >>sys.stderr, "wrote to", base + ".info and", base + ".pt"
    if not args.no_build_tagset:
        print >>sys.stderr, "and " + base + ".tagset"
Example #46
0
def main():

    info('load-into-counting.py', ['counting'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print >>sys.stderr, 'Saving k-mer counting table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    filename = None

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.threads)
        threads = []
        print >>sys.stderr, 'consuming input', filename
        for _ in xrange(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print >>sys.stderr, 'mid-save', base
            htable.save(base)
        with open(base + '.info', 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >>info_fp, 'Total number of unique k-mers:', n_kmers

    print >>sys.stderr, 'saving', base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print >> sys.stderr, "Writing summmary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base), fpr=fp_rate, k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Change 0.2 only if you really grok it.  HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
        print >> sys.stderr, "for this data set. Increase tablesize/# tables."
        print >> sys.stderr, "**"
        sys.exit(1)

    print >>sys.stderr, 'DONE.'
    print >>sys.stderr, 'wrote to:', base + '.info'
Example #47
0
def main():
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_file_status(_)

    check_space(args.filenames)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    #
    # Figure out what the output filename is going to be
    #

    output_file = args.output_file
    if output_file:
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'
        output_file = open(output_filename, 'w')

    print 'Subsampling %d reads using reservoir sampling.' % args.num_reads
    print 'Subsampled reads will be placed in %s' % output_filename
    print ''

    reads = []
    total = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print 'opening', filename, 'for reading'
        for record in screed.open(filename):
            total += 1

            if total % 10000 == 0:
                print '...', total, 'reads scanned'
                if total >= args.max_reads:
                    print 'reached upper limit of %d reads (see -M); exiting' \
                        % args.max_reads
                    break

            # collect first N reads
            if total <= args.num_reads:
                reads.append(record)
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                guess = random.randint(1, total)
                if guess <= args.num_reads:
                    reads[guess - 1] = record

    # output all the subsampled reads:
    for record in reads:
        output_file.write(output_single(record))

    print ''
    print 'wrote %d reads to %s' % (len(reads), output_filename)
Example #48
0
def main():
    info('find-knots.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print 'loading k-mer presence table %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    print 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print 'loading stoptags %s.stoptags' % graphbase
        htable.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])
    print '---'
    print 'output stoptags will be in', graphbase + '.stoptags'
    if initial_stoptags:
        print '(these output stoptags will include the already-loaded set)'
    print '---'

    # create counting hash
    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print '<-', subset_file
        subset = htable.load_subset_partitionmap(subset_file)

        print '** repartitioning subset... %s' % subset_file
        htable.repartition_largest_partition(subset, counting,
                                             EXCURSION_DISTANCE,
                                             EXCURSION_KMER_THRESHOLD,
                                             EXCURSION_KMER_COUNT_THRESHOLD)

        print '** merging subset... %s' % subset_file
        htable.merge_subset(subset)

        print '** repartitioning, round 2... %s' % subset_file
        size = htable.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print '** repartitioned size:', size

        print 'saving stoptags binary'
        htable.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print '(%d of %d)\n' % (index, len(pmap_files))

    print 'done!'
Example #49
0
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('extract-partitions.py', ['graph'])
    args = get_parser().parse_args()

    distfilename = args.prefix + '.dist'

    n_unassigned = 0

    for infile in args.part_filenames:
        check_file_status(infile)

    check_space(args.part_filenames)

    print '---'
    print 'reading partitioned files:', repr(args.part_filenames)
    if args.output_groups:
        print 'outputting to files named "%s.groupN.fa"' % args.prefix
        print 'min reads to keep a partition:', args.min_part_size
        print 'max size of a group file:', args.max_size
    else:
        print 'NOT outputting groups! Beware!'

    if args.output_unassigned:
        print 'outputting unassigned reads to "%s.unassigned.fa"' % args.prefix

    print 'partition size distribution will go to %s' % distfilename
    print '---'

    #

    suffix = 'fa'
    is_fastq = False

    for index, read, pid in read_partition_file(args.part_filenames[0]):
        if hasattr(read, 'accuracy'):
            suffix = 'fq'
            is_fastq = True
        break

    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if is_fastq:
                assert hasattr(read, 'accuracy'), \
                    "all input files must be FASTQ if the first one is"
            else:
                assert not hasattr(read, 'accuracy'), \
                    "all input files must be FASTA if the first one is"

            break

    if args.output_unassigned:
        unassigned_fp = open('%s.unassigned.%s' % (args.prefix, suffix), 'w')

    count = {}
    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if index % 100000 == 0:
                print '...', index

            count[pid] = count.get(pid, 0) + 1

            if pid == 0:
                n_unassigned += 1
                if args.output_unassigned:
                    print >> unassigned_fp, output_single(read)

    if args.output_unassigned:
        unassigned_fp.close()

    if 0 in count:  # eliminate unpartitioned sequences
        del count[0]

    # develop histogram of partition sizes
    dist = {}
    for pid, size in count.items():
        dist[size] = dist.get(size, 0) + 1

    # output histogram
    distfp = open(distfilename, 'w')

    total = 0
    wtotal = 0
    for counter, index in sorted(dist.items()):
        total += index
        wtotal += counter * index
        distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal))
    distfp.close()

    if not args.output_groups:
        sys.exit(0)

    # sort groups by size
    divvy = sorted(count.items(), key=lambda y: y[1])
    divvy = [y for y in divvy if y[1] > args.min_part_size]

    # divvy up into different groups, based on having max_size sequences
    # in each group.
    total = 0
    group = set()
    group_n = 0
    group_d = {}
    for partition_id, n_reads in divvy:
        group.add(partition_id)
        total += n_reads

        if total > args.max_size:
            for partition_id in group:
                group_d[partition_id] = group_n
                # print 'group_d', partition_id, group_n

            group_n += 1
            group = set()
            total = 0

    if group:
        for partition_id in group:
            group_d[partition_id] = group_n
            # print 'group_d', partition_id, group_n
        group_n += 1

    print '%d groups' % group_n
    if group_n == 0:
        print 'nothing to output; exiting!'
        return

    # open a bunch of output files for the different groups
    group_fps = {}
    for _ in range(group_n):
        group_fp = open('%s.group%04d.%s' % (args.prefix, _, suffix), 'w')
        group_fps[_] = group_fp

    # write 'em all out!

    total_seqs = 0
    part_seqs = 0
    toosmall_parts = 0
    for filename in args.part_filenames:
        for index, read, partition_id in read_partition_file(filename):
            total_seqs += 1
            if index % 100000 == 0:
                print '...x2', index

            if partition_id == 0:
                continue

            try:
                group_n = group_d[partition_id]
            except KeyError:
                assert count[partition_id] <= args.min_part_size
                toosmall_parts += 1
                continue

            outfp = group_fps[group_n]

            outfp.write(output_single(read))
            part_seqs += 1

    print '---'
    print 'Of %d total seqs,' % total_seqs
    print 'extracted %d partitioned seqs into group files,' % part_seqs
    print 'discarded %d sequences from small partitions (see -m),' % \
        toosmall_parts
    print 'and found %d unpartitioned sequences (see -U).' % n_unassigned
    print ''
    print 'Created %d group files named %s.groupXXXX.%s' % (
        len(group_fps), args.prefix, suffix)
Example #50
0
def main():
    info("sweep-reads-buffered.py", ["sweep"])
    parser = get_parser()
    args = parser.parse_args()

    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype="hashbits")

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)
    # Check disk space availability
    check_space(all_input_files)

    output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >>sys.stderr, "consuming input sequences..."
        if args.label_by_pid:
            print >>sys.stderr, "...labeling by partition id (pid)"
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >>sys.stderr, "...labeling by sequence"
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, "...consumed {n} sequences...".format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print >>sys.stderr, "...labeling to create groups of size {s}".format(s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open("{pref}_base_{g}.fa".format(pref=output_pref, g=g), "wb")
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open("{pref}_base_{g}.fa".format(pref=output_pref, g=g), "wb")
                    if n % 50000 == 0:
                        print >>sys.stderr, "...consumed {n} sequences...".format(n=n)
                    ht.consume_sequence_and_tag_with_labels(record.sequence, label)
                    outfp.write(">{name}\n{seq}\n".format(name=record.name, seq=record.sequence))

            except IOError as e:
                print >>sys.stderr, "!! ERROR !!", e
                print >>sys.stderr, "...error splitting input. exiting..."

    except IOError as e:
        print >>sys.stderr, "!! ERROR: !!", e
        print >>sys.stderr, "...error consuming \
                            {i}. exiting...".format(
            i=input_fastp
        )

    print >>sys.stderr, "done consuming input sequence. \
                        added {t} tags and {l} \
                        labels...".format(
        t=ht.n_tags(), l=ht.n_labels()
    )

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >>sys.stderr, "** sweeping {read_file} for labels...".format(read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >>sys.stderr, "!! ERROR: !!", error
            print >>sys.stderr, "*** Could not open {fn}, skipping...".format(fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, "\tswept {n} reads [{nc} labeled, \
                                         {no} orphaned] \
                                        ** {sec}s ({sect}s total)".format(
                        n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t
                    )
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    pass
                else:
                    seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, "multi")
                            n_mlabeled += 1
                            label_dict["multi"] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, "orphaned")
                        label_dict["orphaned"] += 1
            print >>sys.stderr, "** End of file {fn}...".format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >>sys.stderr, "** End of run..."
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print >>sys.stderr, "! WARNING: Sweep finished with errors !"
        print >>sys.stderr, "** {writee} reads not written".format(writee=output_buffer.num_write_errors)
        print >>sys.stderr, "** {filee} errors opening files".format(filee=output_buffer.num_file_errors)

    print >>sys.stderr, "swept {n_reads} for labels...".format(n_reads=n_labeled + n_orphaned)
    print >>sys.stderr, "...with {nc} labeled and {no} orphaned".format(nc=n_labeled, no=n_orphaned)
    print >>sys.stderr, "...and {nmc} multilabeled".format(nmc=n_mlabeled)

    print >>sys.stderr, "** outputting label number distribution..."
    fn = os.path.join(outdir, "{pref}.dist.txt".format(pref=output_pref))
    with open(fn, "wb") as outfp:
        for nc in label_number_dist:
            outfp.write("{nc}\n".format(nc=nc))

    fn = os.path.join(outdir, "{pref}.counts.csv".format(pref=output_pref))
    print >>sys.stderr, "** outputting label read counts..."
    with open(fn, "wb") as outfp:
        for k in label_dict:
            outfp.write("{l},{c}\n".format(l=k, c=label_dict[k]))
Example #51
0
def main():  # pylint: disable=too-many-locals,too-many-statements
    info('do-partition.py', ['graph'])
    args = get_parser().parse_args()

    report_on_config(args, hashtype='hashbits')

    for infile in args.input_filenames:
        check_file_status(infile)

    check_space(args.input_filenames)

    print 'Saving k-mer presence table to %s' % args.graphbase
    print 'Loading kmers from sequences in %s' % repr(args.input_filenames)

    print '--'
    print 'SUBSET SIZE', args.subset_size
    print 'N THREADS', args.n_threads
    print '--'

    # load-graph

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    for _, filename in enumerate(args.input_filenames):
        print 'consuming input', filename
        htable.consume_fasta_and_tag(filename)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for"
                              " this data set.  Increase k-mer presence table "
                              "size/num of tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print '** This script brakes for lumps: stop_big_traversals is true.'
    else:
        print '** Traverse all the things: stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
                                                % (n_subsets))

    if n_subsets < args.n_threads:
        args.n_threads = n_subsets

    # start threads!
    print 'starting %d threads' % args.n_threads
    print '---'

    threads = []
    for _ in range(args.n_threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    print 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print '---'
    print 'done making subsets! see %s.subset.*.pmap' % (args.graphbase,)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])

    htable = khmer.new_hashbits(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (part_count, infile)
        print 'partitions are in', outfile
Example #52
0
def main():

    info('collect-reads.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading sequences from %s' % repr(filenames)
    if args.output:
        print 'Outputting sequences to', args.output

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    total_coverage = 0.
    n = 0

    for index, filename in enumerate(filenames):
        for record in screed.open(filename):
            seq = record.sequence.upper()
            if 'N' in seq:
                seq = seq.replace('N', 'G')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            total_coverage += med
            n += 1

            if total_coverage / float(n) > args.coverage:
                print 'reached target average coverage:', \
                      total_coverage / float(n)
                break

            htable.consume(seq)
            if args.output:
                args.output.write(output_single(record))

            if n % 100000 == 0:
                print '...', index, filename, n, total_coverage / float(n)

        if total_coverage / float(n) > args.coverage:
            break

    print 'Collected %d reads' % (n, )

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filenames[-1])

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " this data set.  Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'
Example #53
0
def main():  # pylint: disable=too-many-locals,too-many-statements
    info('do-partition.py', ['graph'])
    args = get_parser().parse_args()

    report_on_config(args, hashtype='hashbits')

    for infile in args.input_filenames:
        check_file_status(infile)

    check_space(args.input_filenames)

    print 'Saving k-mer presence table to %s' % args.graphbase
    print 'Loading kmers from sequences in %s' % repr(args.input_filenames)

    print '--'
    print 'SUBSET SIZE', args.subset_size
    print 'N THREADS', args.n_threads
    print '--'

    # load-graph

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    for _, filename in enumerate(args.input_filenames):
        print 'consuming input', filename
        htable.consume_fasta_and_tag(filename)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for"
                              " this data set.  Increase k-mer presence table "
                              "size/num of tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print '** This script brakes for lumps: stop_big_traversals is true.'
    else:
        print '** Traverse all the things: stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % args.graphbase,
         'w').write('%d subsets total\n' % (n_subsets))

    if n_subsets < args.n_threads:
        args.n_threads = n_subsets

    # start threads!
    print 'starting %d threads' % args.n_threads
    print '---'

    threads = []
    for _ in range(args.n_threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    print 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print '---'
    print 'done making subsets! see %s.subset.*.pmap' % (args.graphbase, )

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])

    htable = khmer.new_hashbits(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (part_count, infile)
        print 'partitions are in', outfile
Example #54
0
def main():

    info('collect-reads.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading sequences from %s' % repr(filenames)
    if args.output:
        print 'Outputting sequences to', args.output

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    total_coverage = 0.
    n = 0

    for index, filename in enumerate(filenames):
        for record in screed.open(filename):
            seq = record.sequence.upper()
            if 'N' in seq:
                seq = seq.replace('N', 'G')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            total_coverage += med
            n += 1

            if total_coverage / float(n) > args.coverage:
                print 'reached target average coverage:', \
                      total_coverage / float(n)
                break

            htable.consume(seq)
            if args.output:
                args.output.write(output_single(record))

            if n % 100000 == 0:
                print '...', index, filename, n, total_coverage / float(n)

        if total_coverage / float(n) > args.coverage:
            break

    print 'Collected %d reads' % (n,)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filenames[-1])

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " this data set.  Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'