Example #1
def main():
    parser = build_construct_args()
    parser.add_argument('-l',
                        '--lower-cutoff',
                        type=int,
                        dest='lower_cutoff',
                        default=DEFAULT_LOWER_CUTOFF)
    parser.add_argument('-u',
                        '--upper-cutoff',
                        type=int,
                        dest='upper_cutoff',
                        default=DEFAULT_UPPER_CUTOFF)

    parser.add_argument('output_filename')
    parser.add_argument('input_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, "** WARNING: hashsize is default!  " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >> sys.stderr, ''
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    output = args.output_filename
    input = args.input_filename

    print 'lower cutoff:', args.lower_cutoff
    print 'upper cutoff:', args.upper_cutoff
    print 'Saving stoptags to %s' % output
    print 'Loading sequences in %s' % input

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    print 'consuming input', input
    hb = ht.collect_high_abundance_kmers(input, args.lower_cutoff,
                                         args.upper_cutoff)

    print 'saving stoptags', output
    hb.save_stop_tags(output)
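
These excerpts omit their module preambles; build_construct_args, DEFAULT_LOWER_CUTOFF, and the other DEFAULT_* constants are defined elsewhere in each script. A hypothetical preamble that would make Example #1 self-contained follows; the import path and the default values are assumptions, not the originals.

# Hypothetical preamble for Example #1. The import location and the
# DEFAULT_* values below are assumptions, not the original script's.
import sys

import khmer
from khmer.counting_args import build_construct_args   # path assumed

DEFAULT_LOWER_CUTOFF = 2000       # placeholder value
DEFAULT_UPPER_CUTOFF = 65535      # placeholder value
DEFAULT_MIN_HASHSIZE = 1e6        # placeholder value
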
Example #2
def main():
    parser = build_construct_args()
    add_threading_args(parser)

    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, count.
    total = 0
    total_unique = 0
    for n, record in enumerate(screed.open(filename)):
        total += 1
        last_kmer = record.sequence[-K:]
        count = ht.get(last_kmer)
        if count == 1:
            total_unique += 1

    print 'singletons: %d unique; of %d total; %.3f' % \
        (total_unique, total, total_unique/float(total))
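
Example #2 loads every k-mer of the input into the counting hash and then asks how many reads end in a k-mer seen exactly once. The same statistic can be computed without khmer as a sanity check; a small exact-count sketch (function name and toy data are illustrative only):

# Exact-count version of the singleton statistic from Example #2.
from collections import defaultdict

def singleton_fraction(seqs, k):
    counts = defaultdict(int)
    for s in seqs:                  # count every k-mer, as consume_fasta does
        for i in xrange(len(s) - k + 1):
            counts[s[i:i + k]] += 1
    lasts = [s[-k:] for s in seqs if len(s) >= k]
    unique = sum(1 for km in lasts if counts[km] == 1)
    return unique, len(lasts)

print singleton_fraction(['ACGTACGT', 'ACGTACGA'], 4)    # -> (1, 2)
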
Example #4
def main():
    parser = build_construct_args()
    parser.add_argument('-l', '--lower-cutoff', type=int, dest='lower_cutoff',
                        default=DEFAULT_LOWER_CUTOFF)
    parser.add_argument('-u', '--upper-cutoff', type=int, dest='upper_cutoff',
                        default=DEFAULT_UPPER_CUTOFF)

    parser.add_argument('output_filename')
    parser.add_argument('input_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, "** WARNING: hashsize is default!  " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (
            args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    output = args.output_filename
    input = args.input_filename

    print 'lower cutoff:', args.lower_cutoff
    print 'upper cutoff:', args.upper_cutoff
    print 'Saving stoptags to %s' % output
    print 'Loading sequences in %s' % input

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    print 'consuming input', input
    hb = ht.collect_high_abundance_kmers(input,
                                         args.lower_cutoff,
                                         args.upper_cutoff)

    print 'saving stoptags', output
    hb.save_stop_tags(output)
Example #5
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print>>sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print>>sys.stderr, '\nPARAMETERS:'
        print>>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print>>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print>>sys.stderr, ''
        print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print>>sys.stderr, '-'*8

    K=args.ksize
    HT_SIZE=args.min_hashsize
    N_HT=args.n_hashes
    DESIRED_COVERAGE=args.cutoff
    report_fp = args.report_file

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        n = -1
        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 100000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

                if report_fp:
                    print>>report_fp, total, total - discarded, \
                        1. - (discarded / float(total))
                    report_fp.flush()

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = ht.get_median_count(seq)

            if med < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        if -1 < n:
            print 'DONE with', input_filename, '; kept', total - discarded, \
                'of', total, 'or', \
                int(100. - discarded / float(total) * 100.), '%'
            print 'output in', output_name
        else:
            print 'SKIPPED empty file', input_filename

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
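
Example #5 is the digital-normalization core: a read is kept, and its k-mers counted, only while the estimated median k-mer count across the read is still below the coverage cutoff. The decision rule in isolation, with an exact dict counter standing in for khmer's approximate counting hash; a sketch, not khmer's implementation:

# Keep/discard rule from Example #5, with exact k-mer counts.
from collections import defaultdict

def keep_read(seq, k, counts, desired_coverage):
    kmers = [seq[i:i + k] for i in xrange(len(seq) - k + 1)]
    med = sorted(counts[km] for km in kmers)[len(kmers) // 2]
    if med < desired_coverage:
        for km in kmers:            # only kept reads add to the counts
            counts[km] += 1
        return True                 # keep: this region is undersampled
    return False                    # discard: already at desired coverage

counts = defaultdict(int)
assert keep_read('ACGTACGT', 4, counts, 2)    # first copy is always kept
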
Example #6
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print>>sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print>>sys.stderr, '\nPARAMETERS:'
        print>>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print>>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print>>sys.stderr, ' - paired =	      %s \t\t(-p)' % args.paired
        print>>sys.stderr, ''
        print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print>>sys.stderr, '-'*8

    K=args.ksize
    HT_SIZE=args.min_hashsize
    N_HT=args.n_hashes
    DESIRED_COVERAGE=args.cutoff
    report_fp = args.report_file
    filenames = args.input_filenames

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        n = -1
        for n, batch in enumerate(batchwise(screed.open(
                input_filename), batch_size)):
            if n > 0 and n % 100000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

                if report_fp:
                    print>>report_fp, total, total - discarded, \
                        1. - (discarded / float(total))
                    report_fp.flush()

            total += batch_size

            # If in paired mode, check that the reads are properly interleaved
            if args.paired:
                if not validpair(batch[0], batch[1]):
                    print >>sys.stderr, \
                        'Error: Improperly interleaved pairs %s %s' \
                        % (batch[0].name, batch[1].name)
                    sys.exit(-1)

            # Emit the batch of reads if any read passes the filter
            # and all reads are longer than K
            passed_filter = False
            passed_length = True
            for record in batch:
                if len(record.sequence) < K:
                    passed_length = False
                    continue

                seq = record.sequence.replace('N', 'A')
                med, _, _ = ht.get_median_count(seq)

                if med < DESIRED_COVERAGE:
                    ht.consume(seq)
                    passed_filter = True
            
            # Emit records if any passed
            if passed_length and passed_filter:
                for record in batch:
                    if hasattr(record, 'accuracy'):
                        outfp.write('@%s\n%s\n+\n%s\n' % (record.name,
                                                          record.sequence,
                                                          record.accuracy))
                    else:
                        outfp.write(
                            '>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += batch_size

        if -1 < n:
            print 'DONE with', input_filename, '; kept', total - discarded, \
                'of', total, 'or', \
                int(100. - discarded / float(total) * 100.), '%'
            print 'output in', output_name
        else:
            print 'SKIPPED empty file', input_filename

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
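
batchwise and validpair come from the surrounding script and are not shown in the example. Plausible minimal versions follow, under the common convention that interleaved pairs are named 'foo/1' and 'foo/2'; both the convention and these bodies are assumptions:

import itertools

def batchwise(coll, size):
    # Group an iterable into non-overlapping tuples of `size` records.
    it = iter(coll)
    return itertools.izip(*[it] * size)    # itertools.izip: Python 2

def validpair(read0, read1):
    # Assumed convention: 'name/1' is immediately followed by 'name/2'.
    return read0.name[-1] == '1' and read1.name[-1] == '2' and \
        read0.name[:-2] == read1.name[:-2]
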
Example #7
def main():
    parser = build_construct_args()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print>>sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print>>sys.stderr, '\nPARAMETERS:'
        print>>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print>>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print>>sys.stderr, ''
        print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print>>sys.stderr, '-'*8

    K=args.ksize
    HT_SIZE=args.min_hashsize
    N_HT=args.n_hashes

    inputlist = args.input_filenames
    readsfile = args.read_filename

    query_list = []
    for n, inp_name in enumerate(inputlist):
        # create a hashbits data structure
        # ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
        # create a counting hash
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

        outfile = os.path.basename(inp_name) + '.sweep3'
        outfp = open(outfile, 'w')
        query_list.append((ht, outfp))

    for n, inp_name in enumerate(inputlist):
        ht = query_list[n][0]

        # load contigs, connect into N partitions
        print 'loading input reads from', inp_name
        ht.consume_fasta(inp_name)

        # Change 0.2 only if you really grok it.  HINT: You don't.
        fp_rate = khmer.calc_expected_collisions(ht)
        print 'fp rate estimated to be %1.3f' % fp_rate

        if fp_rate > 0.20:
            print >>sys.stderr, "**"
            print >>sys.stderr, "** ERROR: the counting hash is too small for"
            print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
            print >>sys.stderr, "**"
            print >>sys.stderr, "** Do not use these results!!"
            sys.exit(-1)

    print 'starting sweep.'

    n = 0
    m = 0
    for n, record in enumerate(screed.open(readsfile)):
        if n % 10000 == 0:
            print '...', n, m

        for ht, outfp in query_list:
            count = ht.get_median_count(record.sequence)[0]
            if count:
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
Example #8
def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)
    
    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []
    def do_abundance_dist(r):
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=do_abundance_dist,
                args=(rparser,)
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)
    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >>sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >>sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >>fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
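
Each round-2 thread appends one per-thread abundance array to z_list, and the loop that follows sums those arrays elementwise into a single histogram. That merge step in isolation:

# Elementwise merge of per-thread abundance arrays (Example #8).
def merge_dists(dists):
    merged = {}
    for dist in dists:
        for abundance, count in enumerate(dist):
            merged[abundance] = merged.get(abundance, 0) + count
    return merged

assert merge_dists([[0, 2, 1], [0, 1, 0]]) == {0: 0, 1: 3, 2: 1}
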
Example #9
def main():
    parser = build_construct_args()
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('-R',
                        '--report-to-file',
                        dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('-f',
                        '--force-processing',
                        dest='force',
                        help='continue on next file if read errors '
                             'are encountered',
                        action='store_true')
    parser.add_argument('-d',
                        '--dump-frequency',
                        dest='dump_frequency',
                        type=int,
                        help='dump hashtable every d files',
                        default=-1)
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE and not args.loadhash:
            print >>sys.stderr, \
                "** WARNING: hashsize is default!  " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, \
            ' - kmer size =    {ksize:d} \t\t(-k)'.format(ksize=args.ksize)
        print >>sys.stderr, \
            ' - n hashes =     {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes)
        print >>sys.stderr, \
            ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize)
        print >> sys.stderr, ' - paired = {pr} \t\t(-p)'.format(pr=args.paired)
        print >> sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is {prod:.2g} bytes ' \
            '(n_hashes x min_hashsize)'.format(
                prod=args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    report_fp = args.report_file
    filenames = args.input_filenames
    force = args.force
    dump_frequency = args.dump_frequency

    # list to save error files along with throwing exceptions
    if force is True:
        corrupt_files = []

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for n, input_filename in enumerate(filenames):
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(
                input_filename, outfp, ht, args, report_fp)
        except IOError as e:
            handle_error(e, output_name, input_filename, ht)
            if not force:
                print >> sys.stderr, '** Exiting!'
                sys.exit(-1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
                pass
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename,
                            kept=total - discarded, total=total,
                            perc=int(100. - discarded / float(total) * 100.))
                print 'output in', output_name

        if dump_frequency > 0 and n > 0 and n % dump_frequency == 0:
            print 'Backup: Saving hashfile through', input_filename
            if args.savehash:
                hashname = args.savehash
                print '...saving to', hashname
            else:
                hashname = 'backup.ht'
                print 'Nothing given for savehash, saving to', hashname
            ht.save(hashname)

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
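
normalize_by_median and handle_error are defined elsewhere in this script. A minimal sketch of normalize_by_median under the contract used above, returning (reads seen, reads discarded); it mirrors the inline loop of Example #5 and skips pairing and progress reporting:

import screed    # assumed available, as in the other examples

def normalize_by_median(input_filename, outfp, ht, args, report_fp=None):
    # Sketch only; the real helper also handles pairs and reporting.
    total = discarded = 0
    for record in screed.open(input_filename):
        total += 1
        if len(record.sequence) < args.ksize:
            continue
        seq = record.sequence.replace('N', 'A')
        med, _, _ = ht.get_median_count(seq)
        if med < args.cutoff:
            ht.consume(seq)
            outfp.write('>%s\n%s\n' % (record.name, record.sequence))
        else:
            discarded += 1
    return total, discarded
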
Example #10
def main():
    parser = build_construct_args()
    parser.add_argument("-C", "--cutoff", type=int, dest="cutoff", default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument("-s", "--savehash", dest="savehash", default="")
    parser.add_argument("-l", "--loadhash", dest="loadhash", default="")
    parser.add_argument("input_filenames", nargs="+")

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE and not args.loadhash:
            print >> sys.stderr, "** WARNING: hashsize is default!  " "You absodefly want to increase this!\n** " "Please read the docs!"

        print >> sys.stderr, "\nPARAMETERS:"
        print >> sys.stderr, " - kmer size =    %d \t\t(-k)" % args.ksize
        print >> sys.stderr, " - n hashes =     %d \t\t(-N)" % args.n_hashes
        print >> sys.stderr, " - min hashsize = %-5.2g \t(-x)" % args.min_hashsize
        print >> sys.stderr, ""
        print >> sys.stderr, "Estimated memory usage is %.2g bytes " "(n_hashes x min_hashsize)" % (
            args.n_hashes * args.min_hashsize
        )
        print >> sys.stderr, "-" * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print "loading hashtable from", args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print "making hashtable"
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + ".minkeep"
        outfp = open(output_name, "w")

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print "... kept", total - discarded, "of", total, ", or", int(
                    100.0 - discarded / float(total) * 100.0
                ), "%"
                print "... in file", input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace("N", "A")
            mincount = ht.get_min_count(seq)

            if mincount < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write(">%s\n%s\n" % (record.name, record.sequence))
            else:
                discarded += 1

        print "DONE with", input_filename, "; kept", total - discarded, "of", total, "or", int(
            100.0 - discarded / float(total) * 100.0
        ), "%"
        print "output in", output_name

    if args.savehash:
        print "Saving hashfile through", input_filename
        print "...saving to", args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print "fp rate estimated to be %1.3f" % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example #11
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default!  " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, \
            ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >>sys.stderr, ' - paired =\t      %s \t\t(-p)' % args.paired
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' \
            % (args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff
    report_fp = args.report_file
    filenames = args.input_filenames

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        n = -1
        for n, batch in enumerate(batchwise(screed.open(
                input_filename), batch_size)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

                if report_fp:
                    print>>report_fp, total, total - discarded, \
                        1. - (discarded / float(total))
                    report_fp.flush()

            total += batch_size

            # If in paired mode, check that the reads are properly interleaved
            if args.paired:
                if not validpair(batch[0], batch[1]):
                    print >>sys.stderr, \
                        'Error: Improperly interleaved pairs %s %s' \
                        % (batch[0].name, batch[1].name)
                    sys.exit(-1)

            # Emit the batch of reads if any read passes the filter
            # and all reads are longer than K
            passed_filter = False
            passed_length = True
            for record in batch:
                if len(record.sequence) < K:
                    passed_length = False
                    continue

                seq = record.sequence.replace('N', 'A')
                med, _, _ = ht.get_median_count(seq)

                if med < DESIRED_COVERAGE:
                    ht.consume(seq)
                    passed_filter = True

            # Emit records if any passed
            if passed_length and passed_filter:
                for record in batch:
                    if hasattr(record, 'accuracy'):
                        outfp.write('@%s\n%s\n+\n%s\n' % (record.name,
                                                          record.sequence,
                                                          record.accuracy))
                    else:
                        outfp.write(
                            '>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += batch_size

        if -1 < n:
            print \
                'DONE with', input_filename, '; kept', total - discarded, \
                'of', total, 'or', \
                int(100. - discarded / float(total) * 100.), '%'
            print 'output in', output_name
        else:
            print 'SKIPPED empty file', input_filename

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example #12
def main():
    parser = build_construct_args()
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >> sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >> sys.stderr, ''
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (
            args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepkad'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace('N', 'A')
            kad = ht.get_kadian_count(seq)

            if kad < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
            total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example #13
def main():
    parser = build_construct_args()
    parser.add_argument("output_filename")
    parser.add_argument("input_filenames", nargs="+")

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >> sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print >> sys.stderr, "\nPARAMETERS:"
        print >> sys.stderr, " - kmer size =    %d \t\t(-k)" % args.ksize
        print >> sys.stderr, " - n hashes =     %d \t\t(-N)" % args.n_hashes
        print >> sys.stderr, " - min hashsize = %-5.2g \t(-x)" % args.min_hashsize
        print >> sys.stderr, ""
        print >> sys.stderr, "Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)" % (
            args.n_hashes * args.min_hashsize
        )
        print >> sys.stderr, "-" * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print "Saving hashtable to %s" % base
    print "Loading kmers from sequences in %s" % repr(filenames)

    ###

    print "making hashtable"
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    for n, filename in enumerate(filenames):
        print "consuming input", filename
        ht.consume_fasta(filename)

        if n > 0 and n % 10 == 0:
            print "mid-save", base
            ht.save(base)
            open(base + ".info", "w").write("through %s" % filename)

    print "saving", base
    ht.save(base)

    info_fp = open(base + ".info", "w")
    info_fp.write("through end: %s\n" % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print "fp rate estimated to be %1.3f" % fp_rate
    print >> info_fp, "fp rate estimated to be %1.3f" % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)
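
set_use_bigcount(True) in Example #13 matters because each slot of a khmer counting hash is a single byte, so counts saturate at 255 unless bigcount spills larger values into a side table. A toy model of the two behaviors:

# Toy model of a one-byte counter slot with and without bigcount.
def bump(count, bigcount=False):
    if bigcount:
        return count + 1              # khmer tracks >255 in a side table
    return min(count + 1, 255)        # plain 8-bit slot saturates

assert bump(255) == 255 and bump(255, bigcount=True) == 256
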
Example #14
def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z',
                        '--no-zero',
                        dest='output_zero',
                        default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s',
                        '--squash',
                        dest='squash_output',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []

    def do_abundance_dist(r):
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=do_abundance_dist,
                args=(rparser,)
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)
    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >>sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Example #15
def main():
    parser = build_construct_args()
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print>>sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print>>sys.stderr, '\nPARAMETERS:'
        print>>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print>>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print>>sys.stderr, ''
        print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print>>sys.stderr, '-'*8


    K=args.ksize
    HT_SIZE=args.min_hashsize
    N_HT=args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###
    
    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta(filename)

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >>info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
Example #16
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    #

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = \
                threading.Thread(
                    target=ht.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
Example #17
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
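
trim_on_abundance truncates a read just before its first k-mer whose count is below the cutoff, and process_fn above then drops reads whose trim point is shorter than K. The rule in spirit, with exact counts in a plain dict (khmer's return convention may differ in edge cases):

# Sketch of abundance trimming: cut before the first k-mer whose count
# falls below `cutoff`; returns (trimmed sequence, trim position).
def trim_on_abundance(seq, k, counts, cutoff):
    for i in xrange(len(seq) - k + 1):
        if counts.get(seq[i:i + k], 0) < cutoff:
            return seq[:i + k - 1], i + k - 1
    return seq, len(seq)
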
Example #18
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = \
                threading.Thread(
                    target=ht.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >>info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
Example #19
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('-f', '--force-processing', dest='force',
                        help='continue on next file if read errors '
                        'are encountered', action='store_true')
    parser.add_argument('-d', '--dump-frequency', dest='dump_frequency',
                        type=int, help='dump hashtable every d files',
                        default=-1)
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default!  " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, \
            ' - kmer size =    {ksize:d} \t\t(-k)'.format(ksize=args.ksize)
        print >>sys.stderr, \
            ' - n hashes =     {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes)
        print >>sys.stderr, \
            ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize)
        print >>sys.stderr, ' - paired = {pr} \t\t(-p)'.format(pr=args.paired)
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is {prod:.2g} bytes ' \
            '(n_hashes x min_hashsize)'.format(
                prod=args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    report_fp = args.report_file
    filenames = args.input_filenames
    force = args.force
    dump_frequency = args.dump_frequency

    # list to save error files along with throwing exceptions
    if force:
        corrupt_files = []

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for n, input_filename in enumerate(filenames):
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')
        
        total_acc = 0
        discarded_acc = 0
        
        try:
            total_acc, discarded_acc = normalize_by_median(input_filename, 
                                                           outfp, ht, args,
                                                           report_fp)
        except IOError as e:
            handle_error(e, output_name, input_filename, ht)
            if not force:
                print >>sys.stderr, '** Exiting!'
                sys.exit(-1)
            else:
                print >>sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
                pass
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename,
                            kept=total - discarded, total=total,
                            perc=int(100. - discarded / float(total) * 100.))
                print 'output in', output_name

        if dump_frequency > 0 and n > 0 and n % dump_frequency == 0:
            print 'Backup: Saving hashfile through', input_filename
            if args.savehash:
                hashname = args.savehash
                print '...saving to', hashname
            else:
                hashname = 'backup.ht'
                print 'Nothing given for savehash, saving to', hashname
            ht.save(hashname)
            

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)
    
    if force and len(corrupt_files) > 0:
        print >>sys.stderr, "** WARNING: Finished with errors!"
        print >>sys.stderr, "** IOErrors occurred in the following files:"
        print >>sys.stderr, "\t", " ".join(corrupt_files)
    
    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example #20
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')
    
    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, trim.

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Example #21
def main():
    parser = build_construct_args()
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >> sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >> sys.stderr, ''
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (
            args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
    ht.set_use_bigcount(True)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta(filename)

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
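
Like every excerpt here, Example #21 relies on the usual entry-point guard, which the listing omits, and is driven from the command line. A hypothetical invocation follows; the script and file names are placeholders, while -k/-N/-x are the flags the script itself prints in its PARAMETERS block:

if __name__ == '__main__':
    main()

# Hypothetical command line (names are placeholders):
#   python load-into-counting.py -k 20 -N 4 -x 1e9 counts.kh reads1.fa reads2.fa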