Example #1
    ref_file_names_to_loc[base_name] = line

fid.close()

# get the locations of the reference sequences to build
out_fid = open(out_file, 'w')
ref_loc_to_build = []
for name in names_passed_thresh:
    ref_loc_to_build.append(
        ref_file_names_to_loc[name])  # if using the bz2 extension
    #ref_loc_to_build.append(ref_file_names_to_loc[os.path.splitext(name)[0]])  # since bbmap stuff changed the file extension

# This uses khmer to merge the contigs and put them in one fasta file
for loc in ref_loc_to_build:
    print(os.path.basename(loc))
    fid = khmer.ReadParser(loc)
    seq = ""
    i = 0
    for record in fid:
        if i == 0:
            header = record.name
        seq += "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"
        seq += record.sequence
        i += 1
    print("there")
    record = Record(name=header, sequence=seq)
    write_record(record, out_fid)
    fid.close()
out_fid.close()

# This relies on using bbmap to do the contig merging, and then will use cat to concatenate them
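A minimal sketch of the cat-based concatenation mentioned in the comment above, assuming the per-reference merged-contig FASTA files already exist; the file names and output path below are placeholders, not part of the original pipeline.

# Hedged sketch: concatenate merged-contig FASTA files with `cat`.
# The paths here are illustrative only.
import subprocess

merged_contig_files = ["ref1_merged.fa", "ref2_merged.fa"]  # placeholder paths
with open("combined_refs.fa", "wb") as out_fh:
    subprocess.run(["cat"] + merged_contig_files, stdout=out_fh, check=True)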
Example #2
def test_read_bundler_empty_file():
    infile = utils.get_test_data('empty-file')
    with pytest.raises(OSError):
        records = [r for r in khmer.ReadParser(infile)]
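Since the test above expects ReadParser to raise OSError on an empty file, a caller can guard the parse explicitly; a minimal sketch with an illustrative file name.

# Hedged sketch: guard ReadParser against empty or unreadable input,
# mirroring the OSError the test above exercises. 'reads.fa' is a placeholder.
import sys
import khmer

try:
    records = list(khmer.ReadParser("reads.fa"))
except OSError as err:
    print("could not parse input:", err, file=sys.stderr)
    records = []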
Example #3
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            print((trimmed_record,))
            write_record(trimmed_record, outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
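The filtering step above delegates to trim_record; the underlying idea, which later examples invoke directly via trim_on_abundance, is to truncate a read at the first k-mer whose count drops below the cutoff. A toy sketch of that primitive, with illustrative sizes and sequences (not the script's defaults):

# Hedged illustration of abundance trimming: count k-mers, then truncate a
# read where its k-mer count falls below the cutoff. All values are toys.
import khmer

ksize, cutoff = 5, 2
graph = khmer.Countgraph(ksize, 100000, 4)
for _ in range(2):
    graph.consume("ACGTACGTACGTACGT")   # this read is seen twice: abundance 2

trimmed_seq, trim_at = graph.trim_on_abundance("ACGTACGTTTTTTTTT", cutoff)
print(trimmed_seq, trim_at)             # the read is cut where coverage drops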
Example #4
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)
    report_on_config(args)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to: ', outfile
Example #5
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)
    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    htable = khmer_args.create_countgraph(args)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers()),
              file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile, file=sys.stderr)

    if args.savetable:
        print('Saving k-mer counting table filename',
              args.savetable,
              file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        htable.save(args.savetable)
    print('wrote to: ', outfile, file=sys.stderr)
Example #6
def main():

    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(args.n_threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print 'mid-save', base
            htable.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " this data set.  Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'
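Several of these scripts repeat the same loading idiom: one ReadParser shared by several threads that all feed the same counting structure. A stripped-down sketch of that pattern follows; the input path, table sizes, and thread count are placeholders, and the consume method appears as consume_fasta_with_reads_parser in the older examples and consume_seqfile_with_reads_parser in the newer ones.

# Hedged distillation of the threaded-loading pattern used throughout these
# examples: N threads share one ReadParser and populate one countgraph.
import threading
import khmer

graph = khmer.Countgraph(21, 100000, 4)          # placeholder sizes
rparser = khmer.ReadParser("reads.fa")           # placeholder input
threads = [threading.Thread(target=graph.consume_seqfile_with_reads_parser,
                            args=(rparser,))
           for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print("unique k-mers:", graph.n_unique_kmers())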
Example #7
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename)
    check_space([args.input_sequence_filename])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    if (not args.squash_output
            and os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')

    print 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables, args.threads)
    counting_hash.set_use_bigcount(args.bigcount)

    print 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print 'kmer_size:', counting_hash.ksize()
    print 'k-mer counting table sizes:', counting_hash.hashsizes()
    print 'outputting to', args.output_histogram_filename

    khmer.get_config().set_reads_input_buffer_size(args.threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print 'consuming input, round 1 --', args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print 'preparing hist from %s...' % args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename, args.threads)
    threads = []
    print 'consuming input, round 2 --', args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> hist_fp, _, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print 'Saving k-mer counting table ', args.savetable
        print '...saving to', args.savetable
        counting_hash.save(args.savetable)
Example #8
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    #

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = \
                threading.Thread(
                    target=ht.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
Example #9
def verbose_loader(filename):
    """Read iterator that additionally prints progress info to stderr."""
    for num, record in enumerate(khmer.ReadParser(filename)):
        if num % 100000 == 0:
            log_info('... filtering {num}', num=num)
        yield record
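A possible way to drive the generator above; the input path is a placeholder, and log_info comes from the same khmer scripting helpers the surrounding examples import.

# Hedged usage sketch for verbose_loader; 'reads.fa' is illustrative.
n_reads = 0
for record in verbose_loader("reads.fa"):
    n_reads += 1
print("processed", n_reads, "reads")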
Example #10
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset',
                        '-n',
                        default=False,
                        action='store_true',
                        dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    #

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    if args.no_build_tagset:
        target_method = ht.consume_fasta_with_reads_parser
    else:
        target_method = ht.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        ht.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % ht.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the graph structure is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)
Example #11
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Example #12
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script will create node graph for a given k-mer size and query file (can be used as input to QueryDNADatabase.py)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-fp',
                        '--fp_rate',
                        type=restricted_float,
                        help="False positive rate.",
                        default=0.0001)
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        help=
        "Location of Node Graph. Will only insert query k-mers in bloom filter if they appear anywhere in the training"
        " database. Note that the Jaccard estimates will now be "
        "J(query intersect union_i training_i, training_i) instead of J(query, training_i), "
        "but will use significantly less space (unfortunately will also disable threading)."
    )
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="K-mer size",
                        default=21)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('out_dir', help='Output directory')

    # Parse and check args
    args = parser.parse_args()
    query_file = os.path.abspath(args.in_file)
    ksize = args.k_size
    num_threads = args.threads
    node_graph_out = os.path.join(
        os.path.abspath(args.out_dir),
        os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
    if args.intersect_nodegraph is not None:
        intersect_nodegraph_file = args.intersect_nodegraph
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. Please re-run MakeDNADatabase.py with the -i flag."
            )
        try:
            intersect_nodegraph = khmer.load_nodegraph(
                intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                    % (intersect_nodegraph_file, intersect_nodegraph.ksize(),
                       ksize))
        except:
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file)
    fprate = args.fp_rate
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(query_file)
    full_kmer_count_estimate = hll.estimate_cardinality()
    res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
    if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
        sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        #sample_kmers.consume_seqfile(query_file)
        rparser = khmer.ReadParser(query_file)
        threads = []
        for _ in range(num_threads):
            cur_thrd = threading.Thread(
                target=sample_kmers.consume_seqfile_with_reads_parser,
                args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()
        for thread in threads:
            thread.join()
    else:  # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list
        # (WARNING: this will cause the Jaccard index to be calculated in terms of J(query\intersect hash_list, training)
        #  instead of J(query, training)
        # (TODO: fix this after khmer is updated)
        #intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to khmer bug
        intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()  # Doesn't work due to khmer bug
        if intersect_nodegraph_kmer_count < full_kmer_count_estimate:  # At max, we have as many k-mers as in the union of the training database (But makes this always return 0)
            res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
        else:
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
        for record in screed.open(query_file):
            seq = record.sequence
            for i in range(len(seq) - ksize + 1):
                kmer = seq[i:i + ksize]
                if intersect_nodegraph.get(kmer) > 0:
                    sample_kmers.add(kmer)
    # Save the sample_kmers
    sample_kmers.save(node_graph_out)
Example #13
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    for _, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, 1)
        print 'consuming input', filename
        target_method(rparser)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if args.write_fp_rate:
        print >> info_fp, \
            '\nfalse positive rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set. Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)
Example #14
def load_sample_seqfile(seqfiles,
                        ksize,
                        memory,
                        maxfpr=0.2,
                        count=True,
                        smallcount=False,
                        mask=None,
                        maskmaxabund=0,
                        consume_masked=False,
                        numbands=None,
                        band=None,
                        outfile=None,
                        numthreads=1):
    """Compute k-mer abundances for the specified sequence input.

    Expected input is a list of one or more FASTA/FASTQ files corresponding
    to a single sample. A sketch is created and populated with abundances
    of all k-mers observed in the input. If `mask` is provided, only k-mers not
    present in the mask will be loaded.
    """
    numtables = 4
    sketchtype = 'nodegraph'
    if count:
        sketchtype = 'smallcountgraph' if smallcount else 'countgraph'
    tablesize = memory / numtables * khmer._buckets_per_byte[sketchtype]
    sketch = allocate(ksize,
                      tablesize,
                      num_tables=numtables,
                      count=count,
                      smallcount=smallcount)
    numreads = 0
    for seqfile in seqfiles:
        message = '- processing "{}"'.format(seqfile)
        kevlar.plog('[kevlar::count]', message)
        parser = khmer.ReadParser(seqfile)
        threads = list()
        for _ in range(numthreads):
            if mask:
                threshold = 1 if consume_masked else maskmaxabund
                kwargs = {
                    'consume_masked': consume_masked,
                    'threshold': threshold
                }
                if numbands:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_banding_with_mask,
                        args=(
                            parser,
                            numbands,
                            band,
                            mask,
                        ),
                        kwargs=kwargs,
                    )
                else:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_with_mask,
                        args=(
                            parser,
                            mask,
                        ),
                        kwargs=kwargs,
                    )
            else:
                if numbands:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_banding,
                        args=(
                            parser,
                            numbands,
                            band,
                        ),
                    )
                else:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile,
                        args=(parser, ),
                    )
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()
        numreads += parser.num_reads

    message = 'Done loading k-mers'
    if numbands:
        message += ' (band {:d}/{:d})'.format(band + 1, numbands)
    fpr = kevlar.sketch.estimate_fpr(sketch)
    message += ';\n    {:d} reads processed'.format(numreads)
    message += ', {:d} distinct k-mers stored'.format(sketch.n_unique_kmers())
    message += ';\n    estimated false positive rate is {:1.3f}'.format(fpr)
    if fpr > maxfpr:
        message += ' (FPR too high, bailing out!!!)'
        message = '[kevlar::count] ' + message
        raise kevlar.sketch.KevlarUnsuitableFPRError(message)

    if outfile:
        extensions = get_extension(count=count, smallcount=smallcount)
        if not outfile.endswith(extensions):
            outfile += extensions[1]
        sketch.save(outfile)
        message += ';\n    saved to "{:s}"'.format(outfile)
    kevlar.plog('[kevlar::count]', message)

    return sketch
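A hedged usage sketch for the function above; the file names, k-mer size, memory budget, thread count, and output path are illustrative values, not defaults from the original code.

# Hedged call into load_sample_seqfile; all argument values are placeholders.
sketch = load_sample_seqfile(
    ["sample_R1.fastq.gz", "sample_R2.fastq.gz"],   # illustrative inputs
    ksize=31,
    memory=1e9,
    numthreads=4,
    outfile="sample.counttable",
)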
Example #15
def main():  # pylint: disable=too-many-locals,too-many-branches
    args = sanitize_help(get_parser()).parse_args()
    graph_type = 'smallcountgraph' if args.small_count else 'countgraph'

    configure_logging(args.quiet)
    report_on_config(args, graph_type)

    check_input_files(args.input_sequence_filename, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, graph_type)
        check_space_for_graph(args.savegraph, graphsize, args.force)
    if (not args.squash_output
            and os.path.exists(args.output_histogram_filename)):
        log_error('ERROR: {output} exists; not squashing.',
                  output=args.output_histogram_filename)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        hist_fp_csv = csv.writer(hist_fp)
        # write headers:
        hist_fp_csv.writerow(
            ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    log_info('making countgraph')
    # In case the user specified a maximum memory usage, use 8/(9+eps) of that
    # for the countgraph and 1/(9+eps) for the tracking nodegraph
    # `eps` is used to account for the memory used by the python interpreter
    countgraph = khmer_args.create_countgraph(args, multiplier=8 / (9. + 0.3))

    log_info('building k-mer tracking graph')
    tracking = khmer_args.create_matching_nodegraph(countgraph)

    log_info('kmer_size: {ksize}', ksize=countgraph.ksize())
    log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes())
    log_info('outputting to {output}', output=args.output_histogram_filename)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 1 -- {input}',
             input=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=countgraph.consume_seqfile_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    log_info('Total number of unique k-mers: {nk}',
             nk=countgraph.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = countgraph.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    log_info('preparing hist from {seqfile}...',
             seqfile=args.input_sequence_filename)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 2 -- {filename}',
             filename=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break

    if args.savegraph is not None:
        log_info('Saving k-mer countgraph to {savegraph}',
                 savegraph=args.savegraph)
        countgraph.save(args.savegraph)

    log_info('wrote to: {output}', output=args.output_histogram_filename)
Example #16
def load_sample_seqfile(seqfiles,
                        ksize,
                        memory,
                        maxfpr=0.2,
                        mask=None,
                        maskmaxabund=1,
                        numbands=None,
                        band=None,
                        outfile=None,
                        numthreads=1,
                        logfile=sys.stderr):
    """
    Compute k-mer abundances for the specified sequence input.

    Expected input is a list of one or more FASTA/FASTQ files corresponding
    to a single sample. A counttable is created and populated with abundances
    of all k-mers observed in the input. If `mask` is provided, only k-mers not
    present in the mask will be loaded.
    """
    message = 'loading from ' + ','.join(seqfiles)
    print('[kevlar::count]    ', message, file=logfile)

    sketch = khmer.Counttable(ksize, memory / 4, 4)
    n, nkmers = 0, 0
    for seqfile in seqfiles:
        parser = khmer.ReadParser(seqfile)
        threads = list()
        for _ in range(numthreads):
            if mask:
                if numbands:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_banding_with_mask,
                        args=(
                            parser,
                            numbands,
                            band,
                            mask,
                        ),
                    )
                else:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_with_mask,
                        args=(
                            parser,
                            mask,
                        ),
                    )
            else:
                if numbands:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile_banding,
                        args=(
                            parser,
                            numbands,
                            band,
                        ),
                    )
                else:
                    thread = threading.Thread(
                        target=sketch.consume_seqfile,
                        args=(parser, ),
                    )
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

    message = 'done loading reads'
    if numbands:
        message += ' (band {:d}/{:d})'.format(band + 1, numbands)
    fpr = kevlar.sketch.estimate_fpr(sketch)
    message += ';\n    {:d} reads processed'.format(parser.num_reads)
    message += ', {:d} distinct k-mers stored'.format(sketch.n_unique_kmers())
    message += ';\n    estimated false positive rate is {:1.3f}'.format(fpr)
    if fpr > maxfpr:
        message += ' (FPR too high, bailing out!!!)'
        message = '[kevlar::count]     ' + message
        raise kevlar.sketch.KevlarUnsuitableFPRError(message)

    if outfile:
        if not outfile.endswith(('.ct', '.counttable')):
            outfile += '.counttable'
        sketch.save(outfile)
        message += ';\n    saved to "{:s}"'.format(outfile)
    print('[kevlar::count]    ', message, file=logfile)

    return sketch

    # Initialize the counters
    # TODO: note, I could be doing a partial dedup here, just to reduce the memory usage...
    counter = Counters()

    def map_func(sequence):
        return counter.process_seq(sequence)

    pool = multiprocessing.Pool(processes=num_threads)

    if verbose:
        print("Start streaming")
        t0 = timeit.default_timer()
    # populate the queue
    fid = khmer.ReadParser(query_file)  # This is faster than screed
    match_tuples = []
    #num_reads_per_core = 100000
    num_reads_per_chunk = num_reads_per_core * num_threads
    to_proc = [record.sequence for record in islice(fid, num_reads_per_chunk)]
    i = 0
    while to_proc:
        i += len(to_proc)
        if verbose:
            print("Read in %d sequences" % i)
        res = pool.map(map_func,
                       to_proc,
                       chunksize=max(
                           1,
                           min(num_reads_per_core,
                               len(to_proc) / num_threads)))
def sequenceToHistograma(nome, sequence):

    ksize = 3
    nkmers = 4**ksize
    tablesize = nkmers + 10

    # Initialize countgraph
    cg = khmer.Countgraph(ksize, tablesize, 1)
    # print('Created a countgraph with', cg.hashsizes(), 'buckets')

    # start loading
    # auxNome = "sequenciaAuxliar.fa"
    # aux = open(auxNome, 'w')
    # aux.write(nome)
    # aux.write(sequence+"\n")
    # aux.close()

    # fp = TemporaryFile('w+t')
    # fp = ff.TemporaryFile(mode='w+t', suffix=".fasta")

    with NamedTemporaryFile(prefix="lucas",
                            suffix=".fasta",
                            delete=False,
                            mode="w+t") as fp:
        fp.write(nome)
        fp.write(sequence)
        fp.seek(0)

    # print(fp.name)

    # print(fp.read())
    # print(fp.read())
    fp.close()
    rparser = khmer.ReadParser(fp.name)
    # fp.close()
    # os.remove(fp.name)
    # os.unlink(fp.name)

    # rparser2 = rparser
    # aux.closes
    # os.remove(auxNome)
    # rparser = khmer.ReadParser(sequence)
    threads = []
    for _ in range(1):
        thread = \
            threading.Thread(
                target=cg.consume_seqfile_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    # print('unique', cg.n_unique_kmers())
    h = Histograma(nome, cg.n_unique_kmers(), nkmers, tablesize, len(sequence))

    abundance_lists = []

    tracking = khmer_args.create_matching_nodegraph(cg)

    def __do_abundance_dist__(read_parser):
        abundances = cg.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    # with NamedTemporaryFile(prefix="lucas", suffix=".fasta", delete=False, mode="w+t") as fp:
    #     fp.write(nome)
    #     fp.write(sequence)
    #     fp.seek(0)

    # print(fp.name)

    rparser2 = khmer.ReadParser(fp.name)
    # fp.close()
    # os.remove(fp.name)
    # # os.unlink(fp.name)

    threads = []

    for _ in range(1):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser2, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == 1, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.")
        print("\tPlease verify that the input files are valid.")
        return 0

    sofar = 0
    line = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and line < h.tablesize:
            continue

        sofar += i
        frac = sofar / float(total)

        #hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
        # print(line, tablesize, [_, i, sofar, round(frac, 3)])
        h.histo[line][0] = _
        h.histo[line][1] = i
        h.histo[line][2] = sofar
        h.histo[line][3] = round(frac, 3)
        line = line + 1
        if sofar == total:
            break

    return h
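A hedged usage sketch for sequenceToHistograma; judging from the temporary-file writes above, 'nome' is expected to already carry the FASTA header line, and both values below are toy data.

# Hedged usage sketch; the header and sequence are toy values.
hist = sequenceToHistograma(">toy_read\n", "ACGTACGTACGTACGTACGT\n")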
Example #19
def main():

    info('load-into-counting.py', ['counting', 'SeqAn'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    check_space(args.input_sequence_filename, args.force)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)

    check_file_writable(base)
    check_file_writable(base + ".info")

    print('Saving k-mer counting table to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' %
          repr(filenames), file=sys.stderr)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print('making k-mer counting table', file=sys.stderr)
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                      args.force)
            print('mid-save', base, file=sys.stderr)
            htable.save(base)
        with open(base + '.info', 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print('Total number of unique k-mers:', n_kmers, file=sys.stderr)
        with open(base + '.info', 'a') as info_fp:
            print('Total number of unique k-mers:', n_kmers, file=info_fp)

    print('saving', base, file=sys.stderr)
    htable.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2)

    with open(base + '.info', 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print("Writing summmary info to", mr_file, file=sys.stderr)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    print('DONE.', file=sys.stderr)
    print('wrote to:', base + '.info', file=sys.stderr)
Example #20
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
Example #21
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    check_space([args.input_sequence_filename], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    if (not args.squash_output
            and os.path.exists(args.output_histogram_filename)):
        print('ERROR: %s exists; not squashing.' %
              args.output_histogram_filename,
              file=sys.stderr)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        if args.csv:
            hist_fp_csv = csv.writer(hist_fp)
            # write headers:
            hist_fp_csv.writerow(
                ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    print('making k-mer counting table', file=sys.stderr)
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables)
    counting_hash.set_use_bigcount(args.bigcount)

    print('building k-mer tracking table', file=sys.stderr)
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print('kmer_size:', counting_hash.ksize(), file=sys.stderr)
    print('k-mer counting table sizes:',
          counting_hash.hashsizes(),
          file=sys.stderr)
    print('outputting to', args.output_histogram_filename, file=sys.stderr)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 1 --',
          args.input_sequence_filename,
          file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers()),
              file=sys.stderr)

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print('preparing hist from %s...' % args.input_sequence_filename,
          file=sys.stderr)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 2 --',
          args.input_sequence_filename,
          file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print(
            "ERROR: abundance distribution is uniformly zero; "
            "nothing to report.",
            file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        if args.csv:
            hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
        else:
            print(_, i, sofar, round(frac, 3), file=hist_fp)

        if sofar == total:
            break

    if args.savetable:
        print('Saving k-mer counting table ', args.savetable, file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        counting_hash.save(args.savetable)

    print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
Example #22
def main():
    parser = build_nodegraph_args("find uniq kmer in query compared to refs")
    parser.add_argument('query',
                        help=('fasta readfile to query against '
                              'hashtable, use "-" if from stdin'))

    parser.add_argument('--x2',
                        default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2',
                        default='4',
                        help='# of table (N) for readfile2')

    parser.add_argument('--bfout', help='output bloom filter of ref')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--shared',
                       dest='output',
                       action='store_const',
                       const='shared',
                       help='output shared kmers')
    group.add_argument('--uniq',
                       dest='output',
                       action='store_const',
                       const='uniq',
                       help='output uniq kmers in query')

    group2 = parser.add_mutually_exclusive_group(required=True)
    group2.add_argument(
        '--ref',
        nargs='+',
        help='fasta sequence file to be loaded in bloom filter')
    group2.add_argument('--load', help='load existing bloom filter')

    parser.set_defaults(output='uniq')
    args = parser.parse_args()
    #print(args, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    output = args.output

    start_time = time.time()
    # load from existing bloom filter
    if args.load:
        print('loading bloom filter from {}..'.format(args.load),
              file=sys.stderr)
        ht = khmer.load_nodetable(args.load)
        k = ht.ksize()
        mes = ('*** incompatible ksize ({}) in {} with parameters K on '
               'command line ({})')
        assert k == K, mes.format(k, args.load, K)
        end_time = time.time()
        secs = end_time - start_time
        mes = 'load bloom filter ({}) took {:.2f} hours..'
        print(mes.format(os.path.basename(args.load), secs / 3600.0),
              file=sys.stderr)

    # create a hashbits data structure
    else:
        refs = args.ref
        print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
        if query == '-' and refs == ['-']:
            print('*** query and ref cannot both be "-" (read from stdin)',
                  file=sys.stderr)
            sys.exit(1)
        ht = khmer.Nodetable(K, HT_SIZE, N_HT)
        end_time = time.time()
        secs = end_time - start_time
        mes = 'initiation of bloom filter took {:.2f} hours..'
        print(mes.format(secs / 3600.0), file=sys.stderr)
        for index, filename in enumerate(refs):
            if index != 0 and index % 100 == 0:
                end_time = time.time()
                secs = end_time - start_time
                mes = '{} refs have been loaded within {:.2f} hours ..'
                print(mes.format(index, secs / 3600.0), file=sys.stderr)
            try:
                ht.consume_seqfile(filename)
            except OSError as e:
                mes = (
                    '*** Skipping due to OSError (machine or system problem):'
                    ' {}\n'
                    '*** Detailed error message:\n'
                    '*** {}')
                print(mes.format(os.path.basename(filename), str(e)),
                      file=sys.stderr)
                continue

    if args.bfout:
        if args.load:
            mes = '*** Bloom filter exists as {}, NOT saving again as {}..'
            print(mes.format(args.load, args.bfout), file=sys.stderr)
        else:
            print('*** Saving bloom filter to {}..'.format(args.bfout),
                  file=sys.stderr)
            ht.save(args.bfout)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs.  Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    # create a hashbits data structure
    ht2 = khmer.Nodetable(K, HT_SIZE2, N_HT2)

    n_unique2 = 0
    n_shared = 0
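    # ht2 is a second bloom filter that tracks query k-mers already seen, so
    # each distinct query k-mer is classified (shared with refs vs. unique to
    # the query) exactly once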

    if output == 'uniq':
        for n, record in enumerate(khmer.ReadParser(query)):
            #for n, record in enumerate(screed.open(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]

                if (not ht2.get(kmer)):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                    else:
                        mes = '>{}__{}  {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                ht2.count(kmer)

    elif output == 'shared':
        for n, record in enumerate(khmer.ReadParser(query)):
            #for n, record in enumerate(screed.open(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]

                if (not ht2.get(kmer)):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                        mes = '>{}__{}  {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                    else:
                        pass

                ht2.count(kmer)

    mes = ('Unique kmer in {} (query):\t{}\n'
           'Shared kmer:\t{}\n'
           'Unique kmer in {}:\t{}\n')

    print(mes.format(os.path.basename(query), n_unique2, n_shared, 'refs',
                     n_unique1),
          file=sys.stderr)
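
# A possible invocation (a sketch with hypothetical file names; the -k/-x/-N
# options are assumed to come from khmer's build_nodegraph_args helper):
#   python find_uniq_kmers.py -k 31 --ref ref1.fa ref2.fa --uniq query.fa \
#       --bfout refs.bf > query_uniq_kmers.fa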
Exemple #23
0
def main():

    info('load-into-counting.py', ['counting'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name, args.force)

    check_space(args.input_sequence_filename, args.force)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)

    print >> sys.stderr, 'Saving k-mer counting table to %s' % base
    print >> sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    filename = None

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
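        # share one ReadParser across several threads so they can consume
        # reads from the same file concurrently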
        threads = []
        print >> sys.stderr, 'consuming input', filename
        for _ in xrange(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print >> sys.stderr, 'mid-save', base
            htable.save(base)
        with open(base + '.info', 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >> info_fp, 'Total number of unique k-mers:', n_kmers

    print >> sys.stderr, 'saving', base
    htable.save(base)

    fp_rate = khmer.calc_expected_collisions(htable)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print >> sys.stderr, "Writing summmary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base),
                    fpr=fp_rate,
                    k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # Change 0.2 only if you really grok it.  HINT: You don't.
    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
        print >> sys.stderr, "for this data set. Increase tablesize/# tables."
        print >> sys.stderr, "**"
        sys.exit(1)

    print >> sys.stderr, 'DONE.'
    print >> sys.stderr, 'wrote to:', base + '.info'
Exemple #24
0
def multi_file_iter_khmer(filenames):
    for filename in filenames:
        for record in khmer.ReadParser(filename):
            yield record
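
# A minimal usage sketch (hypothetical file names, not part of the original
# example): iterate over reads from several files as one continuous stream.
#   for record in multi_file_iter_khmer(['reads_1.fq', 'reads_2.fq']):
#       print(record.name, len(record.sequence))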
Exemple #25
0
def test_read_bundler_single_read():
    infile = utils.get_test_data('single-read.fq')
    records = [r for r in khmer.ReadParser(infile)]
    bundle = khmer.utils.ReadBundle(*records)
    assert bundle.num_reads == 1
    assert bundle.reads[0].sequence == bundle.reads[0].cleaned_seq
Exemple #26
0
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates a CSV file of similarity indicies between the"
        " input file and each of the sketches in the training/reference file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-f',
                        '--force',
                        action="store_true",
                        help="Force creation of new NodeGraph.")
    parser.add_argument('-fp',
                        '--fp_rate',
                        type=restricted_float,
                        help="False positive rate.",
                        default=0.0001)
    parser.add_argument(
        '-ct',
        '--containment_threshold',
        type=restricted_float,
        help="Only return results with containment index above this value",
        default=0.02)
    parser.add_argument(
        '-c',
        '--confidence',
        type=restricted_float,
        help=
        "Desired probability that all results were returned with containment index above threshold [-ct]",
        default=0.95)
    parser.add_argument(
        '-ng',
        '--node_graph',
        help="NodeGraph/bloom filter location. Used if it exists; if not, one "
        "will be created and put in the same directory as the specified "
        "output CSV file.",
        default=None)
    parser.add_argument(
        '-b',
        '--base_name',
        action="store_true",
        help=
        "Flag to indicate that only the base names (not the full path) should be saved in the output CSV file"
    )
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        action="store_true",
        help=
        "Option to only insert query k-mers in bloom filter if they appear anywhere in the training"
        " database. Note that the Jaccard estimates will now be "
        "J(query intersect union_i training_i, training_i) instead of J(query, training_i), "
        "but will use significantly less space.")
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument(
        'training_data',
        help=
        "Training/reference data (HDF5 file created by MakeTrainingDatabase.py)"
    )
    parser.add_argument('out_csv', help='Output CSV file')

    # Parse and check args
    args = parser.parse_args()
    base_name = args.base_name
    training_data = os.path.abspath(args.training_data)
    if not os.path.exists(training_data):
        raise Exception("Training/reference file %s does not exist." %
                        training_data)
    # Let's get the k-mer sizes in the training database
    ksizes = set()
    # Import all the training data
    sketches = MH.import_multiple_from_single_hdf5(training_data)
    # Check for issues with the sketches (can also check if all the kmers make sense (i.e. no '' or non-ACTG characters))
    if sketches[0]._kmers is None:
        raise Exception(
            "For some reason, the k-mers were not saved when the database was created. Try running MakeDNADatabase.py again."
        )
    num_hashes = len(sketches[0]._kmers)
    for i in range(len(sketches)):
        sketch = sketches[i]
        if sketch._kmers is None:
            raise Exception(
                "For some reason, the k-mers were not saved when the database was created. Try running MakeDNADatabase.py again."
            )
        if len(sketch._kmers) != num_hashes:
            raise Exception("Unequal number of hashes for sketch of %s" %
                            sketch.input_file_name)
        ksizes.add(sketch.ksize)
        if len(ksizes) > 1:
            raise Exception(
                "Training/reference data uses different k-mer sizes. Culprit was %s."
                % (sketch.input_file_name))
    # Get the appropriate k-mer size
    ksize = ksizes.pop()
    # Get number of threads to use
    num_threads = args.threads
    # Check and parse the query file
    query_file = os.path.abspath(args.in_file)
    if not os.path.exists(query_file):
        raise Exception("Query file %s does not exist." % query_file)
    # Node graph is stored in the output folder with name <InputFASTQ/A>.NodeGraph.K<k_size>
    if args.node_graph is None:  # If no node graph is specified, create one
        node_graph_out = os.path.join(
            os.path.dirname(os.path.abspath(args.out_csv)),
            os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
        # Don't complain if the default location works
        if not os.path.exists(node_graph_out):
            print("Node graph not provided (via -ng). Creating one at: %s" %
                  node_graph_out)
    elif os.path.exists(
            args.node_graph):  # If one is specified and it exists, use it
        node_graph_out = args.node_graph
    else:  # Otherwise, the specified one doesn't exist
        raise Exception("Provided NodeGraph %s does not exist." %
                        args.node_graph)
    # import and check the intersect nodegraph
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(
            training_data)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. Please re-run MakeDNADatabase.py with the -i flag."
            )
        try:
            intersect_nodegraph = khmer.load_nodegraph(
                intersect_nodegraph_file)
        except OSError:
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file)
        if intersect_nodegraph.ksize() != ksize:
            raise Exception(
                "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                % (intersect_nodegraph_file, intersect_nodegraph.ksize(),
                   ksize))
    results_file = os.path.abspath(args.out_csv)
    force = args.force
    fprate = args.fp_rate
    coverage_threshold = args.containment_threshold  # desired coverage cutoff
    confidence = args.confidence  # desired confidence that you got all the organisms with coverage >= desired coverage

    # Get names of training files for use as rows in returned tabular data
    training_file_names = []
    for i in range(len(sketches)):
        training_file_names.append(sketches[i].input_file_name)

    # Only form the Nodegraph if we need to
    global sample_kmers
    if not os.path.exists(node_graph_out) or force is True:
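        # estimate the number of distinct k-mers with a HyperLogLog counter so
        # the bloom filter can be sized for the requested false positive rate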
        hll = khmer.HLLCounter(0.01, ksize)
        hll.consume_seqfile(query_file)
        full_kmer_count_estimate = hll.estimate_cardinality()
        res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
        if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
            #sample_kmers.consume_seqfile(query_file)
            rparser = khmer.ReadParser(query_file)
            threads = []
            for _ in range(num_threads):
                cur_thrd = threading.Thread(
                    target=sample_kmers.consume_seqfile_with_reads_parser,
                    args=(rparser, ))
                threads.append(cur_thrd)
                cur_thrd.start()
            for thread in threads:
                thread.join()
        else:  # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list
            # (WARNING: this will cause the Jaccard index to be calculated in terms of
            #  J(query intersect hash_list, training) instead of J(query, training))
            # (TODO: fix this after khmer is updated)
            # intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to khmer bug
            # Not technically correct, but needed until khmer is updated
            intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()
            if intersect_nodegraph_kmer_count < full_kmer_count_estimate:  # At max, we have as many k-mers as in the union of the training database (But makes this always return 0)
                res = optimal_size(intersect_nodegraph_kmer_count,
                                   fp_rate=fprate)
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                               res.num_htables)
            else:
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                               res.num_htables)
            for record in screed.open(query_file):
                seq = record.sequence
                for i in range(len(seq) - ksize + 1):
                    kmer = seq[i:i + ksize]
                    if intersect_nodegraph.get(kmer) > 0:
                        sample_kmers.add(kmer)
        # Save the sample_kmers
        sample_kmers.save(node_graph_out)
        true_fprate = khmer.calc_expected_collisions(sample_kmers,
                                                     max_false_pos=0.99)
    else:
        sample_kmers = khmer.load_nodegraph(node_graph_out)
        node_ksize = sample_kmers.ksize()
        if node_ksize != ksize:
            raise Exception(
                "Node graph %s has wrong k-mer size of %d (input was %d). Try --force or change -k."
                % (node_graph_out, node_ksize, ksize))
        true_fprate = khmer.calc_expected_collisions(sample_kmers,
                                                     max_false_pos=0.99)

    #num_sample_kmers = sample_kmers.n_unique_kmers()  # For some reason this only works when creating a new node graph, use the following instead
    num_sample_kmers = sample_kmers.n_occupied()

    # Compute all the indices for all the training data
    pool = Pool(processes=num_threads)
    res = pool.map(
        unwrap_compute_indicies,
        zip(sketches, repeat(num_sample_kmers), repeat(true_fprate)))

    # Gather up the results in a nice form
    intersection_cardinalities = np.zeros(len(sketches))
    containment_indexes = np.zeros(len(sketches))
    jaccard_indexes = np.zeros(len(sketches))
    for i in range(len(res)):
        (intersection_cardinality, containment_index, jaccard_index) = res[i]
        intersection_cardinalities[i] = intersection_cardinality
        containment_indexes[i] = containment_index
        jaccard_indexes[i] = jaccard_index

    d = {
        'intersection': intersection_cardinalities,
        'containment index': containment_indexes,
        'jaccard index': jaccard_indexes
    }
    # Use only the basenames to label the rows (if requested)
    if base_name is True:
        df = pd.DataFrame(d, map(os.path.basename, training_file_names))
    else:
        df = pd.DataFrame(d, training_file_names)

    # Only get the rows above a certain threshold
    if coverage_threshold <= 0:
        est_threshold = 0
    else:
        est_threshold = threshold_calc(num_hashes, coverage_threshold, fprate,
                                       confidence)
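    # keep only sketches whose containment index clears the adjusted threshold
    # (threshold_calc adjusts the requested cutoff for bloom-filter false
    # positives at the chosen confidence level)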
    filtered_results = df[df['containment index'] > est_threshold].sort_values(
        'containment index', ascending=False)
    # Export the results
    filtered_results.to_csv(results_file, index=True, encoding='utf-8')
Exemple #27
0
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames

    for _ in args.input_filenames:
        check_file_status(_, args.force)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable((float(args.n_tables * args.min_tablesize) / 8.),
                              args.force)

    print >> sys.stderr, 'Saving k-mer presence table to %s' % base
    print >> sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print >> sys.stderr, 'We WILL NOT build the tagset.'
    else:
        print >>sys.stderr, 'We WILL build the tagset', \
                            ' (for partitioning/traversal).'

    print >> sys.stderr, 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)
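    # choose whether reads are only consumed or also tagged; the tags are the
    # waypoints used later for partitioning/traversal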

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        print >> sys.stderr, 'consuming input', filename
        for num in xrange(args.threads):
            cur_thread = threading.Thread(target=target_method,
                                          args=(rparser, ))
            threads.append(cur_thread)
            cur_thread.start()

        for thread in threads:
            thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    print >> sys.stderr, 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print >> sys.stderr, 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
    if args.write_fp_rate:
        print >> info_fp, \
            '\nfalse positive rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set. Increase table size/# tables.")
        print >> sys.stderr, "**"
        if not args.force:
            sys.exit(1)

    print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt'
    if not args.no_build_tagset:
        print >> sys.stderr, 'and ' + base + '.tagset'
Exemple #28
0
def main():
    parser = build_construct_args(
        "Output k-mer abundance distribution (single file version).")
    add_threading_args(parser)

    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z',
                        '--no-zero',
                        dest='output_zero',
                        default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s',
                        '--squash',
                        dest='squash_output',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savehash', dest='savehash', default='')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    datafile = args.datafile
    histout = args.histout

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    print 'building tracking ht'
    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    # start loading
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 1 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    z_list = []

    def do_abundance_dist(r):
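        # each worker thread computes an abundance histogram against the shared
        # tracking table so every k-mer is counted only once across threads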
        z = ht.abundance_distribution_with_reads_parser(r, tracking)
        z_list.append(z)

    print 'preparing hist from %s...' % datafile
    rparser = khmer.ReadParser(datafile, n_threads)
    threads = []
    print 'consuming input, round 2 --', datafile
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=do_abundance_dist,
                args=(rparser,)
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    assert len(z_list) == n_threads, len(z_list)
    z = {}
    for zz in z_list:
        for i, count in enumerate(zz):
            z[i] = z.get(i, 0) + count

    total = sum(z.values())

    if 0 == total:
        print >>sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(-1)

    fp = open(histout, 'w')

    sofar = 0
    for n, i in sorted(z.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Exemple #29
0
def main():

    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                          args.force)

    info_filename = base + ".info"
    check_file_writable(base)
    check_file_writable(info_filename)

    log_info('Saving k-mer countgraph to {base}', base=base)
    log_info('Loading kmers from sequences in {filenames}',
             filenames=repr(filenames))

    # clobber the '.info' file now, as we always open in append mode below
    with open(info_filename, 'w') as info_fp:
        print('khmer version:', khmer.__version__, file=info_fp)

    log_info('making countgraph')
    countgraph = khmer_args.create_countgraph(args)
    countgraph.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        log_info('consuming input {input}', input=filename)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=countgraph.consume_seqfile_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            log_info('mid-save {base}', base=base)

            countgraph.save(base)
        with open(info_filename, 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    log_info('Total number of unique k-mers: {nk}', nk=n_kmers)
    with open(info_filename, 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    log_info('saving {base}', base=base)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(info_filename, 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        log_info("Writing summmary info to {mr_file}", mr_file=mr_file)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('DONE.')
    log_info('wrote to: {filename}', filename=info_filename)
Exemple #30
0
def main():
    parser = build_nodegraph_args("find uniq kmer in query compard to refs")
    parser.add_argument('query',
                        help=('fasta readfile to query against '
                              'hashtable, use "-" if from stdin'))
    parser.add_argument('ref',
                        nargs='+',
                        help='fasta sequence file to be loaded in hashtable')
    parser.add_argument('--x2',
                        default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2',
                        default='4',
                        help='# of table (N) for readfile2')

    args = parser.parse_args()
    #print(args, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    refs = args.ref
    print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
    if query == '-' and refs == ['-']:
        print('*** query and ref cannot both be "-" (read from stdin)',
              file=sys.stderr)
        sys.exit(1)
    # create a hashbits data structure
    start_time = time.time()
    ht = khmer.Nodetable(K, HT_SIZE, N_HT)
    end_time = time.time()
    secs = end_time - start_time
    mes = 'initiation of bloom filter took {:.2f} hours..'
    print(mes.format(secs / 3600.0), file=sys.stderr)
    for index, filename in enumerate(refs):
        if index != 0 and index % 100 == 0:
            end_time = time.time()
            secs = end_time - start_time
            mes = '{} refs have been loaded within {:.2f} hours ..'
            print(mes.format(index, secs / 3600.0), file=sys.stderr)
        try:
            ht.consume_seqfile(filename)
        except OSError as e:
            mes = ('*** Skipping due to OSError (machine or system problem):'
                   ' {}\n'
                   '*** Detailed error message:\n'
                   '*** {}')
            print(mes.format(os.path.basename(filename), str(e)),
                  file=sys.stderr)
            continue

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs.  Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    pair = 0
    forward = 0
    reverse = 0
    other = 0
    total_pair = 0
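    # classify each read pair by whether the forward and/or reverse mate shares
    # at least one k-mer with the reference bloom filter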
    for n, is_pair, r1, r2 in broken_paired_reader(
            khmer.ReadParser(query, require_paired=True)):
        #for n, record in enumerate(screed.open(query)):
        total_pair += 1
        share_list = []
        for record in [r1, r2]:
            name, desc = record.name.split(None, 1)
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                print('*** {} is shorter than {}..'.format(record.name, K),
                      file=sys.stderr)
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]
                if ht.get(kmer):
                    share_list.append(1)
                    break
                else:
                    share_list.append(0)

        if share_list == [1, 1]:
            pair += 1
        elif share_list == [1, 0]:
            forward += 1
        elif share_list == [0, 1]:
            reverse += 1
        else:  #[0, 0]
            other += 1
            # do not print
            continue

        mes = ('>{}  {}||uniq_{}\n{}\n' '>{}  {}||uniq_{}\n{}')
        l1 = r1.name.split(None, 1)
        l2 = r2.name.split(None, 1)
        print(
            mes.format(l1[0], l1[1], share_list[0], r1.sequence, l2[0], l2[1],
                       share_list[1], r2.sequence))

    mes = ('Unique kmer in ref:\t{}\n'
           'Total pair:\t{}\n'
           'Both primers uniq:\t{}\n'
           'Pair with forward uniq:\t{}\n'
           'Pair with reverse uniq:\t{}')

    print(mes.format(n_unique1, total_pair, pair, forward, reverse),
          file=sys.stderr)