Exemple #1
0
def main():
    """Estimate the number of unique k-mers per input file and overall.

    For every file in ``args.input_filenames`` a fresh ``khmer.HLLCounter``
    consumes the file, its cardinality estimate is printed to stderr, and
    the counter is merged into a running total.  The merged estimate is
    then printed, and the text produced by ``graphsize_args_report`` is
    emitted to stderr when ``args.diagnostics`` is set.

    When ``args.report`` is truthy it is used as a writable file object:
    one row per input file, one row for the merged total and the
    ``graphsize_args_report`` text are written to it.
    """
    args = sanitize_help(get_parser()).parse_args()

    total_hll = khmer.HLLCounter(args.error_rate, args.ksize)
    report_fp = args.report

    for input_filename in args.input_filenames:
        # Each file gets its own counter so it can be reported separately
        # before being folded into the overall estimate.
        file_hll = khmer.HLLCounter(args.error_rate, args.ksize)
        file_hll.consume_fasta(input_filename,
                               stream_records=args.stream_records)

        cardinality = file_hll.estimate_cardinality()
        print('Estimated number of unique {0}-mers in {1}: {2}'.format(
            args.ksize, input_filename, cardinality),
              file=sys.stderr)

        if report_fp:
            # NOTE(review): per-file rows carry the label '(total)' rather
            # than the file name -- confirm this is the intended format.
            print(cardinality, args.ksize, '(total)', file=report_fp)
            report_fp.flush()
        total_hll.merge(file_hll)

    cardinality = total_hll.estimate_cardinality()
    print('Total estimated number of unique {0}-mers: {1}'.format(
        args.ksize, cardinality),
          file=sys.stderr)

    to_print = graphsize_args_report(cardinality, args.error_rate)
    if args.diagnostics:
        print(to_print, file=sys.stderr)

    if report_fp:
        print(cardinality, args.ksize, 'total', file=report_fp)
        print(to_print, file=report_fp)
        report_fp.flush()
Exemple #2
0
def main():
    """Estimate the number of unique k-mers per input file and overall.

    Each file named in ``args.input_filenames`` is consumed by its own
    ``khmer.HLLCounter`` (via ``consume_seqfile``); the per-file estimate
    is printed to stderr and the counter is merged into a running total.
    Afterwards the merged estimate is printed and the text returned by
    ``graphsize_args_report`` is shown on stderr if ``args.diagnostics``
    is set.

    If ``args.report`` is truthy it is treated as a writable file object
    receiving one row per input file, a final total row and the
    ``graphsize_args_report`` text.
    """
    args = sanitize_help(get_parser()).parse_args()

    total_hll = khmer.HLLCounter(args.error_rate, args.ksize)
    report_fp = args.report

    for input_filename in args.input_filenames:
        # A separate counter per file keeps the per-file estimate
        # independent of files processed earlier.
        file_hll = khmer.HLLCounter(args.error_rate, args.ksize)
        file_hll.consume_seqfile(input_filename,
                                 stream_records=args.stream_records)

        cardinality = file_hll.estimate_cardinality()
        print('Estimated number of unique {0}-mers in {1}: {2}'.format(
            args.ksize, input_filename, cardinality), file=sys.stderr)

        if report_fp:
            # NOTE(review): per-file rows carry the label '(total)' rather
            # than the file name -- confirm this is the intended format.
            print(cardinality, args.ksize, '(total)', file=report_fp)
            report_fp.flush()
        total_hll.merge(file_hll)

    cardinality = total_hll.estimate_cardinality()
    print('Total estimated number of unique {0}-mers: {1}'.format(
        args.ksize, cardinality), file=sys.stderr)

    to_print = graphsize_args_report(cardinality, args.error_rate)
    if args.diagnostics:
        print(to_print, file=sys.stderr)

    if report_fp:
        print(cardinality, args.ksize, 'total', file=report_fp)
        print(to_print, file=report_fp)
        report_fp.flush()
Exemple #3
0
def main():
    """Count unique k-mers in the inputs and write a sizing report.

    Every file in ``args.input_filenames`` is checked, then loaded into a
    single nodegraph by ``args.threads`` worker threads that share one
    ``khmer.ReadParser`` per file.  The unique k-mer count and the value
    of ``khmer.calc_expected_collisions`` are printed to stderr, and the
    text returned by ``graphsize_args_report`` is written to
    ``<first input filename>.optimal_args``.

    Exits with status 1, without creating the report file, when the
    estimated false-positive rate is above 0.15.
    """
    info('optimal_args_nodegraph.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, graphtype='nodegraph')

    filenames = args.input_filenames
    base = filenames[0]
    for filename in filenames:
        check_input_files(filename, False)

    check_space(filenames, False)

    print('Counting kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    htable = khmer.new_nodegraph(args.ksize, args.max_tablesize, args.n_tables)
    target_method = htable.consume_fasta_with_reads_parser

    for filename in filenames:
        rparser = khmer.ReadParser(filename)
        print('consuming input', filename, file=sys.stderr)
        # range() instead of xrange() so this runs on Python 3 as well.
        threads = [threading.Thread(target=target_method, args=(rparser,))
                   for _ in range(args.threads)]
        for thread in threads:
            thread.start()
        # Wait for this file to be fully consumed before the next one.
        for thread in threads:
            thread.join()

    unique_kmers = htable.n_unique_kmers()
    print('Total number of unique k-mers: {0}'.format(unique_kmers),
          file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print("**", file=sys.stderr)
        print(
            "** ERROR: the graph structure is too small for this data set. "
            "Increase table size/# tables.",
            file=sys.stderr)
        print("**", file=sys.stderr)
        sys.exit(1)

    to_print = graphsize_args_report(unique_kmers, fp_rate)

    # Open the report only once there is something to write, and close it
    # deterministically; a failed run no longer leaves an empty file behind.
    report_path = base + '.optimal_args'
    with open(report_path, 'w') as info_optimal:
        print(to_print, file=info_optimal)

    print('optimal arguments were written to',
          report_path,
          file=sys.stderr)
def main():
    """Count unique k-mers in the inputs and write a sizing report.

    Validates each file in ``args.input_filenames``, loads all of them
    into one nodegraph (``args.threads`` threads per file, all reading
    from the same ``khmer.ReadParser``), reports the unique k-mer count
    and the result of ``khmer.calc_expected_collisions`` on stderr, and
    saves the ``graphsize_args_report`` text to
    ``<first input filename>.optimal_args``.

    Exits with status 1, without creating the report file, when the
    estimated false-positive rate is above 0.15.
    """
    info('optimal_args_nodegraph.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, graphtype='nodegraph')

    filenames = args.input_filenames
    base = filenames[0]
    for filename in filenames:
        check_input_files(filename, False)

    check_space(filenames, False)

    print('Counting kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    htable = khmer.new_nodegraph(args.ksize, args.max_tablesize, args.n_tables)
    target_method = htable.consume_fasta_with_reads_parser

    for filename in filenames:
        rparser = khmer.ReadParser(filename)
        print('consuming input', filename, file=sys.stderr)
        # range() instead of xrange() so this runs on Python 3 as well.
        threads = [
            threading.Thread(target=target_method, args=(rparser,))
            for _ in range(args.threads)
        ]
        for thread in threads:
            thread.start()
        # Block until every worker has finished with this file.
        for thread in threads:
            thread.join()

    unique_kmers = htable.n_unique_kmers()
    print('Total number of unique k-mers: {0}'.format(unique_kmers),
          file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print("**", file=sys.stderr)
        print("** ERROR: the graph structure is too small for this data set. "
              "Increase table size/# tables.", file=sys.stderr)
        print("**", file=sys.stderr)
        sys.exit(1)

    to_print = graphsize_args_report(unique_kmers, fp_rate)

    # The report is opened only after the fp-rate check passed and is
    # closed by the context manager, so no handle leaks and a failed run
    # does not leave an empty file behind.
    report_path = base + '.optimal_args'
    with open(report_path, 'w') as info_optimal:
        print(to_print, file=info_optimal)

    print('optimal arguments were written to', report_path,
          file=sys.stderr)
Exemple #5
0
def main():
    """Estimate the number of unique k-mers per input file and overall.

    Records of each file in ``args.input_filenames`` are read with
    ``screed.open``; every sequence is upper-cased, has ``N`` replaced by
    ``A`` and is fed to a per-file ``khmer.HLLCounter``.  When
    ``args.stream_out`` is set, each record is also passed to
    ``write_record`` with ``sys.stdout``.  Per-file and merged estimates
    are printed to stderr, followed by the ``graphsize_args_report`` text
    if ``args.diagnostics`` is set.

    If ``args.report`` is truthy it is treated as a writable file object
    receiving one row per input file, a final total row and the
    ``graphsize_args_report`` text.
    """
    info('unique-kmers.py', ['SeqAn', 'hll'])
    args = get_parser().parse_args()

    total_hll = khmer.HLLCounter(args.error_rate, args.ksize)
    report_fp = args.report

    for input_filename in args.input_filenames:
        file_hll = khmer.HLLCounter(args.error_rate, args.ksize)
        for record in screed.open(input_filename):
            # Normalise case and substitute N with A before counting.
            seq = record.sequence.upper().replace('N', 'A')
            file_hll.consume_string(seq)
            if args.stream_out:
                # The record itself (not the normalised sequence) is echoed.
                write_record(record, sys.stdout)

        cardinality = file_hll.estimate_cardinality()
        print('Estimated number of unique {0}-mers in {1}: {2}'.format(
              args.ksize, input_filename, cardinality),
              file=sys.stderr)

        if report_fp:
            # NOTE(review): per-file rows carry the label '(total)' rather
            # than the file name -- confirm this is the intended format.
            print(cardinality, args.ksize, '(total)', file=report_fp)
            report_fp.flush()
        total_hll.merge(file_hll)

    cardinality = total_hll.estimate_cardinality()
    print('Total estimated number of unique {0}-mers: {1}'.format(
          args.ksize, cardinality),
          file=sys.stderr)

    to_print = graphsize_args_report(cardinality, args.error_rate)
    if args.diagnostics:
        print(to_print, file=sys.stderr)

    if report_fp:
        print(cardinality, args.ksize, 'total', file=report_fp)
        print(to_print, file=report_fp)
        report_fp.flush()
Exemple #6
0
def test_output_gen():
    """Smoke test: graphsize_args_report runs without raising."""
    # The return value is deliberately ignored; only a clean call is checked.
    report_args = (99, 0.00701925498897)
    graphsize_args_report(*report_args)