Exemple #1
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    total_hll = khmer.HLLCounter(args.error_rate, args.ksize)

    report_fp = args.report
    input_filename = None
    for _, input_filename in enumerate(args.input_filenames):
        hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)
        hllcpp.consume_seqfile(input_filename,
                               stream_records=args.stream_records)

        cardinality = hllcpp.estimate_cardinality()
        print('Estimated number of unique {0}-mers in {1}: {2}'.format(
            args.ksize, input_filename, cardinality), file=sys.stderr)

        if report_fp:
            print(cardinality, args.ksize, '(total)', file=report_fp)
            report_fp.flush()
        total_hll.merge(hllcpp)

    cardinality = total_hll.estimate_cardinality()
    print('Total estimated number of unique {0}-mers: {1}'.format(
        args.ksize, cardinality), file=sys.stderr)

    to_print = graphsize_args_report(cardinality, args.error_rate)
    if args.diagnostics:
        print(to_print, file=sys.stderr)

    if report_fp:
        print(cardinality, args.ksize, 'total', file=report_fp)
        print(to_print, file=report_fp)
        report_fp.flush()
Exemple #2
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]),
          file=sys.stderr)

    ksize = args.ksize
    nodegraph = khmer.Nodegraph(ksize, 1, 1)

    for _ in pmap_files:
        check_input_files(_, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    print('saving merged to', output_file, file=sys.stderr)
    nodegraph.save_partitionmap(output_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def main():
    info('estimate_optimal_hash.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()
    N = args.N
    if args.M:
        M = args.M
        result = optimal_size(N, M=M)
        print("number of estimated distinct k-mers:  ", N, file=sys.stderr)
        print("size of memory available to use:      ", M, file=sys.stderr)
        print("optimal number of hash tables:        ", result.num_htables,
              file=sys.stderr)
        print("optimal size of hash tables:          ", result.htable_size,
              file=sys.stderr)
        print("estimated false positive rate:        ", result.fp_rate,
              file=sys.stderr)
        print("estimated usage of memory:            ", result.mem_use,
              file=sys.stderr)

    elif args.f:
        f = args.f
        result = optimal_size(N, f=f)
        print("number of estimated distinct k-mers:  ", N, file=sys.stderr)
        print("desired maximum false positive rate:  ", f, file=sys.stderr)
        print("optimal number of hash tables:        ", result.num_htables,
              file=sys.stderr)
        print("optimal size of hash tables:          ", result.htable_size,
              file=sys.stderr)
        print("estimated false positive rate:        ", result.fp_rate,
              file=sys.stderr)
        print("estimated usage of memory:            ", result.mem_use,
              file=sys.stderr)
        
    else:
        get_parser().error('No action requested, add -M (size of memory available to use) or -f (desired maximum false posotive rate)')
Exemple #4
0
def main():
    """Main function - run when executed as a script."""
    args = sanitize_help(get_parser()).parse_args()

    statistics = []

    for filename in args.filenames:
        try:
            bps, seqs = analyze_file(filename)
        except (IOError, OSError, EOFError) as exc:
            print('ERROR in opening %s:' % filename, file=sys.stderr)
            print('     ', str(exc), file=sys.stderr)
            continue

        if seqs:
            statistics.append((bps, seqs, filename))
            avg = bps / float(seqs)
            msg = '%d bps / %d seqs; %.1f average length -- %s' % (
                bps, seqs, avg, filename)
            print('... found', msg, file=sys.stderr)
        else:
            print('No sequences found in %s' % filename, file=sys.stderr)

    if statistics:
        if args.csv:
            formatter = CsvFormatter(args.outfp)
        else:
            formatter = StdFormatter(args.outfp)
        with StatisticsOutput(formatter) as out:
            for stat in statistics:
                out.append(*stat)
    else:
        print('No sequences found in %d files' % len(args.filenames),
              file=args.outfp)
Exemple #5
0
def main():
    info("count-median.py", ["diginorm"])
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print("loading k-mer countgraph from", htfile, file=sys.stderr)
    countgraph = load_countgraph(htfile)
    ksize = countgraph.ksize()
    print("writing to", output.name, file=sys.stderr)

    output = csv.writer(output)
    # write headers:
    output.writerow(["name", "median", "average", "stddev", "seqlen"])

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if "N" in seq:
            seq = seq.replace("N", "A")

        if ksize <= len(seq):
            medn, ave, stdev = countgraph.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            output.writerow([record.name, medn, ave, stdev, len(seq)])
def main():
    args = sanitize_help(get_parser()).parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    nodegraph = Nodegraph(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_input_files(partitionmap_file, args.force)
    for _ in filenames:
        check_input_files(_, args.force)

    check_space(filenames, args.force)

    print('loading partition map from:', partitionmap_file, file=sys.stderr)
    nodegraph.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
Exemple #7
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    print('fastq from ', args.input_sequence, file=sys.stderr)
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    n_count = 0
    for n, record in enumerate(screed.open(args.input_sequence)):
        if n % 10000 == 0:
            print('...', n, file=sys.stderr)

        sequence = record['sequence']

        if 'N' in sequence:
            if not args.n_keep:
                n_count += 1
                continue

        del record['quality']
        write_record(record, outfp)

    print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr)

    if not args.n_keep:
        print(str(n_count) + ' lines dropped.', file=sys.stderr)

    else:
        print('No lines dropped from file.', file=sys.stderr)

    print('Wrote output to',
          describe_file_handle(args.output),
          file=sys.stderr)
Exemple #8
0
def main():
    info('merge-partitions.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    ksize = args.ksize
    nodegraph = khmer.Nodegraph(ksize, 1, 1)

    for _ in pmap_files:
        check_input_files(_, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    print('saving merged to', output_file, file=sys.stderr)
    nodegraph.save_partitionmap(output_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
Exemple #9
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer countgraph from', htfile, file=sys.stderr)
    countgraph = load_countgraph(htfile)
    ksize = countgraph.ksize()
    print('writing to', output.name, file=sys.stderr)

    output = csv.writer(output)
    # write headers:
    output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'A')

        if ksize <= len(seq):
            medn, ave, stdev = countgraph.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            output.writerow([record.name, medn, ave, stdev, len(seq)])
Exemple #10
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    total_hll = khmer.HLLCounter(args.error_rate, args.ksize)

    report_fp = args.report
    input_filename = None
    for _, input_filename in enumerate(args.input_filenames):
        hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)
        hllcpp.consume_fasta(input_filename,
                             stream_records=args.stream_records)

        cardinality = hllcpp.estimate_cardinality()
        print('Estimated number of unique {0}-mers in {1}: {2}'.format(
            args.ksize, input_filename, cardinality),
              file=sys.stderr)

        if report_fp:
            print(cardinality, args.ksize, '(total)', file=report_fp)
            report_fp.flush()
        total_hll.merge(hllcpp)

    cardinality = total_hll.estimate_cardinality()
    print('Total estimated number of unique {0}-mers: {1}'.format(
        args.ksize, cardinality),
          file=sys.stderr)

    to_print = graphsize_args_report(cardinality, args.error_rate)
    if args.diagnostics:
        print(to_print, file=sys.stderr)

    if report_fp:
        print(cardinality, args.ksize, 'total', file=report_fp)
        print(to_print, file=report_fp)
        report_fp.flush()
Exemple #11
0
def main():
    info('annotate-partitions.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    nodegraph = Nodegraph(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_input_files(partitionmap_file, args.force)
    for _ in filenames:
        check_input_files(_, args.force)

    check_space(filenames, args.force)

    print('loading partition map from:', partitionmap_file, file=sys.stderr)
    nodegraph.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (
            part_count, infile), file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
Exemple #12
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    print('fastq from ', args.input_sequence, file=sys.stderr)
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    n_count = 0
    for n, record in enumerate(screed.open(args.input_sequence)):
        if n % 10000 == 0:
            print('...', n, file=sys.stderr)

        sequence = record['sequence']

        if 'N' in sequence:
            if not args.n_keep:
                n_count += 1
                continue

        del record['quality']
        write_record(record, outfp)

    print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr)

    if not args.n_keep:
        print(str(n_count) + ' lines dropped.', file=sys.stderr)

    else:
        print('No lines dropped from file.', file=sys.stderr)

    print('Wrote output to', describe_file_handle(args.output),
          file=sys.stderr)
Exemple #13
0
def main():
    info('filter-abund.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.input_graph, args.force)
    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading countgraph:', args.input_graph,
          file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Exemple #14
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer countgraph from', htfile, file=sys.stderr)
    countgraph = load_countgraph(htfile)
    ksize = countgraph.ksize()
    print('writing to', output.name, file=sys.stderr)

    output = csv.writer(output)
    # write headers:
    output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'A')

        if ksize <= len(seq):
            medn, ave, stdev = countgraph.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            output.writerow([record.name, medn, ave, stdev, len(seq)])
def main():
    args = sanitize_help(get_parser()).parse_args()
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    for filename in args.input_filenames:
        for record in screed.open(filename):
            if len(record['sequence']) >= args.length:
                write_record(record, outfp)
    print('wrote to: ' + args.output.name, file=sys.stderr)
Exemple #16
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    for filename in args.input_filenames:
        for record in screed.open(filename):
            if len(record['sequence']) >= args.length:
                write_record(record, outfp)
    print('wrote to: ' + args.output.name, file=sys.stderr)
Exemple #17
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=graph.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info("Total number of unique k-mers: {nk}", nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile
    outfp = open(outfile, "wb")
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile), min_length=graph.ksize(), force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff, args.variable_coverage, args.normalize_to)
        if trimmed_record:
            print((trimmed_record,))
            write_record(trimmed_record, outfp)

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}", graph=args.savegraph)
        graph.save(args.savegraph)
Exemple #18
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
def main():
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    s1_file = args.left
    s2_file = args.right

    fail = False

    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    counter = 0
    screed_iter_1 = screed.open(s1_file)
    screed_iter_2 = screed.open(s2_file)
    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        name2 = read2.name

        if not args.no_reformat:
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'

            read1.name = name1
            read2.name = name2

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
Exemple #20
0
def main():

    info('make-initial-stoptags.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    print('loading nodegraph %s.pt' % graphbase, file=sys.stderr)
    nodegraph = khmer.load_nodegraph(graphbase)

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    nodegraph.load_tagset(graphbase + '.tagset')

    counting = khmer_args.create_countgraph(args)

    # divide up into SUBSET_SIZE fragments
    divvy = nodegraph.divide_tags_into_subsets(args.subset_size)
    divvy = list(divvy)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
    subset = nodegraph.do_subset_partition(start, end)

    # now, repartition...
    print('repartitioning to find HCKs.', file=sys.stderr)
    nodegraph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

    print('saving stop tags', file=sys.stderr)
    nodegraph.save_stop_tags(graphbase + '.stoptags')
    print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
Exemple #21
0
def main():
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    s1_file = args.left
    s2_file = args.right

    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    counter = 0
    screed_iter_1 = screed.open(s1_file)
    screed_iter_2 = screed.open(s2_file)
    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."),
                  file=sys.stderr)
            sys.exit(1)

        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        name2 = read2.name

        if not args.no_reformat:
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'

            read1.name = name1
            read2.name = name2

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name),
                      file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
Exemple #22
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    print('loading nodegraph %s.pt' % graphbase, file=sys.stderr)
    nodegraph = Nodegraph.load(graphbase)

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    nodegraph.load_tagset(graphbase + '.tagset')

    counting = khmer_args.create_countgraph(args)

    # divide up into SUBSET_SIZE fragments
    divvy = nodegraph.divide_tags_into_subsets(args.subset_size)
    divvy = list(divvy)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
    subset = nodegraph.do_subset_partition(start, end)

    # now, repartition...
    print('repartitioning to find HCKs.', file=sys.stderr)
    nodegraph.repartition_largest_partition(counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD,
                                            subs=subset)

    print('saving stop tags', file=sys.stderr)
    nodegraph.save_stop_tags(graphbase + '.stoptags')
    print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
Exemple #23
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
Exemple #24
0
def main():
    info('estimate_optimal_hash.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()
    N = args.N
    if args.M:
        M = args.M
        result = optimal_size(N, M=M)
        print("number of estimated distinct k-mers:  ", N, file=sys.stderr)
        print("size of memory available to use:      ", M, file=sys.stderr)
        print("optimal number of hash tables:        ",
              result.num_htables,
              file=sys.stderr)
        print("optimal size of hash tables:          ",
              result.htable_size,
              file=sys.stderr)
        print("estimated false positive rate:        ",
              result.fp_rate,
              file=sys.stderr)
        print("estimated usage of memory:            ",
              result.mem_use,
              file=sys.stderr)

    elif args.f:
        f = args.f
        result = optimal_size(N, f=f)
        print("number of estimated distinct k-mers:  ", N, file=sys.stderr)
        print("desired maximum false positive rate:  ", f, file=sys.stderr)
        print("optimal number of hash tables:        ",
              result.num_htables,
              file=sys.stderr)
        print("optimal size of hash tables:          ",
              result.htable_size,
              file=sys.stderr)
        print("estimated false positive rate:        ",
              result.fp_rate,
              file=sys.stderr)
        print("estimated usage of memory:            ",
              result.mem_use,
              file=sys.stderr)

    else:
        get_parser().error(
            'No action requested, add -M (size of memory available to use) or -f (desired maximum false posotive rate)'
        )
Exemple #25
0
def main():
    """Main function - run when executed as a script."""
    info('readstats.py')
    args = sanitize_help(get_parser()).parse_args()

    total_bp = 0
    total_seqs = 0

    statistics = []

    for filename in args.filenames:
        try:
            bps, seqs = analyze_file(filename)
        except (IOError, OSError, EOFError) as exc:
            print('ERROR in opening %s:' % filename, file=sys.stderr)
            print('     ', str(exc), file=sys.stderr)
            continue

        if seqs:
            statistics.append((bps, seqs, filename))
            avg = bps / float(seqs)
            msg = '%d bps / %d seqs; %.1f average length -- %s' % (bps,
                                                                   seqs,
                                                                   avg,
                                                                   filename)
            print('... found', msg, file=sys.stderr)
        else:
            print('No sequences found in %s' % filename, file=sys.stderr)

    if statistics:
        if args.csv:
            formatter = CsvFormatter(args.outfp)
        else:
            formatter = StdFormatter(args.outfp)
        with StatisticsOutput(formatter) as out:
            for stat in statistics:
                out.append(*stat)
    else:
        print('No sequences found in %d files' %
              len(args.filenames), file=args.outfp)
def main():
    info('filter-stoptags.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Exemple #27
0
def main():
    info('filter-stoptags.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Exemple #28
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
Exemple #29
0
def main():
    info('partition-graph.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()
    basename = args.basename

    filenames = [basename, basename + '.tagset']
    for _ in filenames:
        check_input_files(_, args.force)

    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    if args.stoptags:
        print('stoptag file:', args.stoptags, file=sys.stderr)
    print('--', file=sys.stderr)

    print('loading nodegraph %s' % basename, file=sys.stderr)
    nodegraph = load_nodegraph(basename)
    nodegraph.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps:',
              ' stop_big_traversals is true.', file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.', file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print('starting %d threads' % n_threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker, args=(worker_q, basename,
                                                         stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' %
          (basename,), file=sys.stderr)
Exemple #30
0
def main():

    info('collect-reads.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, False)

    check_space(args.input_sequence_filename, False)
    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                              False)

    print('Saving k-mer countgraph to %s' % base)
    print('Loading sequences from %s' % repr(filenames))
    if args.output:
        print('Outputting sequences to', args.output)

    print('making countgraph', file=sys.stderr)
    htable = khmer_args.create_countgraph(args)
    htable.set_use_bigcount(args.bigcount)

    total_coverage = 0.
    n = 0

    for index, filename in enumerate(filenames):
        for record in screed.open(filename):
            seq = record.sequence.upper()
            if 'N' in seq:
                seq = seq.replace('N', 'A')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            total_coverage += med
            n += 1

            if total_coverage / float(n) > args.coverage:
                print('reached target average coverage:', \
                      total_coverage / float(n))
                break

            htable.consume(seq)
            if args.output:
                args.output.write(output_single(record))

            if n % 100000 == 0:
                print('...', index, filename, n, total_coverage / float(n))

        if total_coverage / float(n) > args.coverage:
            break

    print('Collected %d reads' % (n,))

    if args.report_total_kmers:
        print('Total number of k-mers: {0}'.format(
            htable.n_occupied()), file=sys.stderr)

    print('saving', base)
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filenames[-1])

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable, False,
                                             max_false_pos=.2)
    print('fp rate estimated to be %1.3f' % fp_rate)
    print('fp rate estimated to be %1.3f' % fp_rate, file=info_fp)

    print('DONE.')
def main():
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_first and args.output_second):
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
    if args.output_second:
        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)

    # put orphaned reads here, if -0!
    if args.output_orphaned:
        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
        out0 = describe_file_handle(args.output_orphaned)

    counter1 = 0
    counter2 = 0
    counter3 = 0
    index = None

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(ReadParser(infile),
                                       require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            elif args.output_orphaned:
                write_record(record1, fp_out0)
                counter3 += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name), file=sys.stderr)
        sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (counter1 + counter2, counter1, counter2, counter3), file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)
Exemple #32
0
def main():
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        "`reservoir sampling\n"
        "<http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.",
        "reservoir sampling algorithm. "
        "http://en.wikipedia.org/wiki/Reservoir_sampling")
    args = sanitize_help(parser).parse_args()

    for name in args.filenames:
        check_input_files(name, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be

    if args.output_file:
        output_filename = args.output_file.name
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                print(
                    "NOTE: This can be overridden using the --force"
                    " argument",
                    file=sys.stderr)
                sys.exit(1)
    else:
        filename = args.filenames[0]
        if filename in ('/dev/stdin', '-'):
            print(
                "Accepting input from stdin; output filename must "
                "be provided with '-o'.",
                file=sys.stderr)
            sys.exit(1)
        output_filename = os.path.basename(filename) + '.subset'

    filename = args.filenames[0]
    if filename in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        filename = '-'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads,
              file=sys.stderr)
        print('Subsampled reads will be placed in %s' % output_filename,
              file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,' %
              (args.num_reads, num_samples),
              ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N' % output_filename,
              file=sys.stderr)
        print('', file=sys.stderr)

    reads = []
    for _ in range(num_samples):
        reads.append([])

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)

        for count, (_, _, rcrd1, rcrd2) in enumerate(
                broken_paired_reader(ReadParser(filename),
                                     force_single=args.force_single)):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' % args.max_reads,
                          '(see -M); exiting',
                          file=sys.stderr)
                    break

            # collect first N reads
            if count < args.num_reads:
                for sample in range(num_samples):
                    reads[sample].append((rcrd1, rcrd2))
            else:
                for sample in range(num_samples):
                    assert len(reads[sample]) <= count

                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling

                for n in range(num_samples):
                    guess = random.randint(1, count)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = (rcrd1, rcrd2)

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' % (len(reads[0]), output_filename),
              file=sys.stderr)

        output_file = args.output_file
        if not output_file:
            output_file = open(output_filename, 'wb')

        output_file = get_file_writer(output_file, args.gzip, args.bzip)

        for records in reads[0]:
            write_record(records[0], output_file)
            if records[1] is not None:
                write_record(records[1], output_file)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' % (len(reads[n]), n_filename),
                  file=sys.stderr)
            output_file = get_file_writer(open(n_filename, 'wb'), args.gzip,
                                          args.bzip)
            for records in reads[n]:
                write_record(records[0], output_file)
                if records[1] is not None:
                    write_record(records[1], output_file)
Exemple #33
0
def main():
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        check_space_for_graph(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print >>sys.stderr, 'loading k-mer countgraph from', args.loadgraph
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print >>sys.stderr, 'making k-mer countgraph'
        ct = khmer.new_countgraph(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
                        'use -T to change location' % tempdir

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
            % (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print >>sys.stderr, ('second pass: looking at sequences kept aside '
                             'in %s') % pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print >>sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,)
    print >>sys.stderr, 'wrote %d reads, %d bp' % (written_reads, written_bp,)
    print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
        (save_pass2_total, n_passes)
    print >>sys.stderr, 'removed %d reads and corrected %d reads (%.2f%%)' % \
        (n_reads - written_reads, corrected_reads, percent_reads_corrected)
    print >>sys.stderr, 'removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
            (n_reads - skipped_n, percent_reads_hicov)
        print >>sys.stderr, ('skipped %d reads/%d bases because of low'
                             'coverage') % (skipped_n, skipped_bp)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    print >>sys.stderr, 'output in *.corr'

    if args.savegraph:
        print >>sys.stderr, "Saving k-mer countgraph to", args.savegraph
        ct.save(args.savegraph)
Exemple #34
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = graph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
Exemple #35
0
def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
Exemple #36
0
def main():

    info('load-into-counting.py', ['counting', 'SeqAn'])

    args = sanitize_help(get_parser()).parse_args()
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                          args.force)

    check_file_writable(base)
    check_file_writable(base + ".info")

    print('Saving k-mer countgraph to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print('making countgraph', file=sys.stderr)
    countgraph = khmer_args.create_countgraph(args)
    countgraph.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=countgraph.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            print('mid-save', base, file=sys.stderr)

            countgraph.save(base)
        with open(base + '.info', 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    print('Total number of unique k-mers:', n_kmers, file=sys.stderr)
    with open(base + '.info', 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    print('saving', base, file=sys.stderr)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(base + '.info', 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print("Writing summmary info to", mr_file, file=sys.stderr)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    print('DONE.', file=sys.stderr)
    print('wrote to:', base + '.info', file=sys.stderr)
Exemple #37
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            print((trimmed_record,))
            write_record(trimmed_record, outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
Exemple #38
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = [args.input_count_graph_filename, args.input_sequence_filename]
    for infile in infiles:
        check_input_files(infile, False)

    log_info('Loading counting graph from {graph}',
             graph=args.input_count_graph_filename)
    countgraph = khmer.load_countgraph(args.input_count_graph_filename)

    if not countgraph.get_use_bigcount() and args.bigcount:
        log_warn("WARNING: The loaded graph has bigcount DISABLED while "
                 "bigcount reporting is ENABLED--counts higher than 255 will "
                 "not be reported.")

    countgraph.set_use_bigcount(args.bigcount)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    log_info('K: {ksize}', ksize=kmer_size)
    log_info('outputting to {output}', output=args.output_histogram_filename)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        pass
    elif os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            log_error('ERROR: {output} exists; not squashing.',
                      output=args.output_histogram_filename)
            sys.exit(1)

        log_info('** squashing existing file {output}',
                 output=args.output_histogram_filename)

    log_info('preparing hist...')
    abundances = countgraph.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        countgraph_fp = sys.stdout
    else:
        countgraph_fp = open(args.output_histogram_filename, 'w')
    countgraph_fp_csv = csv.writer(countgraph_fp)
    # write headers:
    countgraph_fp_csv.writerow(
        ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break
Exemple #39
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads,
                                        verbose=not args.quiet)
        tsp.start(verbose_loader(infile), outfp)

        log_info('output in {outfile}', outfile=outfile)
Exemple #40
0
def main():

    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('load-into-counting.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                          args.force)

    info_filename = base + ".info"
    check_file_writable(base)
    check_file_writable(info_filename)

    log_info('Saving k-mer countgraph to {base}', base=base)
    log_info('Loading kmers from sequences in {filenames}',
             filenames=repr(filenames))

    # clobber the '.info' file now, as we always open in append mode below
    with open(info_filename, 'w') as info_fp:
        print('khmer version:', khmer.__version__, file=info_fp)

    log_info('making countgraph')
    countgraph = khmer_args.create_countgraph(args)
    countgraph.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        log_info('consuming input {input}', input=filename)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=countgraph.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            log_info('mid-save {base}', base=base)

            countgraph.save(base)
        with open(info_filename, 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    log_info('Total number of unique k-mers: {nk}', nk=n_kmers)
    with open(info_filename, 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    log_info('saving {base}', base=base)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(info_filename, 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        log_info("Writing summmary info to {mr_file}", mr_file=mr_file)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('DONE.')
    log_info('wrote to: {filename}', filename=info_filename)
Exemple #41
0
def main():
    info('partition-graph.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()
    basename = args.basename

    filenames = [basename, basename + '.tagset']
    for _ in filenames:
        check_input_files(_, args.force)

    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    if args.stoptags:
        print('stoptag file:', args.stoptags, file=sys.stderr)
    print('--', file=sys.stderr)

    print('loading nodegraph %s' % basename, file=sys.stderr)
    nodegraph = load_nodegraph(basename)
    nodegraph.load_tagset(basename + '.tagset')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps:',
              ' stop_big_traversals is true.',
              file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.',
              file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))

    n_threads = args.threads
    if n_subsets < n_threads:
        n_threads = n_subsets

    # start threads!
    print('starting %d threads' % n_threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(n_threads):
        cur_thrd = threading.Thread(target=worker,
                                    args=(worker_q, basename,
                                          stop_big_traversals))
        threads.append(cur_thrd)
        cur_thrd.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' % (basename, ),
          file=sys.stderr)
Exemple #42
0
def main():
    info('split-paired-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        if not (args.output_first and args.output_second):
            print(
                "Accepting input from stdin; "
                "output filenames must be provided.",
                file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
    if args.output_second:
        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)

    # put orphaned reads here, if -0!
    if args.output_orphaned:
        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
        out0 = describe_file_handle(args.output_orphaned)

    counter1 = 0
    counter2 = 0
    counter3 = 0
    index = None

    screed_iter = screed.open(infile)

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(screed_iter,
                                       require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            elif args.output_orphaned:
                write_record(record1, fp_out0)
                counter3 += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name),
              file=sys.stderr)
        sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (counter1 + counter2, counter1, counter2, counter3),
          file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)
Exemple #43
0
def main():  # pylint: disable=too-many-locals,too-many-statements
    args = sanitize_help(get_parser()).parse_args()

    report_on_config(args, graphtype='nodegraph')

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print('Saving k-mer nodegraph to %s' % args.graphbase, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(args.input_filenames),
          file=sys.stderr)
    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    print('--', file=sys.stderr)

    # load-graph.py

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for _, filename in enumerate(args.input_filenames):
        print('consuming input', filename, file=sys.stderr)
        nodegraph.consume_seqfile_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps: ',
              'stop_big_traversals is true.',
              file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.',
              file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    open('%s.info' % args.graphbase,
         'w').write('%d subsets total\n' % (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print('starting %d threads' % args.threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' % (args.graphbase, ),
          file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]),
          file=sys.stderr)

    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if not args.keep_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
Exemple #44
0
def main():
    info('abundance-dist.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()

    infiles = [args.input_count_graph_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_input_files(infile, False)

    print('Counting graph from', args.input_count_graph_filename,
          file=sys.stderr)
    countgraph = khmer.load_countgraph(
        args.input_count_graph_filename)

    if not countgraph.get_use_bigcount() and args.bigcount:
        print("WARNING: The loaded graph has bigcount DISABLED while bigcount"
              " reporting is ENABLED--counts higher than 255 will not be "
              "reported.",
              file=sys.stderr)

    countgraph.set_use_bigcount(args.bigcount)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size, file=sys.stderr)
    print('outputting to', args.output_histogram_filename, file=sys.stderr)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        pass
    elif os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename,
                  file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' %
              args.output_histogram_filename, file=sys.stderr)

    print('preparing hist...', file=sys.stderr)
    abundances = countgraph.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        countgraph_fp = sys.stdout
    else:
        countgraph_fp = open(args.output_histogram_filename, 'w')
    countgraph_fp_csv = csv.writer(countgraph_fp)
    # write headers:
    countgraph_fp_csv.writerow(['abundance', 'count', 'cumulative',
                                'cumulative_fraction'])

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break
Exemple #45
0
def main():  # pylint: disable=too-many-locals,too-many-statements
    info("do-partition.py", ["graph"])
    args = sanitize_help(get_parser()).parse_args()

    report_on_config(args, graphtype="nodegraph")

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print("Saving k-mer nodegraph to %s" % args.graphbase, file=sys.stderr)
    print("Loading kmers from sequences in %s" % repr(args.input_filenames), file=sys.stderr)
    print("--", file=sys.stderr)
    print("SUBSET SIZE", args.subset_size, file=sys.stderr)
    print("N THREADS", args.threads, file=sys.stderr)
    print("--", file=sys.stderr)

    # load-graph.py

    print("making nodegraph", file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for _, filename in enumerate(args.input_filenames):
        print("consuming input", filename, file=sys.stderr)
        nodegraph.consume_fasta_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = khmer.calc_expected_collisions(nodegraph, args.force, max_false_pos=0.15)
    print("fp rate estimated to be %1.3f" % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print("** This script brakes for lumps: ", "stop_big_traversals is true.", file=sys.stderr)
    else:
        print("** Traverse all the things:", " stop_big_traversals is false.", file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print("enqueued %d subset tasks" % n_subsets, file=sys.stderr)
    open("%s.info" % args.graphbase, "w").write("%d subsets total\n" % (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print("starting %d threads" % args.threads, file=sys.stderr)
    print("---", file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker, args=(worker_q, args.graphbase, stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    assert threading.active_count() == args.threads + 1

    print("done starting threads", file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print("---", file=sys.stderr)
    print("done making subsets! see %s.subset.*.pmap" % (args.graphbase,), file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + ".subset.*.pmap")

    print("loading %d pmap files (first one: %s)" % (len(pmap_files), pmap_files[0]), file=sys.stderr)

    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print("merging", pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print("removing pmap files", file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print("outputting partitions for", infile, file=sys.stderr)
        outfile = os.path.basename(infile) + ".part"
        part_count = nodegraph.output_partitions(infile, outfile)
        print("output %d partitions for %s" % (part_count, infile), file=sys.stderr)
        print("partitions are in", outfile, file=sys.stderr)
Exemple #46
0
def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info(
        'created temporary directory {temp};\n'
        'use -T to change location',
        temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info(
                    "... {filename} {n_saved} {n_reads} {n_bp} "
                    "{w_reads} {w_bp}",
                    filename=filename,
                    n_saved=trimmer.n_saved,
                    n_reads=trimmer.n_reads,
                    n_bp=trimmer.n_bp,
                    w_reads=written_reads,
                    w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename,
                 kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename,
                         c=trimmer.n_saved,
                         d=trimmer.n_reads,
                         e=trimmer.n_bp,
                         f=written_reads,
                         g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total,
             np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads,
             t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0,
             bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped,
                 p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped,
                 bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
Exemple #47
0
def main():

    info('collect-reads.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, False)

    check_space(args.input_sequence_filename, False)
    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize, False)

    print('Saving k-mer countgraph to %s' % base)
    print('Loading sequences from %s' % repr(filenames))
    if args.output:
        print('Outputting sequences to', args.output)

    print('making countgraph', file=sys.stderr)
    htable = khmer_args.create_countgraph(args)
    htable.set_use_bigcount(args.bigcount)

    total_coverage = 0.
    n = 0

    for index, filename in enumerate(filenames):
        for record in screed.open(filename):
            seq = record.sequence.upper()
            if 'N' in seq:
                seq = seq.replace('N', 'A')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            total_coverage += med
            n += 1

            if total_coverage / float(n) > args.coverage:
                print('reached target average coverage:', \
                      total_coverage / float(n))
                break

            htable.consume(seq)
            if args.output:
                args.output.write(output_single(record))

            if n % 100000 == 0:
                print('...', index, filename, n, total_coverage / float(n))

        if total_coverage / float(n) > args.coverage:
            break

    print('Collected %d reads' % (n, ))

    if args.report_total_kmers:
        print('Total number of k-mers: {0}'.format(htable.n_occupied()),
              file=sys.stderr)

    print('saving', base)
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filenames[-1])

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable, False, max_false_pos=.2)
    print('fp rate estimated to be %1.3f' % fp_rate)
    print('fp rate estimated to be %1.3f' % fp_rate, file=info_fp)

    print('DONE.')
Exemple #48
0
def main():
    info('sweep-reads-buffered.py', ['sweep'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    if args.max_tablesize < MAX_HSIZE:
        args.max_tablesize = MAX_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, graphtype='nodegraph')

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_input_files(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = next(ix)
    del ix

    extension = 'fa'
    if hasattr(record, 'quality'):      # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = GraphLabels.NodeGraphLabels(K, HT_SIZE, N_HT)
    try:
        print('consuming input sequences...', file=sys.stderr)
        if args.label_by_pid:
            print('...labeling by partition id (pid)', file=sys.stderr)
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print('...labeling by sequence', file=sys.stderr)
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print('...consumed {n} sequences...'.format(n=n), file=sys.stderr)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print('...labeling to create groups of size {s}'.format(
                    s=args.group_size), file=sys.stderr)
            label = -1
            g = 0
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                            g=g,
                                                            ext=extension
                                                            ), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g,
                                ext=extension), 'wb')
                    if n % 50000 == 0:
                        print('...consumed {n} sequences...'.format(n=n), file=sys.stderr)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)

                    write_record(record, outfp)

            except (IOError, OSError) as e:
                print('!! ERROR !!', e, file=sys.stderr)
                print('...error splitting input. exiting...', file=sys.stderr)

    except (IOError, OSError) as e:
        print('!! ERROR: !!', e, file=sys.stderr)
        print('...error consuming \
                            {i}. exiting...'.format(i=input_fastp), file=sys.stderr)

    print('done consuming input sequence. \
                        added {t} tags and {l} \
                        labels...'.format(t=ht.graph.n_tags,
                                          l=ht.n_labels))

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print('** sweeping {read_file} for labels...'.format(
            read_file=read_file), file=sys.stderr)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except (IOError, OSError) as error:
            print('!! ERROR: !!', error, file=sys.stderr)
            print('*** Could not open {fn}, skipping...'.format(
                fn=read_file), file=sys.stderr)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print('\tswept {n} reads [{nc} labeled, \
                                         {no} orphaned] \
                                        ** {sec}s ({sect}s total)' \
                                        .format(n=_, nc=n_labeled,
                                                no=n_orphaned,
                                                sec=batch_t, sect=file_t), file=sys.stderr)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = list(ht.sweep_label_neighborhood(seq,
                                                              traversal_range))
                except ValueError as e:
                    pass
                else:
                    if hasattr(record, 'quality'):
                        seq_str = fmt_fastq(name, seq, record.quality, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print('** End of file {fn}...'.format(fn=read_file), file=sys.stderr)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print('** End of run...', file=sys.stderr)
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print('! WARNING: Sweep finished with errors !', file=sys.stderr)
        print('** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors), file=sys.stderr)
        print('** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors), file=sys.stderr)

    print('swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned), file=sys.stderr)
    print('...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned), file=sys.stderr)
    print('...and {nmc} multilabeled'.format(nmc=n_mlabeled), file=sys.stderr)

    print('** outputting label number distribution...', file=sys.stderr)
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'w', encoding='utf-8') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print('** outputting label read counts...', file=sys.stderr)
    with open(fn, 'w', encoding='utf-8') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('saturate-by-median.py', ['diginorm'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    report_on_config(args)

    report_fp = args.report
    report_frequency = args.report_frequency

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, False)
    if args.savegraph:
        check_space_for_graph(args, 'countgraph', False)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadgraph:
        print('loading k-mer countgraph from', args.loadgraph)
        htable = khmer.load_countgraph(args.loadgraph)
    else:
        print('making countgraph')
        htable = create_countgraph(args)

    total = 0
    discarded = 0

    for index, input_filename in enumerate(args.input_filenames):
        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(input_filename,
                                                           htable, args,
                                                           report_fp,
                                                           report_frequency)
        except IOError as err:
            handle_error(err, input_filename)
            if not args.force:
                print("NOTE: This can be overridden using the --force"
                      " argument", file=sys.stderr)
                print('** Exiting!', file=sys.stderr)
                sys.exit(1)
            else:
                print('*** Skipping error file, moving on...', file=sys.stderr)
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print('SKIPPED empty file', input_filename)
            else:
                total += total_acc
                discarded += discarded_acc
                print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename,
                            kept=total - discarded, total=total,
                            perc=int(100. - discarded / float(total) * 100.)))

    if args.savegraph:
        print('Saving k-mer countgraph through', input_filename)
        print('...saving to', args.savegraph)
        htable.save(args.savegraph)

    # re: threshold, see Zhang et al.,
    # http://arxiv.org/abs/1309.2975
    fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate))

    if args.force and len(corrupt_files) > 0:
        print("** WARNING: Finished with errors!", file=sys.stderr)
        print("** I/O Errors occurred in the following files:", file=sys.stderr)
        print("\t", " ".join(corrupt_files), file=sys.stderr)
Exemple #50
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile
    check_input_files(infile, args.force)
    check_space([infile], args.force)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_paired and args.output_single):
            print("Accepting input from stdin; output filenames must be "
                  "provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_dir:
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        out1 = args.output_dir + '/' + os.path.basename(infile) + '.se'
        out2 = args.output_dir + '/' + os.path.basename(infile) + '.pe'
    else:
        out1 = os.path.basename(infile) + '.se'
        out2 = os.path.basename(infile) + '.pe'

    # OVERRIDE default output file locations with -p, -s
    if args.output_paired:
        paired_fp = get_file_writer(args.output_paired, args.gzip, args.bzip)
        out2 = paired_fp.name
    else:
        # Don't override, just open the default filename from above
        paired_fp = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)
    if args.output_single:
        single_fp = get_file_writer(args.output_single, args.gzip, args.bzip)
        out1 = args.output_single.name
    else:
        # Don't override, just open the default filename from above
        single_fp = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)

    print('reading file "%s"' % infile, file=sys.stderr)
    print('outputting interleaved pairs to "%s"' % out2, file=sys.stderr)
    print('outputting orphans to "%s"' % out1, file=sys.stderr)

    n_pe = 0
    n_se = 0

    screed_iter = ReadParser(infile)
    for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index > 0:
            print('...', index, file=sys.stderr)

        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print('DONE; read %d sequences,'
          ' %d pairs and %d singletons' %
          (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr)

    print('wrote to: %s and %s' % (out2, out1),
          file=sys.stderr)
def main():  # pylint: disable=too-many-locals,too-many-branches
    args = sanitize_help(get_parser()).parse_args()
    graph_type = 'smallcountgraph' if args.small_count else 'countgraph'

    configure_logging(args.quiet)
    report_on_config(args, graph_type)

    check_input_files(args.input_sequence_filename, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, graph_type)
        check_space_for_graph(args.savegraph, graphsize, args.force)
    if (not args.squash_output
            and os.path.exists(args.output_histogram_filename)):
        log_error('ERROR: {output} exists; not squashing.',
                  output=args.output_histogram_filename)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        hist_fp_csv = csv.writer(hist_fp)
        # write headers:
        hist_fp_csv.writerow(
            ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    log_info('making countgraph')
    # In case the user specified a maximum memory usage, use 8/(9+eps) of that
    # for the countgraph and 1/(9+eps) for the tracking nodegraph
    # `eps` is used to account for the memory used by the python interpreter
    countgraph = khmer_args.create_countgraph(args, multiplier=8 / (9. + 0.3))
    countgraph.set_use_bigcount(args.bigcount)

    log_info('building k-mer tracking graph')
    tracking = khmer_args.create_matching_nodegraph(countgraph)

    log_info('kmer_size: {ksize}', ksize=countgraph.ksize())
    log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes())
    log_info('outputting to {output}', output=args.output_histogram_filename)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 1 -- {input}',
             input=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=countgraph.consume_seqfile_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    log_info('Total number of unique k-mers: {nk}',
             nk=countgraph.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = countgraph.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    log_info('preparing hist from {seqfile}...',
             seqfile=args.input_sequence_filename)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 2 -- {filename}',
             filename=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break

    if args.savegraph is not None:
        log_info('Saving k-mer countgraph to {savegraph}',
                 savegraph=args.savegraph)
        countgraph.save(args.savegraph)

    log_info('wrote to: {output}', output=args.output_histogram_filename)
Exemple #52
0
def main():
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        ":doc:`partitioning-big-data`",
        "http://khmer.readthedocs.io/en/stable/user/"
        "partitioning-big-data.html"
    )
    args = sanitize_help(parser).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
    graph = khmer.load_nodegraph(graphbase)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    graph.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
        graph.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)
    print('---', file=sys.stderr)
    print('output stoptags will be in',
          graphbase + '.stoptags', file=sys.stderr)
    if initial_stoptags:
        print(
            '(these output stoptags will include the already-loaded set)',
            file=sys.stderr)
    print('---', file=sys.stderr)

    # create countgraph
    ksize = graph.ksize()
    counting = khmer_args.create_countgraph(args, ksize=ksize)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print('<-', subset_file, file=sys.stderr)
        subset = graph.load_subset_partitionmap(subset_file)

        print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
        graph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** merging subset... %s' % subset_file, file=sys.stderr)
        graph.merge_subset(subset)

        print('** repartitioning, round 2... %s' %
              subset_file, file=sys.stderr)
        size = graph.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size:', size, file=sys.stderr)

        print('saving stoptags binary', file=sys.stderr)
        graph.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr)

    print('done!', file=sys.stderr)
Exemple #53
0
def main():

    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                          args.force)

    info_filename = base + ".info"
    check_file_writable(base)
    check_file_writable(info_filename)

    log_info('Saving k-mer countgraph to {base}', base=base)
    log_info('Loading kmers from sequences in {filenames}',
             filenames=repr(filenames))

    # clobber the '.info' file now, as we always open in append mode below
    with open(info_filename, 'w') as info_fp:
        print('khmer version:', khmer.__version__, file=info_fp)

    log_info('making countgraph')
    countgraph = khmer_args.create_countgraph(args)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        log_info('consuming input {input}', input=filename)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=countgraph.consume_seqfile_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            log_info('mid-save {base}', base=base)

            countgraph.save(base)
        with open(info_filename, 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    log_info('Total number of unique k-mers: {nk}', nk=n_kmers)
    with open(info_filename, 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    log_info('saving {base}', base=base)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(info_filename, 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        log_info("Writing summmary info to {mr_file}", mr_file=mr_file)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('DONE.')
    log_info('wrote to: {filename}', filename=info_filename)
Exemple #54
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print(
                "Accepting input from stdin; output filename must "
                "be provided with '-o'.",
                file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
Exemple #55
0
def main():
    info('sample-reads-randomly.py')
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        "`reservoir sampling\n"
        "<http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.",
        "reservoir sampling algorithm. "
        "http://en.wikipedia.org/wiki/Reservoir_sampling")
    args = sanitize_help(parser).parse_args()

    for name in args.filenames:
        check_input_files(name, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be

    if args.output_file:
        output_filename = args.output_file.name
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                print("NOTE: This can be overridden using the --force"
                      " argument", file=sys.stderr)
                sys.exit(1)
    else:
        filename = args.filenames[0]
        if filename in ('/dev/stdin', '-'):
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads, file=sys.stderr)
        print('Subsampled reads will be placed in %s' %
              output_filename, file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,'
              % (args.num_reads, num_samples), ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N'
              % output_filename, file=sys.stderr)
        print('', file=sys.stderr)

    reads = []
    for _ in range(num_samples):
        reads.append([])

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)
        screed_iter = screed.open(filename)

        for count, (_, _, rcrd1, rcrd2) in enumerate(broken_paired_reader(
                screed_iter, force_single=args.force_single)):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' %
                          args.max_reads, '(see -M); exiting', file=sys.stderr)
                    break

            # collect first N reads
            if count < args.num_reads:
                for sample in range(num_samples):
                    reads[sample].append((rcrd1, rcrd2))
            else:
                for sample in range(num_samples):
                    assert len(reads[sample]) <= count

                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling

                for n in range(num_samples):
                    guess = random.randint(1, count)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = (rcrd1, rcrd2)

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' %
              (len(reads[0]), output_filename), file=sys.stderr)

        output_file = args.output_file
        if not output_file:
            output_file = open(output_filename, 'wb')

        output_file = get_file_writer(output_file, args.gzip, args.bzip)

        for records in reads[0]:
            write_record(records[0], output_file)
            if records[1] is not None:
                write_record(records[1], output_file)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' %
                  (len(reads[n]), n_filename), file=sys.stderr)
            output_file = get_file_writer(open(n_filename, 'wb'), args.gzip,
                                          args.bzip)
            for records in reads[n]:
                write_record(records[0], output_file)
                if records[1] is not None:
                    write_record(records[1], output_file)
Exemple #56
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    start_time = time.time()
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph)
        countgraph1 = Countgraph.load(args.loadgraph)

    # load second counting table.
    if args.loadgraph2:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph2)
        countgraph2 = Countgraph.load(args.loadgraph2)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        screed_iter = clean_input_reads(screed.open(filename))
        reader = broken_paired_reader(screed_iter,
                                      min_length=args.ksize,
                                      force_single=force_single,
                                      require_paired=require_paired)

        # actually do diginorm
        for _, is_paired, read0, read1 in reader:
            for record in snarf(is_paired, read0, read1, countgraph1,
                                countgraph2):
                if record is not None:
                    write_record(record, outfp)

    print("--- %s seconds ---" % (time.time() - start_time))
Exemple #57
0
def main():
    info('trim-low-abund.py', ['streaming'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    if args.loadgraph:
        print('loading countgraph from', args.loadgraph, file=sys.stderr)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print('making countgraph', file=sys.stderr)
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; '
          'use -T to change location' % tempdir, file=sys.stderr)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.output is None:
            trimfp = get_file_writer(open(os.path.basename(filename) +
                                          '.abundtrim', 'wb'),
                                     args.gzip, args.bzip)
        else:
            trimfp = get_file_writer(args.output, args.gzip, args.bzip)

        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename,
              file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename,
                      written_reads, written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes),
          file=sys.stderr)
    print('removed %d reads and trimmed %d reads (%.2f%%)' %
          (n_reads - written_reads, trimmed_reads, percent_reads_trimmed),
          file=sys.stderr)
    print('trimmed or removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
                                                         percent_reads_hicov),
              file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp),
              file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.abundtrim', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to",
              args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)