Example 1
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser,
            args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for thread in threads:
        thread.join()

    log_info("Total number of unique k-mers: {nk}", nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile
    outfp = open(outfile, "wb")
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            write_record(trimmed_record, outfp)

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}", graph=args.savegraph)
        graph.save(args.savegraph)
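
The per-read work above is delegated to trim_record. As a rough, hypothetical
sketch of the rule it applies (not khmer's actual implementation), the read is
scanned k-mer by k-mer and truncated at the first k-mer whose count in the
graph falls below the cutoff:

# Hypothetical sketch only: `counts` stands in for the countgraph as a
# plain dict of k-mer -> count.
def sketch_trim(sequence, counts, ksize, cutoff):
    for start in range(len(sequence) - ksize + 1):
        kmer = sequence[start:start + ksize]
        if counts.get(kmer, 0) < cutoff:
            # cut the read just past the last k-mer that met the cutoff
            return sequence[:start + ksize - 1]
    return sequence

# with k=4 and cutoff=2, the read is cut where coverage drops:
counts = {"ACGT": 5, "CGTA": 5, "GTAC": 1}
print(sketch_trim("ACGTAC", counts, 4, 2))  # -> "ACGTA"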
Example 2
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
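
Both this example and Example 1 wrap the raw output handle with
get_file_writer so that --gzip/--bzip transparently compress the stream. A
minimal sketch of that kind of dispatch, assuming only the standard-library
gzip and bz2 modules (khmer's own helper may differ in detail):

import bz2
import gzip

# Hypothetical stand-in for get_file_writer: wrap an already-open binary
# handle in a compressing writer, or return it unchanged.
def sketch_file_writer(outfp, do_gzip, do_bzip):
    if do_gzip and do_bzip:
        raise ValueError("gzip and bzip compression are mutually exclusive")
    if do_gzip:
        return gzip.open(outfp, "wb")
    if do_bzip:
        return bz2.open(outfp, "wb")
    return outfp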
Example 3
    def pass1(self, reader, saver):
        """
        The first pass across the read data.

        It does the following:

        1. If do_normalize is set, discard all read pairs with coverage
        above DIGINORM_COVERAGE.

        2. For each remaining read pair, check if the read pair is above
        the coverage necessary for trimming (TRIM_AT_COVERAGE).  If so,
        k-mer trim the reads at CUTOFF, and yield them.

        3. If the read pair is not at the coverage necessary for trimming,
        consume the read pair with the graph and save the read pair for the
        second pass.
        """
        graph = self.graph
        TRIM_AT_COVERAGE = self.trim_at_coverage
        CUTOFF = self.cutoff
        DIGINORM_COVERAGE = self.diginorm_coverage
        K = graph.ksize()

        for n, is_pair, read1, read2 in reader:
            bundle = ReadBundle(read1, read2)

            # tally the reads and bp examined, for reporting.
            self.n_reads += bundle.num_reads
            self.n_bp += bundle.total_length

            min_coverage = min(bundle.coverages(graph))

            if self.do_normalize and min_coverage >= DIGINORM_COVERAGE:
                # skip reads if normalizing
                continue

            # trim?
            if min_coverage >= TRIM_AT_COVERAGE:
                for read in bundle.reads:
                    record, did_trim = trim_record(graph, read, CUTOFF)
                    if did_trim:
                        self.trimmed_reads += 1
                    if record:
                        yield record
            # no, too low coverage to trim; consume & set aside for 2nd pass.
            else:
                for read in bundle.reads:
                    graph.consume(read.cleaned_seq)
                    write_record(read, saver)
                    self.n_saved += 1
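
pass1 yields confidently trimmable reads right away and parks everything else
in `saver` for a second look. A hypothetical driver showing how the two passes
could be chained (the real trim-low-abund.py adds CLI handling, progress
reporting, and cleanup around this):

import tempfile

# Hypothetical wiring only. `reader` and `make_reader(path)` are assumed to
# produce broken_paired_reader-style (n, is_pair, read1, read2) tuples.
def sketch_two_pass(trimmer, reader, make_reader):
    with tempfile.NamedTemporaryFile(suffix=".fa", delete=False) as saver:
        # pass 1: emit reads already at trimming coverage; the rest are
        # counted into the graph and written to `saver`.
        yield from trimmer.pass1(reader, saver)
    # pass 2: the graph now holds every read, so revisit the parked ones
    # (the temp file is left on disk here; cleanup omitted for brevity).
    yield from trimmer.pass2(make_reader(saver.name))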
Example 4
    def pass2(self, reader):
        """
        The second pass across the data does the following.

        1. For each read, evaluate the coverage. If the coverage is
        sufficient to trim, OR we are trimming low-abundance reads (-V not
        set), do trimming.

        2. Otherwise, yield the read pair untrimmed.
        """
        graph = self.graph
        TRIM_AT_COVERAGE = self.trim_at_coverage
        CUTOFF = self.cutoff
        K = graph.ksize()

        for n, is_pair, read1, read2 in reader:
            bundle = ReadBundle(read1, read2)

            # tally the reads and bp examined, for reporting.
            self.n_reads += bundle.num_reads
            self.n_bp += bundle.total_length

            if self.do_trim_low_abund or \
               bundle.coverages_at_least(graph, TRIM_AT_COVERAGE):

                for read in bundle.reads:
                    trimmed_record, did_trim = trim_record(graph, read, CUTOFF)

                    if did_trim:
                        self.trimmed_reads += 1
                    if trimmed_record:
                        yield trimmed_record
            else:
                for read in bundle.reads:
                    self.n_skipped += 1
                    self.bp_skipped += len(read.sequence)  # count bases, not reads
                    yield read
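
The branch above hinges on coverages_at_least: a pair counts as covered only
when every read in it clears TRIM_AT_COVERAGE. A toy, dict-backed sketch of
that test, assuming a read's coverage is estimated as the median count of its
k-mers (as khmer's digital-normalization machinery does):

# Hypothetical sketch: `counts` is a dict of k-mer -> count standing in
# for the countgraph.
def sketch_coverages_at_least(reads, counts, ksize, threshold):
    def coverage(seq):
        kmer_counts = sorted(counts.get(seq[i:i + ksize], 0)
                             for i in range(len(seq) - ksize + 1))
        return kmer_counts[len(kmer_counts) // 2]  # median k-mer count
    return all(coverage(seq) >= threshold for seq in reads)

# "GTT" has median count 1 < 2, so the pair as a whole fails the test:
counts = {"AC": 3, "CG": 3, "GT": 1, "TT": 1}
print(sketch_coverages_at_least(["ACG", "GTT"], counts, 2, 2))  # -> False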