def normalize_by_median_and_check(input_filename, htable, single_output_file,
                                  fail_save, paired, force, norm,
                                  report_fp=None):
    total = 0
    discarded = 0

    total_acc = None
    discarded_acc = None

    if single_output_file:
        if single_output_file is sys.stdout:
            output_name = '/dev/stdout'
        else:
            output_name = single_output_file.name
        outfp = single_output_file
    else:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

    with CatchIOErrors(input_filename, outfp, fail_save, htable, force, norm):
        for record in norm(input_filename, paired):
            write_record(record, outfp)

        total = norm.total
        discarded = norm.discarded
        if report_fp:
            print(str(total) + " " + str(total - discarded) + " " +
                  str(1. - (discarded / float(total))), file=report_fp)
            report_fp.flush()

    return norm.total, norm.discarded, norm.corrupt_files

def do_write(self, outfp):
    outq = self.outqueue
    while self.worker_count > 0 or not outq.empty():
        try:
            g = outq.get(True, 1)
        except queue.Empty:
            continue

        for name, seq, qual in g.seqlist:
            if qual:
                record = screed.Record(name=name, sequence=seq, quality=qual)
            else:
                record = screed.Record(name=name, sequence=seq)
            write_record(record, outfp)

    if self.verbose:
        print("DONE writing.\nprocessed %d / wrote %d / removed %d" %
              (self.n_processed, self.n_written,
               self.n_processed - self.n_written), file=sys.stderr)
        print("processed %d bp / wrote %d bp / removed %d bp" %
              (self.bp_processed, self.bp_written,
               self.bp_processed - self.bp_written), file=sys.stderr)
        discarded = self.bp_processed - self.bp_written
        f = float(discarded) / float(self.bp_processed) * 100
        print("discarded %.1f%%" % f, file=sys.stderr)

def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('making hashtable')
    ht = Countgraph.load(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'
        outfp = open(outfile, 'w')

        paired_iter = broken_paired_reader(ReadParser(infile), min_length=K,
                                           force_single=True)
        for n, is_pair, read1, read2 in paired_iter:
            name = read1.name
            seq = read1.sequence
            if 'N' in seq:
                # skip reads with ambiguous bases
                continue

            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

            if trim_at >= K:
                write_record(screed.Record(name=name, sequence=trim_seq),
                             outfp)

def do_write(self, outfp):
    outq = self.outqueue
    while self.worker_count > 0 or not outq.empty():
        try:
            grouping = outq.get(True, 1)
        except queue.Empty:
            continue

        for name, seq, qual in grouping.seqlist:
            if qual:
                record = screed.Record(name=name, sequence=seq, quality=qual)
            else:
                record = screed.Record(name=name, sequence=seq)
            write_record(record, outfp)

    if self.verbose:
        print("DONE writing.\nprocessed %d / wrote %d / removed %d" %
              (self.n_processed, self.n_written,
               self.n_processed - self.n_written), file=sys.stderr)
        print("processed %d bp / wrote %d bp / removed %d bp" %
              (self.bp_processed, self.bp_written,
               self.bp_processed - self.bp_written), file=sys.stderr)
        discarded = self.bp_processed - self.bp_written
        percent = float(discarded) / float(self.bp_processed) * 100
        print("discarded %.1f%%" % percent, file=sys.stderr)

def main():
    args = sanitize_help(get_parser()).parse_args()
    print('fastq from ', args.input_sequence, file=sys.stderr)
    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    n_count = 0
    for n, record in enumerate(screed.open(args.input_sequence)):
        if n % 10000 == 0:
            print('...', n, file=sys.stderr)

        sequence = record['sequence']

        if 'N' in sequence:
            if not args.n_keep:
                n_count += 1
                continue

        del record['quality']
        write_record(record, outfp)

    print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr)

    if not args.n_keep:
        print(str(n_count) + ' lines dropped.', file=sys.stderr)
    else:
        print('No lines dropped from file.', file=sys.stderr)

    print('Wrote output to', describe_file_handle(args.output),
          file=sys.stderr)

def main(args):
    print('[kevlar::mutate] loading mutations', file=args.logfile)
    mutations = load_mutations(kevlar.open(args.mutations, 'r'), args.logfile)

    print('[kevlar::mutate] mutating genome', file=args.logfile)
    # open the output stream once, outside the loop
    outstream = kevlar.open(args.out, 'w')
    for record in mutate_genome(args.genome, mutations):
        write_record(record, outstream)

def main():
    args = sanitize_help(get_parser()).parse_args()
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    for filename in args.input_filenames:
        for record in screed.open(filename):
            if len(record['sequence']) >= args.length:
                write_record(record, outfp)
    print('wrote to: ' + args.output.name, file=sys.stderr)

def main():
    args = get_parser().parse_args()
    outfp = open(args.output, 'w')
    for filename in args.input_filenames:
        for record in screed.open(filename, parse_description=False):
            if len(record['sequence']) >= args.length:
                write_record(record, outfp)
    print >> sys.stderr, 'wrote to: ' + args.output

def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)
    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info("Total number of unique k-mers: {nk}",
             nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile
    outfp = open(outfile, "wb")
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            write_record(trimmed_record, outfp)

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}",
                 graph=args.savegraph)
        graph.save(args.savegraph)

def main(args):
    fastq = kevlar.open(args.out, 'w')
    refr = None
    if args.refr:
        print('[kevlar::dump] Loading reference sequence', file=args.logfile)
        refrstream = kevlar.open(args.refr, 'r')
        refr = kevlar.seqio.parse_seq_dict(refrstream)
    for read in dump(args.reads, refr, logstream=args.logfile):
        write_record(read, fastq)

def main():
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for _ in args.infiles:
        check_file_status(_, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    fail = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True

    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    counter = 0
    for read1, read2 in itertools.izip(screed.open(s1_file),
                                       screed.open(s2_file)):
        if counter % 100000 == 0:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        name1 = read1.name
        if not name1.endswith('/1'):
            name1 += '/1'

        name2 = read2.name
        if not name2.endswith('/2'):
            name2 += '/2'

        assert name1[:-2] == name2[:-2], \
            "This doesn't look like paired data! %s %s" % (name1, name2)

        read1.name = name1
        read2.name = name2
        write_record(read1, args.output)
        write_record(read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % counter
    print >> sys.stderr, 'output written to', args.output

def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)

def pass1(self, reader, saver):
    """
    The first pass across the read data.

    It does the following:

    1. If do_normalize is set, discard all read pairs with coverage
       above DIGINORM_COVERAGE.

    2. For each remaining read pair, check if the read pair is above
       the coverage necessary for trimming (TRIM_AT_COVERAGE).  If so,
       k-mer trim the reads at CUTOFF, and yield them.

    3. If the read pair is not at the coverage necessary for trimming,
       consume the read pair with the graph and save the read pair for
       the second pass.
    """
    graph = self.graph
    TRIM_AT_COVERAGE = self.trim_at_coverage
    CUTOFF = self.cutoff
    DIGINORM_COVERAGE = self.diginorm_coverage
    K = graph.ksize()

    for n, is_pair, read1, read2 in reader:
        bundle = ReadBundle(read1, read2)

        # clean up the sequences for examination.
        self.n_reads += bundle.n_reads
        self.n_bp += bundle.n_bp

        min_coverage = min(bundle.coverages(graph))

        if self.do_normalize and min_coverage >= DIGINORM_COVERAGE:
            # skip reads if normalizing
            continue

        # trim?
        if min_coverage >= TRIM_AT_COVERAGE:
            for read, cleaned_read in bundle.both():
                record, did_trim = do_trim_read(graph, read, cleaned_read,
                                                CUTOFF)
                if did_trim:
                    self.trimmed_reads += 1
                if record:
                    yield record
        # no, too low coverage to trim; consume & set aside for 2nd pass.
        else:
            for read, cleaned_read in bundle.both():
                graph.consume(cleaned_read)
                write_record(read, saver)
                self.n_saved += 1

def pass1(self, reader, saver):
    """
    The first pass across the read data.

    It does the following:

    1. If do_normalize is set, discard all read pairs with coverage
       above DIGINORM_COVERAGE.

    2. For each remaining read pair, check if the read pair is above
       the coverage necessary for trimming (TRIM_AT_COVERAGE).  If so,
       k-mer trim the reads at CUTOFF, and yield them.

    3. If the read pair is not at the coverage necessary for trimming,
       consume the read pair with the graph and save the read pair for
       the second pass.
    """
    graph = self.graph
    TRIM_AT_COVERAGE = self.trim_at_coverage
    CUTOFF = self.cutoff
    DIGINORM_COVERAGE = self.diginorm_coverage
    K = graph.ksize()

    for n, is_pair, read1, read2 in reader:
        bundle = ReadBundle(read1, read2)

        # clean up the sequences for examination.
        self.n_reads += bundle.num_reads
        self.n_bp += bundle.total_length

        min_coverage = min(bundle.coverages(graph))

        if self.do_normalize and min_coverage >= DIGINORM_COVERAGE:
            # skip reads if normalizing
            continue

        # trim?
        if min_coverage >= TRIM_AT_COVERAGE:
            for read in bundle.reads:
                record, did_trim = trim_record(graph, read, CUTOFF)
                if did_trim:
                    self.trimmed_reads += 1
                if record:
                    yield record
        # no, too low coverage to trim; consume & set aside for 2nd pass.
        else:
            for read in bundle.reads:
                graph.consume(read.cleaned_seq)
                write_record(read, saver)
                self.n_saved += 1

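# The two pass1 variants above share one two-pass streaming design: reads
# whose estimated coverage is already high are abundance-trimmed and emitted
# immediately, while low-coverage reads are counted and set aside so they can
# be re-examined once the counter has seen the whole data set. Below is a
# minimal, self-contained sketch of that control flow, using an exact Counter
# in place of khmer's approximate countgraph; the diginorm branch is omitted,
# and every name and constant here is illustrative, not khmer's API.
from collections import Counter

K, CUTOFF, TRIM_AT_COVERAGE = 4, 1, 3
counts = Counter()

def kmers(seq):
    return [seq[i:i + K] for i in range(len(seq) - K + 1)]

def median_coverage(seq):
    cov = sorted(counts[k] for k in kmers(seq))
    return cov[len(cov) // 2]

def trim(seq):
    # keep the longest prefix whose k-mers all have count > CUTOFF
    for i, k in enumerate(kmers(seq)):
        if counts[k] <= CUTOFF:
            return seq[:i + K - 1]
    return seq

def pass1(reads, saved):
    for seq in reads:
        if median_coverage(seq) >= TRIM_AT_COVERAGE:
            yield trim(seq)            # high coverage: trim and emit now
        else:
            counts.update(kmers(seq))  # not saturated: count it...
            saved.append(seq)          # ...and defer to the second pass

def pass2(saved):
    for seq in saved:
        yield trim(seq)                # counts are now complete

saved = []
reads = ['ACGTACGTAC'] * 4 + ['ACGTTTTTTT']
out = list(pass1(reads, saved)) + list(pass2(saved))
print(out)  # the erroneous 'ACGTTTTTTT' tail is trimmed in pass 2
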
def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)

def process_unassigned(self, outfp=None):
    """
    Process unassigned reads, optionally writing them out if outfp is given.

    Also tallies counts per partition ID, which is needed for further
    processing.
    """
    with PartitionedReader(self.file_list) as reader:
        for read, pid in reader:
            self.count[pid] = self.count.get(pid, 0) + 1
            if pid == 0:
                self.n_unassigned += 1
                if outfp:
                    write_record(read, outfp)

def main():
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile
    check_file_status(infile, args.force)
    filenames = [infile]
    check_space(filenames, args.force)

    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    # OVERRIDE defaults with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    fp_out1 = open(out1, 'w')
    fp_out2 = open(out2, 'w')

    counter1 = 0
    counter2 = 0
    index = None

    for index, record in enumerate(screed.open(infile)):
        if index % 100000 == 0 and index:
            print >> sys.stderr, '...', index

        name = record.name
        if name.endswith('/1'):
            write_record(record, fp_out1)
            counter1 += 1
        elif name.endswith('/2'):
            write_record(record, fp_out2)
            counter2 += 1

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (index + 1, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2

def main():
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_input_files(args.infile, args.force)
    infiles = [args.infile]
    check_space(infiles, args.force)

    outfile = os.path.basename(args.infile)
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    single_fp = open(outfile + '.se', 'w')
    paired_fp = open(outfile + '.pe', 'w')

    print >>sys.stderr, 'reading file "%s"' % args.infile
    print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
    print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile

    n_pe = 0
    n_se = 0

    screed_iter = screed.open(args.infile, parse_description=False)
    for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index > 0:
            print >>sys.stderr, '...', index

        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print >>sys.stderr, 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % \
        (n_pe * 2 + n_se, n_pe, n_se)

    print >> sys.stderr, 'wrote to: ' + outfile \
        + '.se' + ' and ' + outfile + '.pe'

def main():
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_file_status(args.infile, args.force)
    infiles = [args.infile]
    check_space(infiles, args.force)

    outfile = os.path.basename(args.infile)
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    single_fp = open(outfile + '.se', 'w')
    paired_fp = open(outfile + '.pe', 'w')

    print >> sys.stderr, 'reading file "%s"' % args.infile
    print >> sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
    print >> sys.stderr, 'outputting orphans to "%s.se"' % outfile

    n_pe = 0
    n_se = 0

    screed_iter = screed.open(args.infile, parse_description=False)
    for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index > 0:
            print >> sys.stderr, '...', index

        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print >>sys.stderr, 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % \
        (n_pe * 2 + n_se, n_pe, n_se)

    print >> sys.stderr, 'wrote to: ' + outfile \
        + '.se' + ' and ' + outfile + '.pe'

def main():
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile
    outfp = open(outfile, 'w')

    # create a countgraph data structure
    ht = khmer_args.create_countgraph(args)

    # load contigs, connect into N partitions
    print('loading input reads from', inp)
    ht.consume_seqfile(inp)

    print('starting sweep.')

    m = 0
    K = ht.ksize()
    instream = screed.open(readsfile)
    for n, is_pair, read1, read2 in broken_paired_reader(instream):
        if n % 10000 == 0:
            print('...', n, m)

        if is_pair:
            count1 = ht.get_median_count(read1.sequence)[0]
            count2 = ht.get_median_count(read2.sequence)[0]
            if count1 or count2:
                m += 1
                write_record_pair(read1, read2, outfp)
        else:
            count = ht.get_median_count(read1.sequence)[0]
            if count:
                m += 1
                write_record(read1, outfp)

def main():
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile
    outfp = open(outfile, 'w')

    # create a countgraph data structure
    ht = khmer_args.create_countgraph(args)

    # load contigs, connect into N partitions
    print('loading input reads from', inp)
    ht.consume_fasta(inp)

    print('starting sweep.')

    m = 0
    K = ht.ksize()
    instream = screed.open(readsfile)
    for n, is_pair, read1, read2 in broken_paired_reader(instream):
        if n % 10000 == 0:
            print('...', n, m)

        if is_pair:
            count1 = ht.get_median_count(read1.sequence)[0]
            count2 = ht.get_median_count(read2.sequence)[0]
            if count1 or count2:
                m += 1
                write_record_pair(read1, read2, outfp)
        else:
            count = ht.get_median_count(read1.sequence)[0]
            if count:
                m += 1
                write_record(read1, outfp)

def main():
    info('unique-kmers.py', ['SeqAn', 'hll'])
    args = get_parser().parse_args()

    total_hll = khmer.HLLCounter(args.error_rate, args.ksize)

    report_fp = args.report
    input_filename = None
    for index, input_filename in enumerate(args.input_filenames):
        hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)
        for record in screed.open(input_filename):
            seq = record.sequence.upper().replace('N', 'A')
            hllcpp.consume_string(seq)
            if args.stream_out:
                write_record(record, sys.stdout)

        cardinality = hllcpp.estimate_cardinality()
        print('Estimated number of unique {0}-mers in {1}: {2}'.format(
            args.ksize, input_filename, cardinality), file=sys.stderr)

        if report_fp:
            print(cardinality, args.ksize, '(total)', file=report_fp)
            report_fp.flush()
        total_hll.merge(hllcpp)

    cardinality = total_hll.estimate_cardinality()
    print('Total estimated number of unique {0}-mers: {1}'.format(
        args.ksize, cardinality), file=sys.stderr)

    to_print = graphsize_args_report(cardinality, args.error_rate)
    if args.diagnostics:
        print(to_print, file=sys.stderr)

    if report_fp:
        print(cardinality, args.ksize, 'total', file=report_fp)
        print(to_print, file=report_fp)
        report_fp.flush()

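# A hedged usage sketch of the HLLCounter calls exercised in main() above
# (a constructor taking an error rate and k, consume_string, merge, and
# estimate_cardinality). The parameters and toy sequences here are made up
# for illustration; they are not from any khmer example.
import khmer

hll_a = khmer.HLLCounter(0.01, 20)   # error rate, ksize
hll_b = khmer.HLLCounter(0.01, 20)
hll_a.consume_string('GATTACA' * 10)
hll_b.consume_string('ACGTACGTACGTACGTACGTACGT')
hll_a.merge(hll_b)                   # combine per-file estimates
print(hll_a.estimate_cardinality())  # approximate distinct 20-mer count
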
def main():
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    tablesize = calculate_graphsize(args, 'countgraph')
    if args.savegraph:
        check_space_for_graph(args.savegraph, tablesize, args.force)

    K = args.ksize
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print('loading k-mer countgraph from', args.loadgraph,
              file=sys.stderr)
        ct = Countgraph.load(args.loadgraph)
    else:
        print('making k-mer countgraph', file=sys.stderr)
        ct = create_countgraph(args, multiplier=8 / (9. + 0.3))

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; use -T to change location' %
          tempdir, file=sys.stderr)

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename), file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename, file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(
                screed.open(pass2filename, parse_description=False)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename, written_reads,
                      written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)
                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)

                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir,
          file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' %
          (save_pass2_total, n_passes), file=sys.stderr)
    print('removed %d reads and corrected %d reads (%.2f%%)' %
          (n_reads - written_reads, corrected_reads,
           percent_reads_corrected), file=sys.stderr)
    print('removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp), file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.corr', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)

def main():
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile
    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_first and args.output_second):
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
    if args.output_second:
        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)

    # put orphaned reads here, if -0!
    if args.output_orphaned:
        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
        out0 = describe_file_handle(args.output_orphaned)

    counter1 = 0
    counter2 = 0
    counter3 = 0
    index = None

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(
        ReadParser(infile), require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            elif args.output_orphaned:
                write_record(record1, fp_out0)
                counter3 += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name), file=sys.stderr)
        sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (counter1 + counter2, counter1, counter2, counter3),
          file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)

def main():
    info('trim-low-abund.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadtable:
        print >>sys.stderr, 'loading k-mer counting table from', \
            args.loadtable
        ct = khmer.load_counting_hash(args.loadtable)
    else:
        print >>sys.stderr, 'making k-mer counting table'
        ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
        'use -T to change location' % tempdir

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        trimfilename = os.path.basename(filename) + '.abundtrim'

        pass2list.append((filename, pass2filename, trimfilename))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')
        trimfp = open(trimfilename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()
        trimfp.close()

        print '%s: kept aside %d of %d from first pass, in %s' % \
            (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfilename in pass2list:
        print 'second pass: looking at sequences kept aside in %s' % \
            pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        trimfp = open(trimfilename, 'a')
        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)
                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print 'read %d reads, %d bp' % (n_reads, n_bp,)
    print 'wrote %d reads, %d bp' % (written_reads, written_bp,)
    print 'looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes)
    print 'removed %d reads and trimmed %d reads (%.2f%%)' % \
        (n_reads - written_reads, trimmed_reads, percent_reads_trimmed)
    print 'trimmed or removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print '%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
                                                         percent_reads_hicov)
        print 'skipped %d reads/%d bases because of low coverage' % \
            (skipped_n, skipped_bp)

    fp_rate = khmer.calc_expected_collisions(ct)
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too"
                              " small for this data set.  Increase"
                              " tablesize/# tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(1)

    print 'output in *.abundtrim'

    if args.savetable:
        print >>sys.stderr, "Saving k-mer counting table to", args.savetable
        ct.save(args.savetable)

def main():
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_input_files(_, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #

    if args.output_file:
        output_filename = args.output_file.name
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                print("NOTE: This can be overridden using the --force"
                      " argument", file=sys.stderr)
                sys.exit(1)
    else:
        filename = args.filenames[0]
        if filename in ('/dev/stdin', '-'):
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads, file=sys.stderr)
        print('Subsampled reads will be placed in %s' %
              output_filename, file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,' %
              (args.num_reads, num_samples), ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N' %
              output_filename, file=sys.stderr)
        print('', file=sys.stderr)

    reads = []
    for n in range(num_samples):
        reads.append([])

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)

        screed_iter = screed.open(filename)

        for count, (_, ispair, rcrd1, rcrd2) in enumerate(
                broken_paired_reader(screed_iter,
                                     force_single=args.force_single)):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' %
                          args.max_reads, '(see -M); exiting',
                          file=sys.stderr)
                    break

            # collect first N reads
            if count < args.num_reads:
                for n in range(num_samples):
                    reads[n].append((rcrd1, rcrd2))
            else:
                assert len(reads[n]) <= count

                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                for n in range(num_samples):
                    guess = random.randint(1, count)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = (rcrd1, rcrd2)

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' %
              (len(reads[0]), output_filename), file=sys.stderr)

        output_file = args.output_file
        if not output_file:
            output_file = open(output_filename, 'wb')

        output_file = get_file_writer(output_file, args.gzip, args.bzip)

        for records in reads[0]:
            write_record(records[0], output_file)
            if records[1] is not None:
                write_record(records[1], output_file)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' %
                  (len(reads[n]), n_filename), file=sys.stderr)
            output_file = get_file_writer(open(n_filename, 'wb'),
                                          args.gzip, args.bzip)
            for records in reads[n]:
                write_record(records[0], output_file)
                if records[1] is not None:
                    write_record(records[1], output_file)

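# The sampling loop above is reservoir sampling (Algorithm R), run once per
# sample. A minimal standalone version for reference, with hypothetical
# names: keep the first num_reads items, then replace a random reservoir
# slot with probability num_reads / count.
import random

def reservoir_sample(stream, num_reads, seed=None):
    rng = random.Random(seed)
    reservoir = []
    for count, item in enumerate(stream, start=1):
        if count <= num_reads:
            reservoir.append(item)        # fill the reservoir first
        else:
            guess = rng.randint(1, count)
            if guess <= num_reads:        # probability num_reads / count
                reservoir[guess - 1] = item
    return reservoir

print(reservoir_sample(range(100000), 5, seed=42))
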
def main():
    args = sanitize_help(get_parser()).parse_args()
    infile = args.infile
    check_input_files(infile, args.force)
    check_space([infile], args.force)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_paired and args.output_single):
            print("Accepting input from stdin; output filenames must be "
                  "provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_dir:
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        out1 = args.output_dir + '/' + os.path.basename(infile) + '.se'
        out2 = args.output_dir + '/' + os.path.basename(infile) + '.pe'
    else:
        out1 = os.path.basename(infile) + '.se'
        out2 = os.path.basename(infile) + '.pe'

    # OVERRIDE default output file locations with -p, -s
    if args.output_paired:
        paired_fp = get_file_writer(args.output_paired, args.gzip, args.bzip)
        out2 = paired_fp.name
    else:
        # Don't override, just open the default filename from above
        paired_fp = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)
    if args.output_single:
        single_fp = get_file_writer(args.output_single, args.gzip, args.bzip)
        out1 = args.output_single.name
    else:
        # Don't override, just open the default filename from above
        single_fp = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)

    print('reading file "%s"' % infile, file=sys.stderr)
    print('outputting interleaved pairs to "%s"' % out2, file=sys.stderr)
    print('outputting orphans to "%s"' % out1, file=sys.stderr)

    n_pe = 0
    n_se = 0

    screed_iter = ReadParser(infile)
    for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index > 0:
            print('...', index, file=sys.stderr)
        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print('DONE; read %d sequences,'
          ' %d pairs and %d singletons' %
          (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr)

    print('wrote to: %s and %s' % (out2, out1), file=sys.stderr)

def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip,
                                args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))

def normalize_by_median(input_filename, outfp, htable, args, report_fp=None):
    desired_coverage = args.cutoff
    ksize = htable.ksize()

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    index = -1
    total = 0
    discarded = 0
    for index, batch in enumerate(
            batchwise(screed.open(input_filename, parse_description=False),
                      batch_size)):
        if index > 0 and index % 100000 == 0:
            print >>sys.stderr, '... kept {kept} of {total} or'\
                ' {perc:2}%'.format(kept=total - discarded, total=total,
                                    perc=int(100. - discarded /
                                             float(total) * 100.))
            print >> sys.stderr, '... in file', input_filename

            if report_fp:
                print >> report_fp, total, total - discarded, \
                    1. - (discarded / float(total))
                report_fp.flush()

        total += batch_size

        # If in paired mode, check that the reads are properly interleaved
        if args.paired:
            if not check_is_pair(batch[0], batch[1]):
                raise IOError('Error: Improperly interleaved pairs \
{b0} {b1}'.format(b0=batch[0].name, b1=batch[1].name))

        # Emit the batch of reads if any read passes the filter
        # and all reads are longer than K
        passed_filter = False
        passed_length = True
        for record in batch:
            if len(record.sequence) < ksize:
                passed_length = False
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = htable.get_median_count(seq)

            if med < desired_coverage:
                htable.consume(seq)
                passed_filter = True

        # Emit records if any passed
        if passed_length and passed_filter:
            for record in batch:
                write_record(record, outfp)
        else:
            discarded += batch_size

    if report_fp:
        print >> report_fp, total, total - discarded, \
            1. - (discarded / float(total))
        report_fp.flush()

    return total, discarded

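# The keep/discard rule implemented by normalize_by_median() above, reduced
# to its core with an exact Counter standing in for khmer's approximate
# counting table: keep a read only while the median count of its k-mers is
# below the desired coverage, and only kept reads feed the counter. K,
# DESIRED_COVERAGE, and the toy reads here are illustrative.
from collections import Counter

K, DESIRED_COVERAGE = 4, 3
counts = Counter()

def keep(seq):
    kmers = [seq[i:i + K] for i in range(len(seq) - K + 1)]
    med = sorted(counts[k] for k in kmers)[len(kmers) // 2]
    if med < DESIRED_COVERAGE:
        counts.update(kmers)    # only kept reads feed the counter
        return True
    return False

reads = ['ACGTACGTAC'] * 6
print([keep(r) for r in reads])  # first two kept, the rest discarded
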
#!/usr/bin/env python
from __future__ import print_function
from khmer.utils import write_record
import screed
import sys

mutations = {
    0: (42681, 10),
}

for n, record in enumerate(screed.open(sys.argv[1])):
    if n in mutations:
        start, dellength = mutations[n]
        seqlength = len(record.sequence)
        piece1 = record.sequence[:start]
        piece2 = record.sequence[start + dellength:]
        record.sequence = piece1 + piece2
        print('DEBUG ', piece1[-9:], '|', piece2[:9], sep='',
              file=sys.stderr)
        assert len(record.sequence) == seqlength - dellength
    write_record(record, sys.stdout)

def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or
            '/dev/stdin' in args.input_filenames) and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0,
             bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)

def main():
    info('correct-reads.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        check_space_for_graph(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print >>sys.stderr, 'loading k-mer countgraph from', args.loadgraph
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print >>sys.stderr, 'making k-mer countgraph'
        ct = khmer.new_countgraph(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
        'use -T to change location' % tempdir

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
            % (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print >>sys.stderr, ('second pass: looking at sequences kept aside '
                             'in %s') % pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)
                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)

                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print >>sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,)
    print >>sys.stderr, 'wrote %d reads, %d bp' % (written_reads,
                                                   written_bp,)
    print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
        (save_pass2_total, n_passes)
    print >>sys.stderr, 'removed %d reads and corrected %d reads (%.2f%%)' % \
        (n_reads - written_reads, corrected_reads, percent_reads_corrected)
    print >>sys.stderr, 'removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
            (n_reads - skipped_n, percent_reads_hicov)
        print >>sys.stderr, ('skipped %d reads/%d bases because of low'
                             ' coverage') % (skipped_n, skipped_bp)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    print >>sys.stderr, 'output in *.corr'

    if args.savegraph:
        print >>sys.stderr, "Saving k-mer countgraph to", args.savegraph
        ct.save(args.savegraph)

def main(): info('split-paired-reads.py') args = get_parser().parse_args() infile = args.infile check_input_files(infile, args.force) filenames = [infile] check_space(filenames, args.force) # decide where to put output files - specific directory? or just default? if args.output_directory: if not os.path.exists(args.output_directory): os.makedirs(args.output_directory) out1 = args.output_directory + '/' + os.path.basename(infile) + '.1' out2 = args.output_directory + '/' + os.path.basename(infile) + '.2' else: out1 = os.path.basename(infile) + '.1' out2 = os.path.basename(infile) + '.2' # OVERRIDE output file locations with -1, -2 if args.output_first: out1 = args.output_first if args.output_second: out2 = args.output_second fp_out1 = open(out1, 'w') fp_out2 = open(out2, 'w') counter1 = 0 counter2 = 0 index = None screed_iter = screed.open(infile, parse_description=False) # walk through all the reads in broken-paired mode. for index, is_pair, record1, record2 in broken_paired_reader(screed_iter): if index % 100000 == 0 and index: print >> sys.stderr, '...', index # are we requiring pairs? if args.force_paired and not is_pair: print >>sys.stderr, 'ERROR, %s is not part of a pair' % \ record1.name sys.exit(1) if is_pair: write_record(record1, fp_out1) counter1 += 1 write_record(record2, fp_out2) counter2 += 1 else: name = record1.name if check_is_left(name): write_record(record1, fp_out1) counter1 += 1 elif check_is_right(name): write_record(record1, fp_out2) counter2 += 1 else: print >>sys.stderr, \ "Unrecognized format for read pair information: %s" % name print >>sys.stderr, "Exiting." sys.exit(1) print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \ (counter1 + counter2, counter1, counter2) print >> sys.stderr, "/1 reads in %s" % out1 print >> sys.stderr, "/2 reads in %s" % out2
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: print('ERROR: Duplicate filename--Cannot handle this!', file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args, 'countgraph', args.force) # load or create counting table. if args.loadtable: print('loading k-mer counting table from ' + args.loadtable, file=sys.stderr) htable = khmer.load_counting_hash(args.loadtable) else: print('making countgraph', file=sys.stderr) htable = khmer_args.create_countgraph(args) input_filename = None # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, htable) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for e in filenames: files.append([e, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: if args.single_output_file is sys.stdout: output_name = '/dev/stdout' else: output_name = args.single_output_file.name outfp = args.single_output_file # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'w') # failsafe context manager in case an input file breaks with CatchIOErrors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = screed.open(filename, parse_description=False) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in WithDiagnostics(filename, norm, reader, report_fp): if record is not None: write_record(record, outfp) print('output in ' + output_name, file=sys.stderr) if output_name != '/dev/stdout': outfp.close() # finished - print out some diagnostics. print('Total number of unique k-mers: {0}' .format(htable.n_unique_kmers()), file=sys.stderr) if args.savetable: print('...saving to ' + args.savetable, file=sys.stderr) htable.save(args.savetable) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** I/O Errors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
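# Minimal sketch of what a CatchIOErrors-style failsafe context manager has
# to do, inferred from the call sites above; the real implementation lives
# elsewhere in the codebase, so the signature and cleanup details here are
# assumptions rather than the actual code.
import sys


class CatchIOErrorsSketch(object):

    def __init__(self, filename, outfp, single_outfp, force, corrupt_files):
        self.filename = filename
        self.outfp = outfp
        self.single_outfp = single_outfp
        self.force = force
        self.corrupt_files = corrupt_files

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is not None and issubclass(exc_type, IOError):
            sys.stderr.write('** I/O error while reading %s\n' % self.filename)
            if not self.force:
                return False  # propagate the error: stop the whole run
            self.corrupt_files.append(self.filename)
            if not self.single_outfp:
                self.outfp.close()  # abandon this file's partial output
            return True  # swallow the error and move on to the next file
        return False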
def main(): args = sanitize_help(get_parser()).parse_args() configure_logging(args.quiet) check_input_files(args.datafile, args.force) check_space([args.datafile], args.force) if args.savegraph: tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, tablesize, args.force) report_on_config(args) log_info('making countgraph') graph = khmer_args.create_countgraph(args) # first, load reads into graph rparser = khmer.ReadParser(args.datafile) threads = [] log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile) for _ in range(args.threads): cur_thread = \ threading.Thread( target=graph.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(graph, args.force) log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) # the filtering loop log_info('filtering {datafile}', datafile=args.datafile) if args.outfile is None: outfile = os.path.basename(args.datafile) + '.abundfilt' else: outfile = args.outfile outfp = open(outfile, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) paired_iter = broken_paired_reader(ReadParser(args.datafile), min_length=graph.ksize(), force_single=True) for n, is_pair, read1, read2 in paired_iter: assert not is_pair assert read2 is None trimmed_record, _ = trim_record(graph, read1, args.cutoff, args.variable_coverage, args.normalize_to) if trimmed_record: write_record(trimmed_record, outfp) log_info('output in {outfile}', outfile=outfile) if args.savegraph: log_info('Saving k-mer countgraph to {graph}', graph=args.savegraph) graph.save(args.savegraph)
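# trim_record() is not defined in this excerpt and its signature varies
# between revisions (the graph-aware form above vs. the plain
# trim_record(read, trim_at) form used by the older trim-low-abund.py
# further down). A sketch of the simple form -- truncate the sequence, and
# the quality string if present, at the first low-abundance k-mer; the
# details are assumptions:
import screed


def trim_record_sketch(read, trim_at):
    fields = dict(name=read.name, sequence=read.sequence[:trim_at])
    if hasattr(read, 'quality'):
        fields['quality'] = read.quality[:trim_at]  # keep FASTQ in sync
    return screed.Record(**fields)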
def main(): parser = sanitize_help(get_parser()) args = parser.parse_args() if not args.quiet: info('trim-low-abund.py', ['streaming']) configure_logging(args.quiet) ### if len(set(args.input_filenames)) != len(args.input_filenames): log_error("Error: Cannot input the same filename multiple times.") sys.exit(1) if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \ not args.variable_coverage: log_error("Error: --trim-at-coverage/-Z given, but " "--variable-coverage/-V not specified.") sys.exit(1) if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \ not args.diginorm: log_error("Error: --diginorm-coverage given, but " "--diginorm not specified.") sys.exit(1) if args.diginorm and args.single_pass: log_error("Error: --diginorm and --single-pass are incompatible!\n" "You probably want to use normalize-by-median.py instead.") sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \ and not args.output: log_error("Accepting input from stdin; output filename must " "be provided with -o.") sys.exit(1) if args.loadgraph: log_info('loading countgraph from {graph}', graph=args.loadgraph) ct = khmer.load_countgraph(args.loadgraph) else: log_info('making countgraph') ct = khmer_args.create_countgraph(args) K = ct.ksize() tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) log_info('created temporary directory {temp};\n' 'use -T to change location', temp=tempdir) trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff, args.trim_at_coverage) if args.diginorm: trimmer.set_diginorm(args.diginorm_coverage) # ### FIRST PASS ### save_pass2_total = 0 written_bp = 0 written_reads = 0 # only create the file writer once if outfp is specified; otherwise, # create it for each file. if args.output: trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list = [] for filename in args.input_filenames: # figure out temporary filename for 2nd pass pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) pass2fp = open(pass2filename, 'w') # construct output filenames if args.output is None: # note: this will be saved in trimfp. outfp = open(os.path.basename(filename) + '.abundtrim', 'wb') # get file handle w/gzip, bzip trimfp = get_file_writer(outfp, args.gzip, args.bzip) # record all this info pass2list.append((filename, pass2filename, trimfp)) # input file stuff: get a broken_paired reader. screed_iter = screed.open(filename) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) # main loop through the file. n_start = trimmer.n_reads save_start = trimmer.n_saved watermark = REPORT_EVERY_N_READS for read in trimmer.pass1(paired_iter, pass2fp): if (trimmer.n_reads - n_start) > watermark: log_info("... {filename} {n_saved} {n_reads} {n_bp} " "{w_reads} {w_bp}", filename=filename, n_saved=trimmer.n_saved, n_reads=trimmer.n_reads, n_bp=trimmer.n_bp, w_reads=written_reads, w_bp=written_bp) watermark += REPORT_EVERY_N_READS # write out the trimmed/etc sequences that AREN'T going to be # revisited in a 2nd pass. 
write_record(read, trimfp) written_bp += len(read) written_reads += 1 pass2fp.close() log_info("{filename}: kept aside {kept} of {total} from first pass", filename=filename, kept=trimmer.n_saved - save_start, total=trimmer.n_reads - n_start) # first pass goes across all the data, so record relevant stats... n_reads = trimmer.n_reads n_bp = trimmer.n_bp n_skipped = trimmer.n_skipped bp_skipped = trimmer.bp_skipped save_pass2_total = trimmer.n_saved # ### SECOND PASS. ### # nothing should have been skipped yet! assert trimmer.n_skipped == 0 assert trimmer.bp_skipped == 0 if args.single_pass: pass2list = [] # go back through all the files again. for _, pass2filename, trimfp in pass2list: log_info('second pass: looking at sequences kept aside in {pass2}', pass2=pass2filename) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. Hence, force_single=True below. screed_iter = screed.open(pass2filename, parse_description=False) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=True) watermark = REPORT_EVERY_N_READS for read in trimmer.pass2(paired_iter): if (trimmer.n_reads - n_start) > watermark: log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}', a=trimmer.n_reads - n_start, b=pass2filename, c=trimmer.n_saved, d=trimmer.n_reads, e=trimmer.n_bp, f=written_reads, g=written_bp) watermark += REPORT_EVERY_N_READS write_record(read, trimfp) written_reads += 1 written_bp += len(read) log_info('removing {pass2}', pass2=pass2filename) os.unlink(pass2filename) # if we created our own trimfps, close 'em. if not args.output: trimfp.close() log_info('removing temp directory & contents ({temp})', temp=tempdir) shutil.rmtree(tempdir) trimmed_reads = trimmer.trimmed_reads n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp) log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp) log_info('looked at {st} reads twice ({np:.2f} passes)', st=save_pass2_total, np=n_passes) log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)', r=n_reads - written_reads, t=trimmed_reads, p=percent_reads_trimmed) log_info('trimmed or removed {p:.2f}% of bases ({bp} total)', p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads log_info('{n} reads were high coverage ({p:.2f}%);', n=n_reads - n_skipped, p=percent_reads_hicov) log_info('skipped {r} reads/{bp} bases because of low coverage', r=n_skipped, bp=bp_skipped) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) log_info('output in *.abundtrim') if args.savegraph: log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph) ct.save(args.savegraph)
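# Every pair-aware loop in these scripts consumes broken_paired_reader();
# a compact sketch of its contract, inferred from the call sites (the real
# one also enforces min_length and require_paired): it yields
# (index, is_pair, read1, read2), with read2 = None for singletons, pairing
# consecutive records whose names match. The pair-name test is an assumption.
def base_name_sketch(name):
    # assumption: strip an old-style '/1' or '/2' suffix for pair matching
    return name[:-2] if name[-2:] in ('/1', '/2') else name


def broken_paired_reader_sketch(reads, force_single=False):
    prev = None
    n = 0
    for read in reads:
        if force_single:
            yield n, False, read, None
            n += 1
        elif prev is None:
            prev = read
        elif base_name_sketch(prev.name) == base_name_sketch(read.name):
            yield n, True, prev, read  # a proper pair: emit together
            prev = None
            n += 1
        else:
            yield n, False, prev, None  # orphan: emit alone, keep looking
            prev = read
            n += 1
    if prev is not None:
        yield n, False, prev, None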
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_file_status(_, args.force) check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: sys.exit(1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' %\ args.num_reads print >>sys.stderr, 'Subsampled reads will be placed in %s' % \ output_filename print >> sys.stderr, '' else: # > 1 print >>sys.stderr, 'Subsampling %d reads, %d times,' \ % (args.num_reads, num_samples), ' using reservoir sampling.' print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \ % output_filename print >> sys.stderr, '' reads = [] for n in range(num_samples): reads.append([]) total = 0 # read through all the sequences and load/resample the reservoir for filename in args.filenames: print >> sys.stderr, 'opening', filename, 'for reading' for record in screed.open(filename, parse_description=False): total += 1 if total % 10000 == 0: print >> sys.stderr, '...', total, 'reads scanned' if total >= args.max_reads: print >>sys.stderr, 'reached upper limit of %d reads' % \ args.max_reads, '(see -M); exiting' break # collect first N reads if total <= args.num_reads: for n in range(num_samples): reads[n].append(record) else: # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, total) if guess <= args.num_reads: reads[n][guess - 1] = record # output all the subsampled reads: if len(reads) == 1: print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[0]), output_filename) if not output_file: output_file = open(output_filename, 'w') for record in reads[0]: write_record(record, output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[n]), n_filename) output_file = open(n_filename, 'w') for record in reads[n]: write_record(record, output_file)
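# Stand-alone demonstration of the reservoir-sampling update used above
# (Algorithm R): keep the first N items; after that, each new item replaces
# a kept one with probability N/total. Pure Python, no khmer required.
import random


def reservoir_sample(iterable, n, rng=random):
    reservoir = []
    for total, item in enumerate(iterable, start=1):
        if total <= n:
            reservoir.append(item)
        else:
            guess = rng.randint(1, total)  # uniform over 1..total
            if guess <= n:                 # true with probability n/total
                reservoir[guess - 1] = item
    return reservoir

# e.g. reservoir_sample(range(100000), 100) returns 100 items, each input
# item ending up in the sample with equal probability.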
def main(): # pylint: disable=too-many-locals,too-many-branches info('extract-partitions.py', ['graph']) args = get_parser().parse_args() distfilename = args.prefix + '.dist' n_unassigned = 0 for infile in args.part_filenames: check_file_status(infile, args.force) check_space(args.part_filenames, args.force) print >>sys.stderr, '---' print >>sys.stderr, 'reading partitioned files:', repr(args.part_filenames) if args.output_groups: print >>sys.stderr, 'outputting to files named "%s.groupN.fa"' % \ args.prefix print >>sys.stderr, 'min reads to keep a partition:', \ args.min_part_size print >>sys.stderr, 'max size of a group file:', args.max_size else: print >>sys.stderr, 'NOT outputting groups! Beware!' if args.output_unassigned: print >>sys.stderr, \ 'outputting unassigned reads to "%s.unassigned.fa"' % \ args.prefix print >>sys.stderr, 'partition size distribution will go to %s' \ % distfilename print >>sys.stderr, '---' # suffix = 'fa' is_fastq = False for index, read, pid in read_partition_file(args.part_filenames[0]): if hasattr(read, 'accuracy'): suffix = 'fq' is_fastq = True break for filename in args.part_filenames: for index, read, pid in read_partition_file(filename): if is_fastq: assert hasattr(read, 'accuracy'), \ "all input files must be FASTQ if the first one is" else: assert not hasattr(read, 'accuracy'), \ "all input files must be FASTA if the first one is" break if args.output_unassigned: unassigned_fp = open('%s.unassigned.%s' % (args.prefix, suffix), 'w') count = {} for filename in args.part_filenames: for index, read, pid in read_partition_file(filename): if index % 100000 == 0: print >>sys.stderr, '...', index count[pid] = count.get(pid, 0) + 1 if pid == 0: n_unassigned += 1 if args.output_unassigned: write_record(read, unassigned_fp) if args.output_unassigned: unassigned_fp.close() if 0 in count: # eliminate unpartitioned sequences del count[0] # develop histogram of partition sizes dist = {} for pid, size in count.items(): dist[size] = dist.get(size, 0) + 1 # output histogram distfp = open(distfilename, 'w') total = 0 wtotal = 0 for counter, index in sorted(dist.items()): total += index wtotal += counter * index distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal)) distfp.close() if not args.output_groups: sys.exit(0) # sort groups by size divvy = sorted(count.items(), key=lambda y: y[1]) divvy = [y for y in divvy if y[1] > args.min_part_size] # divvy up into different groups, based on having max_size sequences # in each group. total = 0 group = set() group_n = 0 group_d = {} for partition_id, n_reads in divvy: group.add(partition_id) total += n_reads if total > args.max_size: for partition_id in group: group_d[partition_id] = group_n # print 'group_d', partition_id, group_n group_n += 1 group = set() total = 0 if group: for partition_id in group: group_d[partition_id] = group_n # print 'group_d', partition_id, group_n group_n += 1 print >>sys.stderr, '%d groups' % group_n if group_n == 0: print >>sys.stderr, 'nothing to output; exiting!' return # open a bunch of output files for the different groups group_fps = {} for _ in range(group_n): group_fp = open('%s.group%04d.%s' % (args.prefix, _, suffix), 'w') group_fps[_] = group_fp # write 'em all out! 
total_seqs = 0 part_seqs = 0 toosmall_parts = 0 for filename in args.part_filenames: for index, read, partition_id in read_partition_file(filename): total_seqs += 1 if index % 100000 == 0: print >>sys.stderr, '...x2', index if partition_id == 0: continue try: group_n = group_d[partition_id] except KeyError: assert count[partition_id] <= args.min_part_size toosmall_parts += 1 continue outfp = group_fps[group_n] write_record(read, outfp) part_seqs += 1 print >>sys.stderr, '---' print >>sys.stderr, 'Of %d total seqs,' % total_seqs print >>sys.stderr, 'extracted %d partitioned seqs into group files,' % \ part_seqs print >>sys.stderr, \ 'discarded %d sequences from small partitions (see -m),' % \ toosmall_parts print >>sys.stderr, 'and found %d unpartitioned sequences (see -U).' % \ n_unassigned print >>sys.stderr, '' print >>sys.stderr, 'Created %d group files named %s.groupXXXX.%s' % \ (len(group_fps), args.prefix, suffix)
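# The grouping logic above, restated as two small stand-alone helpers for
# clarity: build the partition-size histogram written to the .dist file,
# and pack partitions (smallest first) into groups of at most max_size
# reads, dropping partitions at or below min_part_size.
def partition_histogram(count):
    dist = {}
    for size in count.values():
        dist[size] = dist.get(size, 0) + 1
    return dist  # {partition size: number of partitions of that size}


def assign_groups(count, min_part_size, max_size):
    group_d = {}
    group, total, group_n = set(), 0, 0
    for pid, n_reads in sorted(count.items(), key=lambda kv: kv[1]):
        if n_reads <= min_part_size:  # too small -- the '-m' filter
            continue
        group.add(pid)
        total += n_reads
        if total > max_size:          # close this group, start the next
            group_d.update((p, group_n) for p in group)
            group_n += 1
            group, total = set(), 0
    if group:
        group_d.update((p, group_n) for p in group)
        group_n += 1
    return group_d, group_n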
def main(): parser = get_parser() parser.epilog = parser.epilog.replace( "`reservoir sampling\n" "<http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.", "reservoir sampling algorithm. " "http://en.wikipedia.org/wiki/Reservoir_sampling") args = sanitize_help(parser).parse_args() for name in args.filenames: check_input_files(name, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be if args.output_file: output_filename = args.output_file.name if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: print( "NOTE: This can be overridden using the --force" " argument", file=sys.stderr) sys.exit(1) else: filename = args.filenames[0] if filename in ('/dev/stdin', '-'): print( "Accepting input from stdin; output filename must " "be provided with '-o'.", file=sys.stderr) sys.exit(1) output_filename = os.path.basename(filename) + '.subset' filename = args.filenames[0] if filename in ('/dev/stdin', '-'): # seqan only treats '-' as "read from stdin" filename = '-' if num_samples == 1: print('Subsampling %d reads using reservoir sampling.' % args.num_reads, file=sys.stderr) print('Subsampled reads will be placed in %s' % output_filename, file=sys.stderr) print('', file=sys.stderr) else: # > 1 print('Subsampling %d reads, %d times,' % (args.num_reads, num_samples), ' using reservoir sampling.', file=sys.stderr) print('Subsampled reads will be placed in %s.N' % output_filename, file=sys.stderr) print('', file=sys.stderr) reads = [] for _ in range(num_samples): reads.append([]) # read through all the sequences and load/resample the reservoir for filename in args.filenames: print('opening', filename, 'for reading', file=sys.stderr) for count, (_, _, rcrd1, rcrd2) in enumerate( broken_paired_reader(ReadParser(filename), force_single=args.force_single)): if count % 10000 == 0: print('...', count, 'reads scanned', file=sys.stderr) if count >= args.max_reads: print('reached upper limit of %d reads' % args.max_reads, '(see -M); exiting', file=sys.stderr) break # collect first N reads if count < args.num_reads: for sample in range(num_samples): reads[sample].append((rcrd1, rcrd2)) else: for sample in range(num_samples): assert len(reads[sample]) <= count # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, count) if guess <= args.num_reads: reads[n][guess - 1] = (rcrd1, rcrd2) # output all the subsampled reads: if len(reads) == 1: print('Writing %d sequences to %s' % (len(reads[0]), output_filename), file=sys.stderr) output_file = args.output_file if not output_file: output_file = open(output_filename, 'wb') output_file = get_file_writer(output_file, args.gzip, args.bzip) for records in reads[0]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print('Writing %d sequences to %s' % (len(reads[n]), n_filename), file=sys.stderr) output_file = get_file_writer(open(n_filename, 'wb'), args.gzip, args.bzip) for records in reads[n]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file)
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_file_status(_, args.force) check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: sys.exit(1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' % \ args.num_reads print >>sys.stderr, 'Subsampled reads will be placed in %s' % \ output_filename print >>sys.stderr, '' else: # > 1 print >>sys.stderr, 'Subsampling %d reads, %d times,' \ % (args.num_reads, num_samples), ' using reservoir sampling.' print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \ % output_filename print >>sys.stderr, '' reads = [] for n in range(num_samples): reads.append([]) total = 0 # read through all the sequences and load/resample the reservoir for filename in args.filenames: print >>sys.stderr, 'opening', filename, 'for reading' for record in screed.open(filename): total += 1 if total % 10000 == 0: print >>sys.stderr, '...', total, 'reads scanned' if total >= args.max_reads: print >>sys.stderr, 'reached upper limit of %d reads' % \ args.max_reads, '(see -M); exiting' break # collect first N reads if total <= args.num_reads: for n in range(num_samples): reads[n].append(record) else: # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, total) if guess <= args.num_reads: reads[n][guess - 1] = record # output all the subsampled reads: if len(reads) == 1: print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[0]), output_filename) if not output_file: output_file = open(output_filename, 'w') for record in reads[0]: write_record(record, output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[n]), n_filename) output_file = open(n_filename, 'w') for record in reads[n]: write_record(record, output_file)
def normalize_by_median(input_filename, outfp, htable, paired, cutoff, report_fp=None): desired_coverage = cutoff ksize = htable.ksize() # In paired mode we read two records at a time batch_size = 1 if paired: batch_size = 2 index = -1 total = 0 discarded = 0 for index, batch in enumerate(batchwise(screed.open( input_filename, parse_description=False), batch_size)): if index > 0 and index % 100000 == 0: print >>sys.stderr, '... kept {kept} of {total} or'\ ' {perc:2}%'.format(kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.)) print >>sys.stderr, '... in file', input_filename if report_fp: print >> report_fp, total, total - discarded, \ 1. - (discarded / float(total)) report_fp.flush() total += batch_size # If in paired mode, check that the reads are properly interleaved if paired: if not check_is_pair(batch[0], batch[1]): raise IOError('Error: Improperly interleaved pairs \ {b0} {b1}'.format(b0=batch[0].name, b1=batch[1].name)) # Emit the batch of reads if any read passes the filter # and all reads are longer than K passed_filter = False passed_length = True for record in batch: if len(record.sequence) < ksize: passed_length = False continue seq = record.sequence.replace('N', 'A') med, _, _ = htable.get_median_count(seq) if med < desired_coverage: htable.consume(seq) passed_filter = True # Emit records if any passed if passed_length and passed_filter: for record in batch: write_record(record, outfp) else: discarded += batch_size if report_fp: print >> report_fp, total, total - discarded, \ 1. - (discarded / float(total)) report_fp.flush() return total, discarded
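# batchwise() is not defined in this excerpt; a minimal equivalent that
# yields non-overlapping tuples of `size` consecutive records. Paired mode
# above uses size=2, so interleaved mates always travel together:
def batchwise_sketch(iterable, size):
    it = iter(iterable)
    return zip(*[it] * size)

# e.g. list(batchwise_sketch('abcdef', 2)) -> [('a','b'), ('c','d'), ('e','f')]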
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: print('ERROR: Duplicate filename--Cannot handle this!', file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) # load or create counting table. if args.loadtable: print('loading k-mer counting table from ' + args.loadtable, file=sys.stderr) htable = khmer.load_counting_hash(args.loadtable) else: print('making k-mer counting table', file=sys.stderr) htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) input_filename = None # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, htable) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for e in filenames: files.append([e, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: if args.single_output_file is sys.stdout: output_name = '/dev/stdout' else: output_name = args.single_output_file.name outfp = args.single_output_file # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'w') # failsafe context manager in case an input file breaks with CatchIOErrors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = screed.open(filename, parse_description=False) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in WithDiagnostics(filename, norm, reader, report_fp): if record is not None: write_record(record, outfp) print('output in ' + output_name, file=sys.stderr) if output_name != '/dev/stdout': outfp.close() # finished - print out some diagnostics. print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()), file=sys.stderr) if args.savetable: print('...saving to ' + args.savetable, file=sys.stderr) htable.save(args.savetable) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** IOErrors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
def main(): info('sweep-reads-buffered.py', ['sweep']) parser = get_parser() args = parser.parse_args() if args.min_tablesize < MIN_HSIZE: args.min_tablesize = MIN_HSIZE if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE report_on_config(args, hashtype='hashbits') K = args.ksize HT_SIZE = args.min_tablesize N_HT = args.n_tables traversal_range = args.traversal_range input_fastp = args.input_fastp if not args.outdir: outdir = os.path.dirname(input_fastp) else: outdir = args.outdir max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size max_reads = args.max_reads check_input_files(args.input_fastp, args.force) check_valid_file_exists(args.input_files) all_input_files = [input_fastp] all_input_files.extend(args.input_files) # Check disk space availability check_space(all_input_files, args.force) # figure out input file type (FA/FQ) -- based on first file ix = iter(screed.open(args.input_files[0])) record = ix.next() del ix extension = 'fa' if hasattr(record, 'quality'): # fastq! extension = 'fq' output_buffer = ReadBufferManager( max_buffers, max_reads, buf_size, output_pref, outdir, extension) # consume the partitioned fasta with which to label the graph ht = khmer.LabelHash(K, HT_SIZE, N_HT) try: print >>sys.stderr, 'consuming input sequences...' if args.label_by_pid: print >>sys.stderr, '...labeling by partition id (pid)' ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) elif args.label_by_seq: print >>sys.stderr, '...labeling by sequence' for n, record in enumerate(screed.open(input_fastp)): if n % 50000 == 0: print >>sys.stderr, \ '...consumed {n} sequences...'.format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, n) else: print >>sys.stderr, \ '...labeling to create groups of size {s}'.format( s=args.group_size) label = -1 g = 0 try: outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref, g=g, ext=extension ), 'wb') for n, record in enumerate(screed.open(input_fastp)): if n % args.group_size == 0: label += 1 if label > g: g = label outfp = open('{pref}_base_{g}.{ext}'.format( pref=output_pref, g=g, ext=extension), 'wb') if n % 50000 == 0: print >>sys.stderr, \ '...consumed {n} sequences...'.format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, label) write_record(record, outfp) except IOError as e: print >>sys.stderr, '!! ERROR !!', e print >>sys.stderr, '...error splitting input. exiting...' except IOError as e: print >>sys.stderr, '!! ERROR: !!', e print >>sys.stderr, '...error consuming \ {i}. exiting...'.format(i=input_fastp) print >>sys.stderr, 'done consuming input sequence. \ added {t} tags and {l} \ labels...'.format(t=ht.n_tags(), l=ht.n_labels()) label_dict = defaultdict(int) label_number_dist = [] n_orphaned = 0 n_labeled = 0 n_mlabeled = 0 total_t = time.clock() start_t = time.clock() for read_file in args.input_files: print >>sys.stderr, '** sweeping {read_file} for labels...'.format( read_file=read_file) file_t = 0.0 try: read_fp = screed.open(read_file) except IOError as error: print >>sys.stderr, '!! 
ERROR: !!', error print >>sys.stderr, '*** Could not open {fn}, skipping...'.format( fn=read_file) else: for _, record in enumerate(read_fp): if _ % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \ {no} orphaned] \ ** {sec}s ({sect}s total)' \ .format(n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t) start_t = time.clock() seq = record.sequence name = record.name try: labels = ht.sweep_label_neighborhood(seq, traversal_range) except ValueError as e: pass else: if hasattr(record, 'quality'): seq_str = fmt_fastq(name, seq, record.quality, labels) else: seq_str = fmt_fasta(name, seq, labels) label_number_dist.append(len(labels)) if labels: n_labeled += 1 if len(labels) > 1: output_buffer.queue(seq_str, 'multi') n_mlabeled += 1 label_dict['multi'] += 1 else: output_buffer.queue(seq_str, labels[0]) label_dict[labels[0]] += 1 else: n_orphaned += 1 output_buffer.queue(seq_str, 'orphaned') label_dict['orphaned'] += 1 print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! print >>sys.stderr, '** End of run...' output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print >>sys.stderr, '! WARNING: Sweep finished with errors !' print >>sys.stderr, '** {writee} reads not written'.format( writee=output_buffer.num_write_errors) print >>sys.stderr, '** {filee} errors opening files'.format( filee=output_buffer.num_file_errors) print >>sys.stderr, 'swept {n_reads} for labels...'.format( n_reads=n_labeled + n_orphaned) print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format( nc=n_labeled, no=n_orphaned) print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled) print >>sys.stderr, '** outputting label number distribution...' fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref)) with open(fn, 'wb') as outfp: for nc in label_number_dist: outfp.write('{nc}\n'.format(nc=nc)) fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref)) print >>sys.stderr, '** outputting label read counts...' with open(fn, 'wb') as outfp: for k in label_dict: outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
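# fmt_fasta()/fmt_fastq() are called above but defined elsewhere; a
# plausible minimal form that carries the swept labels in the header line.
# The exact label encoding is an assumption, not the project's format:
def fmt_fasta_sketch(name, seq, labels):
    return '>%s\t%s\n%s\n' % (name, ','.join(str(l) for l in labels), seq)


def fmt_fastq_sketch(name, seq, quality, labels):
    return '@%s\t%s\n%s\n+\n%s\n' % (
        name, ','.join(str(l) for l in labels), seq, quality)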
def main(): # pylint: disable=too-many-branches,too-many-statements parser = sanitize_help(get_parser()) args = parser.parse_args() configure_logging(args.quiet) report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: log_error('ERROR: Duplicate filename--Cannot handle this!') log_error('** Exiting!') sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph is not None: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) # load or create counting table. if args.loadgraph: log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph) countgraph = Countgraph.load(args.loadgraph) else: log_info('making countgraph') countgraph = khmer_args.create_countgraph(args) # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, countgraph) with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip) else: if '-' in filenames or '/dev/stdin' in filenames: print("Accepting input from stdin; output filename must " "be provided with '-o'.", file=sys.stderr) sys.exit(1) # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) # failsafe context manager in case an input file breaks with catch_io_errors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = clean_input_reads(screed.open(filename)) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in with_diagnostics(reader, filename): if record is not None: write_record(record, outfp) log_info('output in {name}', name=describe_file_handle(outfp)) if not args.single_output_file: outfp.close() # finished - print out some diagnostics. 
log_info('Total number of unique k-mers: {umers}', umers=countgraph.n_unique_kmers()) if args.savegraph is not None: log_info('...saving to {name}', name=args.savegraph) countgraph.save(args.savegraph) fp_rate = \ khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) if args.force and len(corrupt_files) > 0: log_error("** WARNING: Finished with errors!") log_error("** I/O Errors occurred in the following files:") log_error("\t" + " ".join(corrupt_files))
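# Rough sketch of the false-positive estimate reported above: for a
# Bloom-filter-style table the collision rate is roughly the occupancy
# (fraction of slots in use), and with Z independent tables the rates
# multiply (see Zhang et al., http://arxiv.org/abs/1309.2975). The formula
# below is an assumption for intuition, not khmer's exact computation:
def expected_collisions_sketch(n_occupied, tablesize, n_tables):
    occupancy = float(n_occupied) / tablesize
    return occupancy ** n_tables

# e.g. expected_collisions_sketch(5e8, 1e9, 4) -> 0.0625 (half full, 4 tables)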
def main(): info('split-paired-reads.py') args = sanitize_help(get_parser()).parse_args() infile = args.infile filenames = [infile] check_input_files(infile, args.force) check_space(filenames, args.force) basename = os.path.basename(infile) # decide where to put output files - specific directory? or just default? if infile in ('/dev/stdin', '-'): if not (args.output_first and args.output_second): print( "Accepting input from stdin; " "output filenames must be provided.", file=sys.stderr) sys.exit(1) elif args.output_directory: if not os.path.exists(args.output_directory): os.makedirs(args.output_directory) out1 = os.path.join(args.output_directory, basename + '.1') out2 = os.path.join(args.output_directory, basename + '.2') else: out1 = basename + '.1' out2 = basename + '.2' # OVERRIDE output file locations with -1, -2 if args.output_first: fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip) out1 = fp_out1.name else: # Use default filename created above fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip) if args.output_second: fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip) out2 = fp_out2.name else: # Use default filename created above fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip) # put orphaned reads here, if -0! if args.output_orphaned: fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip) out0 = describe_file_handle(args.output_orphaned) counter1 = 0 counter2 = 0 counter3 = 0 index = None screed_iter = screed.open(infile) # walk through all the reads in broken-paired mode. paired_iter = broken_paired_reader(screed_iter, require_paired=not args.output_orphaned) try: for index, is_pair, record1, record2 in paired_iter: if index % 10000 == 0: print('...', index, file=sys.stderr) if is_pair: write_record(record1, fp_out1) counter1 += 1 write_record(record2, fp_out2) counter2 += 1 elif args.output_orphaned: write_record(record1, fp_out0) counter3 += 1 except UnpairedReadsError as e: print("Unpaired reads found starting at {name}; exiting".format( name=e.read1.name), file=sys.stderr) sys.exit(1) print("DONE; split %d sequences (%d left, %d right, %d orphans)" % (counter1 + counter2, counter1, counter2, counter3), file=sys.stderr) print("/1 reads in %s" % out1, file=sys.stderr) print("/2 reads in %s" % out2, file=sys.stderr) if args.output_orphaned: print("orphans in %s" % out0, file=sys.stderr)
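# check_is_left()/check_is_right() classify orphaned reads by name; a sketch
# covering the two conventions these scripts juggle -- old-style '/1','/2'
# suffixes and Casava 1.8 '1:N:...','2:N:...' fields. Edge-case handling in
# the real helpers is assumed, not copied:
def check_is_left_sketch(name):
    return name.endswith('/1') or ' 1:' in name


def check_is_right_sketch(name):
    return name.endswith('/2') or ' 2:' in name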
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_input_files(_, args.force) check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: sys.exit(1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print('Subsampling %d reads using reservoir sampling.' % args.num_reads, file=sys.stderr) print('Subsampled reads will be placed in %s' % output_filename, file=sys.stderr) print('', file=sys.stderr) else: # > 1 print('Subsampling %d reads, %d times,' % (args.num_reads, num_samples), ' using reservoir sampling.', file=sys.stderr) print('Subsampled reads will be placed in %s.N' % output_filename, file=sys.stderr) print('', file=sys.stderr) reads = [] for n in range(num_samples): reads.append([]) # read through all the sequences and load/resample the reservoir for filename in args.filenames: print('opening', filename, 'for reading', file=sys.stderr) screed_iter = screed.open(filename, parse_description=False) for count, (_, ispair, rcrd1, rcrd2) in enumerate(broken_paired_reader( screed_iter, force_single=args.force_single)): if count % 10000 == 0: print('...', count, 'reads scanned', file=sys.stderr) if count >= args.max_reads: print('reached upper limit of %d reads' % args.max_reads, '(see -M); exiting', file=sys.stderr) break # collect first N reads if count < args.num_reads: for n in range(num_samples): reads[n].append((rcrd1, rcrd2)) else: for n in range(num_samples): assert len(reads[n]) <= count # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, count) if guess <= args.num_reads: reads[n][guess - 1] = (rcrd1, rcrd2) # output all the subsampled reads: if len(reads) == 1: print('Writing %d sequences to %s' % (len(reads[0]), output_filename), file=sys.stderr) if not output_file: output_file = open(output_filename, 'w') for records in reads[0]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print('Writing %d sequences to %s' % (len(reads[n]), n_filename), file=sys.stderr) output_file = open(n_filename, 'w') for records in reads[n]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file)
def main(): info('trim-low-abund.py', ['streaming']) parser = get_parser() args = parser.parse_args() ### if len(set(args.input_filenames)) != len(args.input_filenames): print >>sys.stderr, \ "Error: Cannot input the same filename multiple times." sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) K = args.ksize CUTOFF = args.cutoff NORMALIZE_LIMIT = args.normalize_to if args.loadtable: print >> sys.stderr, 'loading k-mer counting table from', args.loadtable ct = khmer.load_counting_hash(args.loadtable) else: print >> sys.stderr, 'making k-mer counting table' ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print >>sys.stderr, 'created temporary directory %s; ' \ 'use -T to change location' % tempdir # ### FIRST PASS ### save_pass2_total = 0 n_bp = 0 n_reads = 0 written_bp = 0 written_reads = 0 trimmed_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) if args.out is None: trimfp = open(os.path.basename(filename) + '.abundtrim', 'w') else: trimfp = args.out pass2list.append((filename, pass2filename, trimfp)) screed_iter = screed.open(filename, parse_description=False) pass2fp = open(pass2filename, 'w') save_pass2 = 0 n = 0 paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) for n, is_pair, read1, read2 in paired_iter: if n % 10000 == 0: print >>sys.stderr, '...', n, filename, save_pass2, \ n_reads, n_bp, written_reads, written_bp # we want to track paired reads here, to make sure that pairs # are not split between first pass and second pass. if is_pair: n_reads += 2 n_bp += len(read1.sequence) + len(read2.sequence) seq1 = read1.sequence.replace('N', 'A') seq2 = read2.sequence.replace('N', 'A') med1, _, _ = ct.get_median_count(seq1) med2, _, _ = ct.get_median_count(seq2) if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT: ct.consume(seq1) ct.consume(seq2) write_record_pair(read1, read2, pass2fp) save_pass2 += 2 else: _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF) _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF) if trim_at1 >= K: read1 = trim_record(read1, trim_at1) if trim_at2 >= K: read2 = trim_record(read2, trim_at2) if trim_at1 != len(seq1): trimmed_reads += 1 if trim_at2 != len(seq2): trimmed_reads += 1 write_record_pair(read1, read2, trimfp) written_reads += 2 written_bp += trim_at1 + trim_at2 else: n_reads += 1 n_bp += len(read1.sequence) seq = read1.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # has this portion of the graph saturated? if not, # consume & save => pass2. if med < NORMALIZE_LIMIT: ct.consume(seq) write_record(read1, pass2fp) save_pass2 += 1 else: # trim!! _, trim_at = ct.trim_on_abundance(seq, CUTOFF) if trim_at >= K: new_read = trim_record(read1, trim_at) write_record(new_read, trimfp) written_reads += 1 written_bp += trim_at if trim_at != len(read1.sequence): trimmed_reads += 1 pass2fp.close() print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \ % (filename, save_pass2, n, filename) save_pass2_total += save_pass2 # ### SECOND PASS. 
### skipped_n = 0 skipped_bp = 0 for _, pass2filename, trimfp in pass2list: print >> sys.stderr, ('second pass: looking at sequences kept aside ' 'in %s') % pass2filename # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. for n, read in enumerate( screed.open(pass2filename, parse_description=False)): if n % 10000 == 0: print >>sys.stderr, '... x 2', n, pass2filename, \ written_reads, written_bp seq = read.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # do we retain low-abundance components unchanged? if med < NORMALIZE_LIMIT and args.variable_coverage: write_record(read, trimfp) written_reads += 1 written_bp += len(read.sequence) skipped_n += 1 skipped_bp += len(read.sequence) # otherwise, examine/trim/truncate. else: # med >= NORMALIZE_LIMIT or not args.variable_coverage _, trim_at = ct.trim_on_abundance(seq, CUTOFF) if trim_at >= K: new_read = trim_record(read, trim_at) write_record(new_read, trimfp) written_reads += 1 written_bp += trim_at if trim_at != len(read.sequence): trimmed_reads += 1 print >> sys.stderr, 'removing %s' % pass2filename os.unlink(pass2filename) print >> sys.stderr, 'removing temp directory & contents (%s)' % tempdir shutil.rmtree(tempdir) n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 print >> sys.stderr, 'read %d reads, %d bp' % ( n_reads, n_bp, ) print >> sys.stderr, 'wrote %d reads, %d bp' % ( written_reads, written_bp, ) print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \ (save_pass2_total, n_passes) print >>sys.stderr, 'removed %d reads and trimmed %d reads (%.2f%%)' % \ (n_reads - written_reads, trimmed_reads, percent_reads_trimmed) print >>sys.stderr, 'trimmed or removed %.2f%% of bases (%d total)' % \ ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \ (n_reads - skipped_n, percent_reads_hicov) print >> sys.stderr, ('skipped %d reads/%d bases because of low ' 'coverage') % (skipped_n, skipped_bp) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print >>sys.stderr, \ 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) print >> sys.stderr, 'output in *.abundtrim' if args.savetable: print >> sys.stderr, "Saving k-mer counting table to", args.savetable ct.save(args.savetable)
def main(): # pylint: disable=too-many-branches,too-many-statements start_time = time.time() parser = sanitize_help(get_parser()) args = parser.parse_args() configure_logging(args.quiet) report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: log_error('ERROR: Duplicate filename--Cannot handle this!') log_error('** Exiting!') sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph is not None: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) # load or create counting table. if args.loadgraph: log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph) countgraph1 = Countgraph.load(args.loadgraph) # load second counting table. if args.loadgraph2: log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph2) countgraph2 = Countgraph.load(args.loadgraph2) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) screed_iter = clean_input_reads(screed.open(filename)) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for _, is_paired, read0, read1 in reader: for record in snarf(is_paired, read0, read1, countgraph1, countgraph2): if record is not None: write_record(record, outfp) print("--- %s seconds ---" % (time.time() - start_time))
def main(): info('split-paired-reads.py') args = get_parser().parse_args() infile = args.infile check_input_files(infile, args.force) filenames = [infile] check_space(filenames, args.force) # decide where to put output files - specific directory? or just default? if args.output_directory: if not os.path.exists(args.output_directory): os.makedirs(args.output_directory) out1 = args.output_directory + '/' + os.path.basename(infile) + '.1' out2 = args.output_directory + '/' + os.path.basename(infile) + '.2' else: out1 = os.path.basename(infile) + '.1' out2 = os.path.basename(infile) + '.2' # OVERRIDE output file locations with -1, -2 if args.output_first: out1 = args.output_first if args.output_second: out2 = args.output_second fp_out1 = open(out1, 'w') fp_out2 = open(out2, 'w') counter1 = 0 counter2 = 0 index = None screed_iter = screed.open(infile, parse_description=False) # walk through all the reads in broken-paired mode. for index, is_pair, record1, record2 in broken_paired_reader(screed_iter): if index % 100000 == 0 and index: print >> sys.stderr, '...', index # are we requiring pairs? if args.force_paired and not is_pair: print >>sys.stderr, 'ERROR, %s is not part of a pair' % \ record1.name sys.exit(1) if is_pair: write_record(record1, fp_out1) counter1 += 1 write_record(record2, fp_out2) counter2 += 1 else: name = record1.name if check_is_left(name): write_record(record1, fp_out1) counter1 += 1 elif check_is_right(name): write_record(record1, fp_out2) counter2 += 1 else: print >>sys.stderr, \ "Unrecognized format for read pair information: %s" % name print >> sys.stderr, "Exiting." sys.exit(1) print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \ (counter1 + counter2, counter1, counter2) print >> sys.stderr, "/1 reads in %s" % out1 print >> sys.stderr, "/2 reads in %s" % out2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar='unpaired_reads_filename',
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-f', '--force', dest='force', action='store_true',
                        help='continue past file reading errors')
    args = parser.parse_args()

    force_single = args.force_single

    # check that input files exist
    check_valid_file_exists(args.input_filenames)

    filenames = list(args.input_filenames)

    # make a list of all filenames and whether they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    # create a khmer Nodetable, used solely for its k-mer hashing
    kh = khmer.Nodetable(args.ksize, 1, 1)

    # load database
    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))

    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = \
            pickle.load(fp)
    mphf = bbhash.load_mphf(mphf_filename)

    print('done!')

    def get_kmer_to_family_ids(hashval):
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()
        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            return set()
        cdbg_id = mphf_to_cdbg[mphf_hash]
        id_list = cdbg_to_family_id[cdbg_id]
        return id_list

    def readFusion(read):
        global n_unmatched, n_same, n_amb_same, n_clear_fusion, \
            n_ambig_fusion, n_multi_fusion

        flag = None
        lf_ids = set()
        rt_ids = set()
        families = []
        shared_kmers = []
        gaps = []

        hashvals = kh.get_kmer_hashes(read.sequence)

        # find a matching k-mer at the beginning of the read
        lf = hashvals[0]
        lf_ids = get_kmer_to_family_ids(lf)
        idx = 1
        while idx < len(hashvals) and len(lf_ids) == 0:
            lf = hashvals[idx]
            lf_ids = get_kmer_to_family_ids(lf)
            idx += 1

        if len(lf_ids) == 0:
            # no k-mer in the read matched the database at all
            n_unmatched += 1
            flag = "unmatched"
        elif idx == len(hashvals):
            # same reference; only the last k-mer matched
            families.append(lf_ids)
            if len(lf_ids) == 1:
                n_same += 1
                flag = "unique"
            else:
                n_amb_same += 1
                flag = "ambiguous"
        else:  # len(lf_ids) > 0 and idx < len(hashvals)
            # find a matching k-mer at the end of the read
            rt = hashvals[-1]
            rt_ids = get_kmer_to_family_ids(rt)
            idy = len(hashvals) - 2
            while idy >= idx and len(rt_ids) == 0:
                rt = hashvals[idy]
                rt_ids = get_kmer_to_family_ids(rt)
                idy -= 1

            if len(rt_ids) == 0:
                # same reference; only one non-last k-mer matched
                families.append(lf_ids)
                if len(lf_ids) == 1:
                    n_same += 1
                    flag = "unique"
                else:
                    n_amb_same += 1
                    flag = "ambiguous"
            else:
                intersect_ids = lf_ids.intersection(rt_ids)
                if len(intersect_ids) > 0:
                    families.append(intersect_ids)
                    if len(intersect_ids) == 1:
                        n_same += 1
                        flag = "unique"
                    else:
                        n_amb_same += 1
                        flag = "ambiguous"
                else:
                    # fusion to be resolved: walk the interior k-mers
                    shared_kmer = 1
                    gap_size = 0
                    while idx <= idy + 1:
                        temp = hashvals[idx]
                        temp_ids = get_kmer_to_family_ids(temp)
                        if len(temp_ids) > 0:
                            intersect_ids = lf_ids.intersection(temp_ids)
                            if len(intersect_ids) > 0:
                                lf_ids = intersect_ids
                                shared_kmer += 1
                                gap_size = 0
                            else:  # len(intersect_ids) == 0
                                families.append(lf_ids)
                                shared_kmers.append(shared_kmer)
                                lf_ids = temp_ids
                                shared_kmer = 1
                                gaps.append(gap_size)
                                gap_size = 0
                        else:
                            gap_size += 1
                        idx += 1
                    families.append(lf_ids)
                    shared_kmers.append(shared_kmer)

                    assert len(families) > 1
                    if len(families) == 2:
                        if len(families[0]) == 1 and len(families[1]) == 1:
                            n_clear_fusion += 1
                            flag = "clear_fusion"
                        else:
                            n_ambig_fusion += 1
                            flag = "ambig_fusion"
                    else:  # len(families) > 2
                        n_multi_fusion += 1
                        flag = "multi_fusion"

        return flag, families, shared_kmers, gaps

    # initialize the module-level counters readFusion updates via `global`
    # (the original excerpt never initialized them before first use)
    global n_unmatched, n_same, n_amb_same, n_clear_fusion, \
        n_ambig_fusion, n_multi_fusion
    n_unmatched = n_same = n_amb_same = 0
    n_clear_fusion = n_ambig_fusion = n_multi_fusion = 0

    fusion_filename = args.database + '_fusion.fa'
    fusion_fp = open(fusion_filename, 'w')
    fusionInfo_filename = args.database + '_fusion.info'
    fusionInfo_fp = open(fusionInfo_filename, 'w')
    print("fileName", "recordIndex", "whichInPair", "align_class",
          "gene_families", "shared_kmers", "gaps",
          file=fusionInfo_fp, sep='\t')
    fusionCalc_filename = args.database + '_fusion.calc'
    fusionCalc_fp = open(fusionCalc_filename, 'w')
    print("fileName", "recordIndex", "whichInPair", "align_class",
          "family_A", "family_B", "no_families", "len_families",
          "shared_kmers", "gaps", "sorted_keys",
          file=fusionCalc_fp, sep='\t')

    fusionPairs_filename = args.database + '_fusionPairs.fa'
    fusPair_fp = open(fusionPairs_filename, 'w')
    fusionPairsInfo_filename = args.database + '_fusionPairs.info'
    fusPairInfo_fp = open(fusionPairsInfo_filename, 'w')
    print("fileName", "recordIndex", "fusion_class", "R1_family",
          "R2_family", file=fusPairInfo_fp, sep='\t')
    fusionPairsCalc_filename = args.database + '_fusionPairs.calc'
    fusPairCalc_fp = open(fusionPairsCalc_filename, 'w')
    print("fileName", "recordIndex", "fusion_class", "family_A", "family_B",
          "len_families", "sorted_keys", file=fusPairCalc_fp, sep='\t')

    corrupt_files = []
    family_names = dict(zip(family_ids.values(), family_ids.keys()))
    n = 0
    n_paired_fusion = 0
    sameRef = ("unique", "ambiguous")
    fusion = ("clear_fusion", "ambig_fusion", "multi_fusion")

    for filename, require_paired in files:
        with catch_io_errors(filename, fusion_fp, fusionInfo_fp,
                             fusionCalc_fp, fusPair_fp, fusPairInfo_fp,
                             fusPairCalc_fp, args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            for r_index, is_paired, read0, read1 in reader:
                n += 1
                if n % 10000 == 0:
                    print('...', n)

                flag0, families0, shared_kmers0, gaps0 = readFusion(read0)

                if not is_paired and flag0 in fusion:
                    print(filename, r_index, "single", flag0, families0,
                          shared_kmers0, gaps0, file=fusionInfo_fp, sep='\t')
                    write_record(read0, fusion_fp)

                    i = len(families0) - 1
                    for g1 in families0[0]:
                        g1_name = family_names[g1]
                        for g2 in families0[i]:
                            g2_name = family_names[g2]
                            print(filename, r_index, "single", flag0,
                                  '{}:{}'.format(g1, g1_name),
                                  '{}:{}'.format(g2, g2_name),
                                  len(families0),
                                  [len(f) for f in families0],
                                  shared_kmers0, gaps0, sorted([g1, g2]),
                                  file=fusionCalc_fp, sep='\t')

                if is_paired:
                    flag1, families1, shared_kmers1, gaps1 = readFusion(read1)

                    if flag0 in fusion or flag1 in fusion:
                        print(filename, r_index, "Read_1", flag0, families0,
                              shared_kmers0, gaps0, file=fusionInfo_fp,
                              sep='\t')
                        write_record(read0, fusion_fp)

                        print(filename, r_index, "Read_2", flag1, families1,
                              shared_kmers1, gaps1, file=fusionInfo_fp,
                              sep='\t')
                        write_record(read1, fusion_fp)

                        if flag0 in fusion:
                            i = len(families0) - 1
                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families0[i]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, "Read_1", flag0,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families0),
                                          [len(f) for f in families0],
                                          shared_kmers0, gaps0,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp, sep='\t')

                        if flag1 in fusion:
                            i = len(families1) - 1
                            for g1 in families1[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[i]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, "Read_2", flag1,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families1),
                                          [len(f) for f in families1],
                                          shared_kmers1, gaps1,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp, sep='\t')

                    elif flag0 in sameRef and flag1 in sameRef:
                        if len(families0[0].intersection(families1[0])) == 0:
                            n_paired_fusion += 1

                            if flag0 == "unique" and flag1 == "unique":
                                fusion_class = "clear_fusion"
                            else:
                                fusion_class = "ambig_fusion"

                            print(filename, r_index, fusion_class, families0,
                                  families1, file=fusPairInfo_fp, sep='\t')
                            write_record(read0, fusPair_fp)
                            write_record(read1, fusPair_fp)

                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[0]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, fusion_class,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          [len(f) for f in
                                           (families0[0], families1[0])],
                                          sorted([g1, g2]),
                                          file=fusPairCalc_fp, sep='\t')

    print('No of input fragments: ', n)
    print('unmatched:', n_unmatched)
    print('Unique:', n_same)
    print('Ambiguous:', n_amb_same)
    print('Single read clear fusion:', n_clear_fusion)
    print('Single read ambiguous fusion:', n_ambig_fusion)
    print('Single read multi fusion:', n_multi_fusion)
    print('paired read fusion:', n_paired_fusion)
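# Why get_kmer_to_family_ids() compares the stored k-mer hash against the
# query: a minimal perfect hash function maps *any* input to some slot, so a
# lookup alone cannot prove membership. The sketch below is an illustrative
# standalone version of that guard, reusing the 'mphf' and 'mphf_to_kmer'
# names from the script above.
def mphf_contains(mphf, mphf_to_kmer, hashval):
    slot = mphf.lookup(hashval)
    if slot is None:  # bbhash can report an outright miss
        return False
    # a foreign key can still land in a valid slot; confirm the stored
    # key matches the query before trusting the slot's payload
    return mphf_to_kmer[slot] == hashval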
def main(): info("sweep-reads-buffered.py", ["sweep"]) parser = sanitize_epilog(get_parser()) args = parser.parse_args() if args.max_tablesize < MAX_HSIZE: args.max_tablesize = MAX_HSIZE if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE report_on_config(args, graphtype="nodegraph") K = args.ksize HT_SIZE = args.max_tablesize N_HT = args.n_tables traversal_range = args.traversal_range input_fastp = args.input_fastp if not args.outdir: outdir = os.path.dirname(input_fastp) else: outdir = args.outdir max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size max_reads = args.max_reads check_input_files(args.input_fastp, args.force) check_valid_file_exists(args.input_files) all_input_files = [input_fastp] all_input_files.extend(args.input_files) # Check disk space availability check_space(all_input_files, args.force) # figure out input file type (FA/FQ) -- based on first file ix = iter(screed.open(args.input_files[0])) record = next(ix) del ix extension = "fa" if hasattr(record, "quality"): # fastq! extension = "fq" output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir, extension) # consume the partitioned fasta with which to label the graph ht = khmer.GraphLabels(K, HT_SIZE, N_HT) try: print("consuming input sequences...", file=sys.stderr) if args.label_by_pid: print("...labeling by partition id (pid)", file=sys.stderr) ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) elif args.label_by_seq: print("...labeling by sequence", file=sys.stderr) for n, record in enumerate(screed.open(input_fastp)): if n % 50000 == 0: print("...consumed {n} sequences...".format(n=n), file=sys.stderr) ht.consume_sequence_and_tag_with_labels(record.sequence, n) else: print("...labeling to create groups of size {s}".format(s=args.group_size), file=sys.stderr) label = -1 g = 0 try: outfp = open("{pref}_base_{g}.{ext}".format(pref=output_pref, g=g, ext=extension), "wb") for n, record in enumerate(screed.open(input_fastp)): if n % args.group_size == 0: label += 1 if label > g: g = label outfp = open("{pref}_base_{g}.{ext}".format(pref=output_pref, g=g, ext=extension), "wb") if n % 50000 == 0: print("...consumed {n} sequences...".format(n=n), file=sys.stderr) ht.consume_sequence_and_tag_with_labels(record.sequence, label) write_record(record, outfp) except (IOError, OSError) as e: print("!! ERROR !!", e, file=sys.stderr) print("...error splitting input. exiting...", file=sys.stderr) except (IOError, OSError) as e: print("!! ERROR: !!", e, file=sys.stderr) print( "...error consuming \ {i}. exiting...".format( i=input_fastp ), file=sys.stderr, ) print( "done consuming input sequence. \ added {t} tags and {l} \ labels...".format( t=ht.graph.n_tags(), l=ht.n_labels() ) ) label_dict = defaultdict(int) label_number_dist = [] n_orphaned = 0 n_labeled = 0 n_mlabeled = 0 total_t = time.clock() start_t = time.clock() for read_file in args.input_files: print("** sweeping {read_file} for labels...".format(read_file=read_file), file=sys.stderr) file_t = 0.0 try: read_fp = screed.open(read_file) except (IOError, OSError) as error: print("!! 
ERROR: !!", error, file=sys.stderr) print("*** Could not open {fn}, skipping...".format(fn=read_file), file=sys.stderr) else: for _, record in enumerate(read_fp): if _ % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t print( "\tswept {n} reads [{nc} labeled, \ {no} orphaned] \ ** {sec}s ({sect}s total)".format( n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t ), file=sys.stderr, ) start_t = time.clock() seq = record.sequence name = record.name try: labels = ht.sweep_label_neighborhood(seq, traversal_range) except ValueError as e: pass else: if hasattr(record, "quality"): seq_str = fmt_fastq(name, seq, record.quality, labels) else: seq_str = fmt_fasta(name, seq, labels) label_number_dist.append(len(labels)) if labels: n_labeled += 1 if len(labels) > 1: output_buffer.queue(seq_str, "multi") n_mlabeled += 1 label_dict["multi"] += 1 else: output_buffer.queue(seq_str, labels[0]) label_dict[labels[0]] += 1 else: n_orphaned += 1 output_buffer.queue(seq_str, "orphaned") label_dict["orphaned"] += 1 print("** End of file {fn}...".format(fn=read_file), file=sys.stderr) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! print("** End of run...", file=sys.stderr) output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print("! WARNING: Sweep finished with errors !", file=sys.stderr) print("** {writee} reads not written".format(writee=output_buffer.num_write_errors), file=sys.stderr) print("** {filee} errors opening files".format(filee=output_buffer.num_file_errors), file=sys.stderr) print("swept {n_reads} for labels...".format(n_reads=n_labeled + n_orphaned), file=sys.stderr) print("...with {nc} labeled and {no} orphaned".format(nc=n_labeled, no=n_orphaned), file=sys.stderr) print("...and {nmc} multilabeled".format(nmc=n_mlabeled), file=sys.stderr) print("** outputting label number distribution...", file=sys.stderr) fn = os.path.join(outdir, "{pref}.dist.txt".format(pref=output_pref)) with open(fn, "w", encoding="utf-8") as outfp: for nc in label_number_dist: outfp.write("{nc}\n".format(nc=nc)) fn = os.path.join(outdir, "{pref}.counts.csv".format(pref=output_pref)) print("** outputting label read counts...", file=sys.stderr) with open(fn, "w", encoding="utf-8") as outfp: for k in label_dict: outfp.write("{l},{c}\n".format(l=k, c=label_dict[k]))
def main():
    info('trim-low-abund.py', ['streaming'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    if args.loadgraph:
        print('loading countgraph from', args.loadgraph, file=sys.stderr)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print('making countgraph', file=sys.stderr)
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; '
          'use -T to change location' % tempdir, file=sys.stderr)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.output is None:
            trimfp = get_file_writer(open(os.path.basename(filename) +
                                          '.abundtrim', 'wb'),
                                     args.gzip, args.bzip)
        else:
            trimfp = get_file_writer(args.output, args.gzip, args.bzip)

        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass' %
              (filename, save_pass2, n), file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename, file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned. This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename, written_reads,
                      written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)
                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:  # med >= NORMALIZE_LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir,
          file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads +
                                  (n_reads - written_reads)) / \
        n_reads * 100.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes),
          file=sys.stderr)
    print('removed %d reads and trimmed %d reads (%.2f%%)' %
          (n_reads - written_reads, trimmed_reads, percent_reads_trimmed),
          file=sys.stderr)
    print('trimmed or removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp), file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.abundtrim', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
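# Sketch of the trim_record helper assumed by both passes above (khmer.utils
# ships the real one): truncate the sequence, and the quality string if
# present, at the first low-abundance k-mer position.
def trim_record(read, trim_at):
    if hasattr(read, 'quality'):
        return screed.Record(name=read.name,
                             sequence=read.sequence[:trim_at],
                             quality=read.quality[:trim_at])
    return screed.Record(name=read.name, sequence=read.sequence[:trim_at])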
def main():
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    if args.max_tablesize < MAX_HSIZE:
        args.max_tablesize = MAX_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='nodegraph')

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_input_files(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = next(ix)
    del ix

    extension = 'fa'
    if hasattr(record, 'quality'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size,
                                      output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)

    try:
        print('consuming input sequences...', file=sys.stderr)
        if args.label_by_pid:
            print('...labeling by partition id (pid)', file=sys.stderr)
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print('...labeling by sequence', file=sys.stderr)
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print('...consumed {n} sequences...'.format(n=n),
                          file=sys.stderr)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print('...labeling to create groups of size {s}'.format(
                s=args.group_size), file=sys.stderr)
            label = -1
            g = 0
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(
                    pref=output_pref, g=g, ext=extension), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g, ext=extension), 'wb')
                    if n % 50000 == 0:
                        print('...consumed {n} sequences...'.format(n=n),
                              file=sys.stderr)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)
                    write_record(record, outfp)
            except (IOError, OSError) as e:
                print('!! ERROR !!', e, file=sys.stderr)
                print('...error splitting input. exiting...',
                      file=sys.stderr)
    except (IOError, OSError) as e:
        print('!! ERROR: !!', e, file=sys.stderr)
        print('...error consuming {i}. exiting...'.format(i=input_fastp),
              file=sys.stderr)

    print('done consuming input sequence. '
          'added {t} tags and {l} labels...'.format(t=ht.graph.n_tags(),
                                                    l=ht.n_labels()))

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for interval timing
    total_t = time.perf_counter()
    start_t = time.perf_counter()

    for read_file in args.input_files:
        print('** sweeping {read_file} for labels...'.format(
            read_file=read_file), file=sys.stderr)

        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except (IOError, OSError) as error:
            print('!! ERROR: !!', error, file=sys.stderr)
            print('*** Could not open {fn}, skipping...'.format(
                fn=read_file), file=sys.stderr)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.perf_counter()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print('\tswept {n} reads [{nc} labeled, {no} orphaned] '
                          '** {sec}s ({sect}s total)'.format(
                              n=_, nc=n_labeled, no=n_orphaned,
                              sec=batch_t, sect=file_t), file=sys.stderr)
                    start_t = time.perf_counter()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError:
                    # read too short to sweep; skip it
                    pass
                else:
                    if hasattr(record, 'quality'):
                        seq_str = fmt_fastq(name, seq, record.quality, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print('** End of file {fn}...'.format(fn=read_file),
                  file=sys.stderr)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print('** End of run...', file=sys.stderr)
    output_buffer.flush_all()
    total_t = time.perf_counter() - total_t

    if output_buffer.num_write_errors > 0 or \
       output_buffer.num_file_errors > 0:
        print('! WARNING: Sweep finished with errors !', file=sys.stderr)
        print('** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors), file=sys.stderr)
        print('** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors), file=sys.stderr)

    print('swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned), file=sys.stderr)
    print('...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned), file=sys.stderr)
    print('...and {nmc} multilabeled'.format(nmc=n_mlabeled),
          file=sys.stderr)

    print('** outputting label number distribution...', file=sys.stderr)
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'w', encoding='utf-8') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print('** outputting label read counts...', file=sys.stderr)
    with open(fn, 'w', encoding='utf-8') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
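# Hedged sketch of the ReadBufferManager contract both sweep scripts depend
# on (the real class is defined elsewhere): queue() accumulates formatted
# records per label, a buffer is flushed to '<prefix>_<label>.<ext>' in
# outdir once it reaches buf_size records, and flush_all() drains every
# remaining buffer. Class and attribute names here are illustrative only.
class BufferManagerSketch(object):

    def __init__(self, buf_size, output_pref, outdir, extension):
        self.buf_size = buf_size
        self.output_pref = output_pref
        self.outdir = outdir
        self.extension = extension
        self.buffers = defaultdict(list)

    def queue(self, seq_str, label):
        self.buffers[label].append(seq_str)
        if len(self.buffers[label]) >= self.buf_size:
            self._flush(label)

    def _flush(self, label):
        fn = os.path.join(self.outdir, '{pref}_{label}.{ext}'.format(
            pref=self.output_pref, label=label, ext=self.extension))
        with open(fn, 'a') as fp:  # append: a label may flush repeatedly
            fp.write(''.join(self.buffers[label]))
        self.buffers[label] = []

    def flush_all(self):
        for label in list(self.buffers):
            self._flush(label)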