Example 1
def test_read_pair_iterator_in_error_mode():

    rparser = \
        ReadParser(utils.get_test_data("test-abund-read-paired.fa"))

    # If it walks like an iterator and quacks like an iterator...
    rpi = rparser.iter_read_pairs()
    assert "__iter__" in dir(rpi)
    assert "next" in dir(rpi)

    # Are the alleged pairs actually pairs?
    read_pairs_1 = []
    for read_1, read_2 in rpi:
        read_pairs_1.append([read_1, read_2])
        assert read_1.name[:19] == read_2.name[:19]

    # Reload parser.
    # Note: No 'rewind' or 'reset' capability at the time of this writing.
    rparser = \
        ReadParser(utils.get_test_data("test-abund-read-paired.fa"))

    # Ensure that error mode is the default mode.
    read_pairs_2 = []
    for read_1, read_2 \
            in rparser.iter_read_pairs(ReadParser.PAIR_MODE_ERROR_ON_UNPAIRED):
        read_pairs_2.append([read_1, read_2])
    matches = \
        map(
            lambda rp1, rp2: rp1[0].name == rp2[0].name,
            read_pairs_1, read_pairs_2
        )
    assert all(matches)  # Assert ALL the matches. :-]
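Note that under Python 3, map() stops at the shorter of its two inputs, so a missing trailing pair would slip past all(). A stricter version of the same comparison (a sketch, same data, standard library only):

assert len(read_pairs_1) == len(read_pairs_2)
assert all(rp1[0].name == rp2[0].name
           for rp1, rp2 in zip(read_pairs_1, read_pairs_2))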
Example 2
def test_iternext():
    rparser = ReadParser(utils.get_test_data("fakelump.fa.stoptags.txt"))
    read_pairs = []
    try:
        for read_1, read_2 in rparser.iter_read_pairs():
            read_pairs.append((read_1, read_2))
        assert 0, "Shouldn't be able to iterate over non-FASTA file"
    except IOError as err:
        print(str(err))
Example 3
def test_bzip2_decompression_truncated_pairiter():

    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.bz2"))
    try:
        for read in rparser.iter_read_pairs():
            pass
        assert 0, "this should fail"
    except IOError as err:
        print(str(err))
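The same truncated-file check reads more compactly with pytest.raises, the idiom Example 15 already uses; a minimal sketch (hypothetical test name, assuming pytest is importable in this suite):

import pytest

def test_bzip2_truncated_pairiter_raises():
    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.bz2"))
    with pytest.raises(IOError):  # IOError is an alias of OSError on Python 3
        for _ in rparser.iter_read_pairs():
            pass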
Example 4
def test_read_pair_iterator_in_error_mode_xfail():

    rparser = \
        ReadParser(utils.get_test_data("test-abund-read-impaired.fa"))

    failed = True
    try:
        for _ in rparser.iter_read_pairs():
            pass
        failed = False
    except IOError:
        pass
    assert failed
Example 5
def test_gzip_decompression_truncated_pairiter():

    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.gz"))
    try:
        for _ in rparser.iter_read_pairs():
            pass
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))
Example 6
def test_read_pair_iterator_in_ignore_mode():

    rparser = \
        ReadParser(utils.get_test_data("test-abund-read-impaired.fa"))

    read_pairs = []
    for read_1, read_2 \
            in rparser.iter_read_pairs(ReadParser.PAIR_MODE_IGNORE_UNPAIRED):
        read_pairs.append([read_1, read_2])
        assert read_1.name[:19] == read_2.name[:19]
    assert 2 == len(read_pairs)
Example 7
def test_read_pair_iterator_in_error_mode_xfail():

    rparser = \
        ReadParser(utils.get_test_data("test-abund-read-impaired.fa"))

    failed = True
    try:
        for _ in rparser.iter_read_pairs():
            pass
        failed = False
    except ValueError as exc:
        assert "Invalid read pair" in str(exc), str(exc)
    assert failed
Example 8
def test_badbzip2():
    rparser = ReadParser(utils.get_test_data("test-empty.fa.bz2"))
    try:
        for read in rparser:
            pass
        assert 0, "this should fail"
    except IOError as err:
        print(str(err))
Example 9
def test_gzip_decompression():

    reads_count = 0
    rparser = ReadParser(utils.get_test_data("100-reads.fq.gz"))
    for read in rparser:
        reads_count += 1

    assert 100 == reads_count
Example 10
def test_bzip2_decompression():

    reads_count = 0
    rparser = ReadParser(utils.get_test_data("100-reads.fq.bz2"))
    for _ in rparser:
        reads_count += 1

    assert 100 == reads_count
Example 11
def annotate_fasta():
    annotations = GFF3Parser(gff3_fn).read()
    with open(output_fn, 'w') as fp:
        for n, record in enumerate(ReadParser(transcriptome_fn)):
            df = annotations.query('seqid == "{0}"'.format(record.name))
            desc = generate_sequence_summary(record.name, record.sequence,
                                             df)
            fp.write('>{0}\n{1}\n'.format(desc.strip(), record.sequence))
Example 12
def test_with_zero_threads():
    N_THREADS = 0
    try:
        rparser = \
            ReadParser(utils.get_test_data("test-reads.fq.bz2"), N_THREADS)
        assert 0, "should fail"
    except ValueError as e:
        assert str(e) == \
            'Invalid thread number, must be integer greater than zero.'
Example 13
def test_constructor():

    # Note: Using a data file with only one read.
    try:
        rparser = ReadParser(utils.get_test_data("single-read.fq"), "a")
        assert 0, ("ReadParser's constructor shouldn't accept a character for "
                   "the number of threads")
    except TypeError as err:
        print(str(err))
Example 14
def test_bzip2_decompression_truncated():

    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.bz2"))
    try:
        for read in rparser:
            pass
        assert 0, "this should fail"
    except IOError as err:
        print(str(err))
Example 15
def test_error_badly_formatted_file():
    fname = utils.get_temp_filename('badly-formatted.fa')
    with open(fname, 'w') as f:
        f.write("not-sequence")

    with pytest.raises(OSError) as e:
        ReadParser(fname)

    assert e.match("contains badly formatted sequence")
Example 16
def test_gzip_decompression_truncated():

    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.gz"))
    try:
        for read in rparser:
            pass
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
Example 17
def fix():
    names = []
    with open(output_fn, 'w') as fp:
        for record in ReadParser(transcriptome_fn):
            header = header_func(record.name)
            fp.write('>{0}\n{1}\n'.format(header, record.sequence))
            names.append((record.name, header))
    pd.DataFrame(names, columns=['original',
                                 'renamed']).to_csv(names_fn, index=False)
Example 18
def test_num_reads():
    """Test ReadParser.num_reads"""
    reads_count = 0
    rparser = ReadParser(utils.get_test_data("100-reads.fq.gz"))
    for _ in rparser:
        reads_count += 1

    assert reads_count == 100
    assert rparser.num_reads == 100
Example 19
def test_read_truncated():

    rparser = ReadParser(utils.get_test_data("truncated.fq"))
    try:
        for read in rparser:
            pass
        assert 0, "No exception raised on a truncated file"
    except IOError as err:
        assert "Sequence is empty" in str(err), str(err)
Example 20
def test_badbzip2():
    try:
        rparser = ReadParser(utils.get_test_data("test-empty.fa.bz2"))
        for read in rparser:
            pass
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))
Example 21
def test_num_reads_truncated():

    n_reads = 0
    rparser = ReadParser(utils.get_test_data("truncated.fq"))
    try:
        for read in rparser:
            n_reads += 1
    except IOError as err:
        assert "Sequence is empty" in str(err), str(err)
    assert rparser.num_reads == 1, "%d valid reads in file, got %d" % (
        n_reads, rparser.num_reads)
Example 22
def test_read_properties():

    # Note: Using a data file with only one read.
    rparser = ReadParser(utils.get_test_data("single-read.fq"))

    # Check the properties of the single read in the data set.
    for read in rparser:
        assert read.name == "895:1:1:1246:14654 1:N:0:NNNNN"
        assert read.sequence == "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
        assert read.annotations == ""
        assert read.accuracy == """][aaX__aa[`ZUZ[NONNFNNNNNO_____^RQ_"""
Example 23
def test_consume_seqfile_reads_parser(AnyTabletype):
    kh = AnyTabletype(5)
    rparser = ReadParser(utils.get_test_data('test-fastq-reads.fq'))

    kh.consume_seqfile(rparser)

    kh2 = AnyTabletype(5)
    for record in screed.open(utils.get_test_data('test-fastq-reads.fq')):
        kh2.consume(record.sequence)

    assert kh.get('CCGGC') == kh2.get('CCGGC')
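consume_seqfile accepts a plain path as well as a ReadParser (Example 28 passes a filename directly); a minimal sketch of the same counting check driven both ways, reusing the AnyTabletype fixture assumed above (hypothetical test name):

def test_consume_seqfile_path_or_parser(AnyTabletype):
    # both call styles should produce identical counts
    kh_a = AnyTabletype(5)
    kh_a.consume_seqfile(ReadParser(utils.get_test_data('test-fastq-reads.fq')))

    kh_b = AnyTabletype(5)
    kh_b.consume_seqfile(utils.get_test_data('test-fastq-reads.fq'))

    assert kh_a.get('CCGGC') == kh_b.get('CCGGC')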
Example 24
def test_read_properties_fa():

    # Note: Using a data file with only one read.
    rparser = ReadParser(utils.get_test_data("single-read.fa"))

    # Check the properties of the single read in the data set.
    for read in rparser:
        print(read.name)
        assert read.name == "895:1:1:1246:14654 1:N:0:NNNNN"
        assert read.sequence == "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
        # if an attribute is empty it shouldn't exist
        assert not hasattr(read, 'quality')
Example 25
def test_read_properties():

    # Note: Using a data file with only one read.
    rparser = ReadParser(utils.get_test_data("single-read.fq"))

    # Check the properties of the single read in the data set.
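    # (Same file as Example 22, but a newer API generation: 'quality' here
    # replaces 'accuracy' there, and empty attributes are dropped rather
    # than reported as empty strings.)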
    for read in rparser:
        assert read.name == "895:1:1:1246:14654 1:N:0:NNNNN"
        assert read.sequence == "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
        # if an attribute is empty it shouldn't exist
        assert not hasattr(read, 'annotations')
        assert read.quality == """][aaX__aa[`ZUZ[NONNFNNNNNO_____^RQ_"""
Example 27
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
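broken_paired_reader yields (n, is_pair, read1, read2) tuples; force_single=True above is what guarantees the two asserts inside the loop. Without it, intact pairs arrive together; a sketch of consuming that mode (process() and the file name are stand-ins, not part of this script):

for n, is_pair, read1, read2 in broken_paired_reader(
        ReadParser('reads.fq'), min_length=ksize):
    if is_pair:
        process(read1)  # pair kept together
        process(read2)
    else:
        process(read1)  # orphaned read; read2 is None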
Example 28
def test_abund_dist_A_readparser(AnyTabletype):
    A_filename = utils.get_test_data('all-A.fa')
    rparser = ReadParser(A_filename)

    kh = AnyTabletype(4)
    tracking = Nodegraph(4, 1, 1, primes=PRIMES_1m)

    kh.consume_seqfile(A_filename)
    dist = kh.abundance_distribution(rparser, tracking)

    print(dist[:10])
    assert sum(dist) == 1
    assert dist[0] == 0
Example 29
def test_abund_dist_A_readparser(tabletype):
    A_filename = utils.get_test_data('all-A.fa')
    rparser = ReadParser(A_filename)

    kh = tabletype(4, PRIMES_1m)
    tracking = khmer._Nodetable(4, PRIMES_1m)

    kh.consume_seqfile(A_filename)
    dist = kh.abundance_distribution(rparser, tracking)

    print(dist[:10])
    assert sum(dist) == 1
    assert dist[0] == 0
Example 30
def test_consume_absentfasta():
    nodegraph = khmer.Nodegraph(31, 1, 1)
    try:
        nodegraph.consume_seqfile()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        nodegraph.consume_seqfile(readparser)
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))
Example 31
def parse(fn):
    hll = HLLCounter(.01, K)
    lens = []
    names = []
    gc_len = 0
    for contig in ReadParser(fn):
        lens.append(len(contig.sequence))
        names.append(contig.name)
        hll.consume_string(contig.sequence)
        gc_len += contig.sequence.count('C')
        gc_len += contig.sequence.count('G')
    S = pd.Series(lens, index=names)
    S = S.sort_values()  # Series.sort() was removed from pandas
    gc_perc = float(gc_len) / S.sum()
    return S, hll.estimate_cardinality(), gc_perc
Example 32
def test_with_default_arguments():

    read_names = []
    # Note: Using a data file where read names are just integers on [0,99).
    rparser = ReadParser(utils.get_test_data("random-20-a.fa"))

    for read in rparser:
        read_names.append(int(read.name))

    # "Derandomize".
    read_names.sort()

    # Each read number should match the corresponding name.
    for m, n in enumerate(read_names):
        assert m == n
Example 33
def test_read_cleaning_output_partitions(Graphtype):
    infile = utils.get_test_data('valid-read-testing.fq')
    savepath = utils.get_temp_filename('foo')

    # read this in using "approved good" behavior w/cleaned_seq
    x = Graphtype(8, PRIMES_1m)
    for read in ReadParser(infile):
        x.consume(read.cleaned_seq)  # consume cleaned_seq

    kmer = 'caggcgcc'.upper()
    x.add_tag(kmer)
    x.set_partition_id(kmer, 1)

    kmer = 'ACTGGGCG'
    x.add_tag(kmer)
    x.set_partition_id(kmer, 2)

    kmer = 'CCGGCGTG'
    x.add_tag(kmer)
    x.set_partition_id(kmer, 3)

    x.output_partitions(infile, savepath)

    read_names = [read.name for read in ReadParser(savepath)]
    print(read_names)
    assert len(read_names) == 4

    print(read_names)
    assert '895:1:1:1246:14654 1:N:0:NNNNN\t1\t1' in read_names
    assert '895:1:1:1248:9583 1:N:0:NNNNN\t2\t2' in read_names
    assert '895:1:1:1252:19493 1:N:0:NNNNN\t3\t3' in read_names

    assert 'lowercase_to_uppercase\t5\t1' in read_names

    assert 'n_in_read\t6\t2' not in read_names
    assert 'zy_in_read\t7\t3' not in read_names
Example 34
def test_consume_absentfasta_with_reads_parser():
    presencetable = khmer._Hashbits(31, [1])
    try:
        presencetable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        presencetable.consume_fasta_with_reads_parser(readparser)
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))
Example 35
def test_consume_absentfasta_with_reads_parser():
    countingtable = khmer.new_counting_hash(4, 4**4, 4)
    try:
        countingtable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        countingtable.consume_fasta_with_reads_parser(readparser)
        assert 0, "this should fail"
    except IOError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))
Example 36
def test_consume_absentfasta_with_reads_parser():
    countgraph = khmer.Countgraph(4, 4**4, 4)
    try:
        countgraph.consume_seqfile_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        countgraph.consume_seqfile_with_reads_parser(readparser)
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))
Example 37
def test_num_reads_threads():
    """Test threadsaftey of ReadParser's read counting"""
    import threading

    def count_reads(rparser):
        for _ in rparser:
            pass

    n_threads = 4
    threads = []
    rparser = ReadParser(utils.get_test_data("100-reads.fq.gz"))
    for _ in range(n_threads):
        thr = threading.Thread(target=count_reads, args=[rparser, ])
        threads.append(thr)
        thr.start()
    for thr in threads:
        thr.join()

    assert rparser.num_reads == 100
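The same check can be phrased with concurrent.futures, which joins its workers when the with-block exits; a sketch under the same assumption that one parser may be shared across threads (hypothetical test name):

from concurrent.futures import ThreadPoolExecutor

def test_num_reads_threadpool():
    rparser = ReadParser(utils.get_test_data("100-reads.fq.gz"))

    def count_reads(parser):
        for _ in parser:
            pass

    with ThreadPoolExecutor(max_workers=4) as pool:
        for _ in range(4):
            pool.submit(count_reads, rparser)
    # all submitted work has finished once the with-block exits
    assert rparser.num_reads == 100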
Example 38
def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    graphtype = 'countgraph' if not args.small_count else 'smallcountgraph'
    report_on_config(args, graphtype=graphtype)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, graphtype)
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        if args.small_count:
            ct = SmallCountgraph.load(args.loadgraph)
        else:
            ct = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        paired_iter = broken_paired_reader(ReadParser(filename), min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        read_parser = ReadParser(pass2filename)
        paired_iter = broken_paired_reader(read_parser,
                                           min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        read_parser.close()

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    try:
        log_info('removing temp directory & contents ({temp})', temp=tempdir)
        shutil.rmtree(tempdir)
    except OSError:
        log_info('WARNING: unable to remove {temp} (probably an NFS issue); '
                 'please remove manually', temp=tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.output is None:
        log_info('output in *.abundtrim')
    elif args.output.name == 1:
        log_info('output streamed to stdout')
    elif args.output.name:
        log_info('output in {}'.format(args.output.name))

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)

    if args.summary_info is not None:
        # note that when streaming to stdout the name of args.output will
        # be set to 1
        if args.output is not None and args.output.name != 1:
            base = args.output.name
        # no explicit name or stdout stream -> use a default name
        else:
            base = 'trim-low-abund-{}'.format(
                time.strftime("%Y-%m-%dT%H:%M:%S"))

        info = {'fpr': fp_rate,
                'reads': n_reads,
                'basepairs': n_bp,
                'reads_written': written_reads,
                'basepairs_written': written_bp,
                'reads_skipped': n_skipped,
                'basepairs_skipped': bp_skipped,
                'reads_removed': n_reads - written_reads,
                'reads_trimmed': trimmed_reads,
                'basepairs_removed_or_trimmed': n_bp - written_bp
                }
        store_provenance_info(info, fname=base, format=args.summary_info)
Example 39
@pytest.fixture  # presumably a pytest fixture: it yields a parser, then cleans up
def reads():
    infile = utils.get_test_data('valid-read-testing.fq')
    reads = ReadParser(infile)
    yield reads
    reads.close()
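A sketch of how a test would consume this fixture; pytest injects the yielded parser and runs the close() after the test returns (hypothetical test name):

def test_records_have_names(reads):
    for record in reads:
        assert record.name  # every parsed record carries a name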
Example 40
def test_iterator_identities():

    rparser = \
        ReadParser(utils.get_test_data("test-abund-read-paired.fa"))
    assert rparser is rparser.__iter__()
    assert rparser is rparser.iter_reads()
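Because the parser is its own iterator and has no rewind (see the note in Example 1), a second pass over the same instance should yield nothing; a small sketch under that assumption:

rparser = ReadParser(utils.get_test_data("test-abund-read-paired.fa"))
first = sum(1 for _ in rparser)
second = sum(1 for _ in rparser)  # same, now-exhausted, parser
assert first > 0 and second == 0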