def test_iternext():
    rparser = ReadParser(utils.get_test_data("fakelump.fa.stoptags.txt"))
    read_pairs = []
    try:
        for read_1, read_2 in rparser.iter_read_pairs():
            read_pairs.append((read_1, read_2))
        assert 0, "Shouldn't be able to iterate over non FASTA file"
    except IOError as err:
        print(str(err))

def test_bzip2_decompression_truncated_pairiter():
    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.bz2"))
    try:
        for read in rparser.iter_read_pairs():
            pass
        assert 0, "this should fail"
    except IOError as err:
        print(str(err))

def test_read_pair_iterator_in_error_mode_xfail():
    rparser = ReadParser(utils.get_test_data("test-abund-read-impaired.fa"))

    failed = True
    try:
        for rpair in rparser.iter_read_pairs():
            pass
        failed = False
    except IOError:
        pass

    assert failed

def test_gzip_decompression_truncated_pairiter():
    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.gz"))
    try:
        for _ in rparser.iter_read_pairs():
            pass
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))

def test_read_pair_iterator_in_ignore_mode():
    assert 0  # FIXME: test currently disabled

    rparser = ReadParser(utils.get_test_data("test-abund-read-impaired.fa"))

    read_pairs = []
    for read_1, read_2 \
            in rparser.iter_read_pairs(ReadParser.PAIR_MODE_IGNORE_UNPAIRED):
        read_pairs.append([read_1, read_2])
        assert read_1.name[:19] == read_2.name[:19]

    assert 2 == len(read_pairs)

def test_read_pair_iterator_in_error_mode_xfail():
    rparser = ReadParser(utils.get_test_data("test-abund-read-impaired.fa"))

    failed = True
    try:
        for _ in rparser.iter_read_pairs():
            pass
        failed = False
    except ValueError as exc:
        assert "Invalid read pair" in str(exc), str(exc)

    assert failed

def test_badbzip2():
    rparser = ReadParser(utils.get_test_data("test-empty.fa.bz2"))
    try:
        for read in rparser:
            pass
        assert 0, "this should fail"
    except IOError as err:
        print(str(err))

def test_gzip_decompression():
    reads_count = 0
    rparser = ReadParser(utils.get_test_data("100-reads.fq.gz"))
    for read in rparser:
        reads_count += 1

    assert 100 == reads_count

def test_bzip2_decompression():
    reads_count = 0
    rparser = ReadParser(utils.get_test_data("100-reads.fq.bz2"))
    for _ in rparser:
        reads_count += 1

    assert 100 == reads_count

def annotate_fasta():
    # gff3_fn, output_fn, and transcriptome_fn are expected to be defined in
    # the enclosing scope.
    annotations = GFF3Parser(gff3_fn).read()
    with open(output_fn, 'w') as fp:
        for n, record in enumerate(ReadParser(transcriptome_fn)):
            df = annotations.query('seqid == "{0}"'.format(record.name))
            desc = generate_sequence_summary(record.name, record.sequence, df)
            fp.write('>{0}\n{1}\n'.format(desc.strip(), record.sequence))

def test_with_zero_threads():
    N_THREADS = 0
    try:
        rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"),
                             N_THREADS)
        assert 0, "should fail"
    except ValueError as e:
        assert str(e) == \
            'Invalid thread number, must be integer greater than zero.'

def test_constructor():
    # Note: Using a data file with only one read.
    try:
        rparser = ReadParser(utils.get_test_data("single-read.fq"), "a")
        assert 0, ("ReadParser's constructor shouldn't accept a character for "
                   "the number of threads")
    except TypeError as err:
        print(str(err))

def test_bzip2_decompression_truncated():
    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.bz2"))
    try:
        for read in rparser:
            pass
        assert 0, "this should fail"
    except IOError as err:
        print(str(err))

def test_error_badly_formatted_file():
    fname = utils.get_temp_filename('badly-formatted.fa')
    with open(fname, 'w') as f:
        f.write("not-sequence")

    with pytest.raises(OSError) as e:
        ReadParser(fname)

    assert e.match("contains badly formatted sequence")

def test_gzip_decompression_truncated():
    rparser = ReadParser(utils.get_test_data("100-reads.fq.truncated.gz"))
    try:
        for read in rparser:
            pass
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))

def fix():
    # output_fn, transcriptome_fn, header_func, and names_fn are expected to
    # be defined in the enclosing scope.
    names = []
    with open(output_fn, 'w') as fp:
        for record in ReadParser(transcriptome_fn):
            header = header_func(record.name)
            fp.write('>{0}\n{1}\n'.format(header, record.sequence))
            names.append((record.name, header))
    pd.DataFrame(names, columns=['original', 'renamed']).to_csv(names_fn,
                                                                index=False)

def test_num_reads():
    """Test ReadParser.num_reads."""
    reads_count = 0
    rparser = ReadParser(utils.get_test_data("100-reads.fq.gz"))
    for _ in rparser:
        reads_count += 1

    assert reads_count == 100
    assert rparser.num_reads == 100

def test_read_truncated():
    rparser = ReadParser(utils.get_test_data("truncated.fq"))
    try:
        for read in rparser:
            pass
        assert 0, "No exception raised on a truncated file"
    except IOError as err:
        assert "Sequence is empty" in str(err), str(err)

def test_badbzip2():
    try:
        rparser = ReadParser(utils.get_test_data("test-empty.fa.bz2"))
        for read in rparser:
            pass
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))

def test_num_reads_truncated():
    n_reads = 0
    rparser = ReadParser(utils.get_test_data("truncated.fq"))
    try:
        for read in rparser:
            n_reads += 1
    except IOError as err:
        assert "Sequence is empty" in str(err), str(err)

    assert rparser.num_reads == 1, "%d valid reads in file, got %d" % (
        n_reads, rparser.num_reads)

def test_read_properties():
    # Note: Using a data file with only one read.
    rparser = ReadParser(utils.get_test_data("single-read.fq"))

    # Check the properties of the single read in the data set.
    for read in rparser:
        assert read.name == "895:1:1:1246:14654 1:N:0:NNNNN"
        assert read.sequence == "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
        assert read.annotations == ""
        assert read.accuracy == """][aaX__aa[`ZUZ[NONNFNNNNNO_____^RQ_"""

def test_consume_seqfile_reads_parser(AnyTabletype):
    kh = AnyTabletype(5)
    rparser = ReadParser(utils.get_test_data('test-fastq-reads.fq'))

    kh.consume_seqfile(rparser)

    kh2 = AnyTabletype(5)
    for record in screed.open(utils.get_test_data('test-fastq-reads.fq')):
        kh2.consume(record.sequence)

    assert kh.get('CCGGC') == kh2.get('CCGGC')

def test_read_properties_fa():
    # Note: Using a data file with only one read.
    rparser = ReadParser(utils.get_test_data("single-read.fa"))

    # Check the properties of the single read in the data set.
    for read in rparser:
        print(read.name)
        assert read.name == "895:1:1:1246:14654 1:N:0:NNNNN"
        assert read.sequence == "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
        # if an attribute is empty it shouldn't exist
        assert not hasattr(read, 'quality')

def test_read_properties():
    # Note: Using a data file with only one read.
    rparser = ReadParser(utils.get_test_data("single-read.fq"))

    # Check the properties of the single read in the data set.
    for read in rparser:
        assert read.name == "895:1:1:1246:14654 1:N:0:NNNNN"
        assert read.sequence == "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
        # if an attribute is empty it shouldn't exist
        assert not hasattr(read, 'annotations')
        assert read.quality == """][aaX__aa[`ZUZ[NONNFNNNNNO_____^RQ_"""

def test_read_pair_iterator_in_error_mode():
    assert 0  # FIXME: test currently disabled

    rparser = ReadParser(utils.get_test_data("test-abund-read-paired.fa"))

    # If it walks like an iterator and quacks like an iterator...
    rpi = rparser.iter_read_pairs()
    assert "__iter__" in dir(rpi)
    assert "next" in dir(rpi)

    # Are the alleged pairs actually pairs?
    read_pairs_1 = []
    for read_1, read_2 in rpi:
        read_pairs_1.append([read_1, read_2])
        assert read_1.name[:19] == read_2.name[:19]

    # Reload parser.
    # Note: No 'rewind' or 'reset' capability at the time of this writing.
    rparser = ReadParser(utils.get_test_data("test-abund-read-paired.fa"))

    # Ensure that error mode is the default mode.
    read_pairs_2 = []
    for read_1, read_2 \
            in rparser.iter_read_pairs(ReadParser.PAIR_MODE_ERROR_ON_UNPAIRED):
        read_pairs_2.append([read_1, read_2])

    matches = map(
        lambda rp1, rp2: rp1[0].name == rp2[0].name,
        read_pairs_1, read_pairs_2
    )
    assert all(matches)  # Assert ALL the matches. :-]

def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
            args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)

def test_abund_dist_A_readparser(AnyTabletype):
    A_filename = utils.get_test_data('all-A.fa')
    rparser = ReadParser(A_filename)

    kh = AnyTabletype(4)
    tracking = Nodegraph(4, 1, 1, primes=PRIMES_1m)

    kh.consume_seqfile(A_filename)
    dist = kh.abundance_distribution(rparser, tracking)

    print(dist[:10])
    assert sum(dist) == 1
    assert dist[0] == 0

def test_abund_dist_A_readparser(tabletype):
    A_filename = utils.get_test_data('all-A.fa')
    rparser = ReadParser(A_filename)

    kh = tabletype(4, PRIMES_1m)
    tracking = khmer._Nodetable(4, PRIMES_1m)

    kh.consume_seqfile(A_filename)
    dist = kh.abundance_distribution(A_filename, tracking)

    print(dist[:10])
    assert sum(dist) == 1
    assert dist[0] == 0

def test_consume_absentfasta():
    nodegraph = khmer.Nodegraph(31, 1, 1)
    try:
        nodegraph.consume_seqfile()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        nodegraph.consume_seqfile(readparser)
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))

def parse(fn):
    # K (the k-mer size for the HLL counter) is expected to be defined in the
    # enclosing scope.
    hll = HLLCounter(.01, K)
    lens = []
    names = []
    gc_len = 0
    for contig in ReadParser(fn):
        lens.append(len(contig.sequence))
        names.append(contig.name)
        hll.consume_string(contig.sequence)
        gc_len += contig.sequence.count('C')
        gc_len += contig.sequence.count('G')
    S = pd.Series(lens, index=names)
    S.sort()  # note: in-place sort; newer pandas uses S.sort_values()
    gc_perc = float(gc_len) / S.sum()
    return S, hll.estimate_cardinality(), gc_perc

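# A minimal usage sketch for parse() above. The input path and the
# module-level K below are hypothetical, for illustration only; parse()
# returns the per-contig length Series, the HLL estimate of distinct k-mers,
# and the GC fraction.
K = 25
lengths, n_unique_kmers, gc_fraction = parse('transcripts.fa')
print(lengths.head())
print('approx. distinct {0}-mers: {1}'.format(K, n_unique_kmers))
print('GC fraction: {0:.3f}'.format(gc_fraction))
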
def test_with_default_arguments():
    read_names = []

    # Note: Using a data file where read names are just integers on [0,99).
    rparser = ReadParser(utils.get_test_data("random-20-a.fa"))

    for read in rparser:
        read_names.append(int(read.name))

    # "Derandomize".
    read_names.sort()

    # Each read number should match the corresponding name.
    for m, n in enumerate(read_names):
        assert m == n

def test_read_cleaning_output_partitions(Graphtype):
    infile = utils.get_test_data('valid-read-testing.fq')
    savepath = utils.get_temp_filename('foo')

    # read this in using "approved good" behavior w/cleaned_seq
    x = Graphtype(8, PRIMES_1m)
    for read in ReadParser(infile):
        x.consume(read.cleaned_seq)  # consume cleaned_seq

    kmer = 'caggcgcc'.upper()
    x.add_tag(kmer)
    x.set_partition_id(kmer, 1)

    kmer = 'ACTGGGCG'
    x.add_tag(kmer)
    x.set_partition_id(kmer, 2)

    kmer = 'CCGGCGTG'
    x.add_tag(kmer)
    x.set_partition_id(kmer, 3)

    x.output_partitions(infile, savepath)

    read_names = [read.name for read in ReadParser(savepath)]
    print(read_names)
    assert len(read_names) == 4

    assert '895:1:1:1246:14654 1:N:0:NNNNN\t1\t1' in read_names
    assert '895:1:1:1248:9583 1:N:0:NNNNN\t2\t2' in read_names
    assert '895:1:1:1252:19493 1:N:0:NNNNN\t3\t3' in read_names
    assert 'lowercase_to_uppercase\t5\t1' in read_names

    assert 'n_in_read\t6\t2' not in read_names
    assert 'zy_in_read\t7\t3' not in read_names

def test_consume_absentfasta_with_reads_parser():
    presencetable = khmer._Hashbits(31, [1])
    try:
        presencetable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        presencetable.consume_fasta_with_reads_parser(readparser)
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))

def test_consume_absentfasta_with_reads_parser():
    countingtable = khmer.new_counting_hash(4, 4**4, 4)
    try:
        countingtable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        countingtable.consume_fasta_with_reads_parser(readparser)
        assert 0, "this should fail"
    except IOError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))

def test_consume_absentfasta_with_reads_parser():
    countgraph = khmer.Countgraph(4, 4**4, 4)
    try:
        countgraph.consume_seqfile_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        countgraph.consume_seqfile_with_reads_parser(readparser)
        assert 0, "this should fail"
    except OSError as err:
        print(str(err))
    except ValueError as err:
        print(str(err))

def test_num_reads_threads():
    """Test thread safety of ReadParser's read counting."""
    import threading

    def count_reads(rparser):
        for _ in rparser:
            pass

    n_threads = 4
    threads = []
    rparser = ReadParser(utils.get_test_data("100-reads.fq.gz"))
    for _ in range(n_threads):
        thr = threading.Thread(target=count_reads, args=[rparser, ])
        threads.append(thr)
        thr.start()
    for thr in threads:
        thr.join()

    assert rparser.num_reads == 100

def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    graphtype = 'countgraph' if not args.small_count else 'smallcountgraph'
    report_on_config(args, graphtype=graphtype)

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, graphtype)
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        if args.small_count:
            ct = SmallCountgraph.load(args.loadgraph)
        else:
            ct = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        paired_iter = broken_paired_reader(ReadParser(filename), min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned. This is in contrast
        # to the first loop. Hence, force_single=True below.

        read_parser = ReadParser(pass2filename)
        paired_iter = broken_paired_reader(read_parser, min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        read_parser.close()

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    try:
        log_info('removing temp directory & contents ({temp})', temp=tempdir)
        shutil.rmtree(tempdir)
    except OSError as oe:
        log_info('WARNING: unable to remove {temp} (probably an NFS issue); '
                 'please remove manually', temp=tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.output is None:
        log_info('output in *.abundtrim')
    elif args.output.name == 1:
        log_info('output streamed to stdout')
    elif args.output.name:
        log_info('output in {}'.format(args.output.name))

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)

    if args.summary_info is not None:
        # note that when streaming to stdout the name of args.output will
        # be set to 1
        if args.output is not None and args.output.name != 1:
            base = args.output.name
        # no explicit name or stdout stream -> use a default name
        else:
            base = 'trim-low-abund-{}'.format(
                time.strftime("%Y-%m-%dT%H:%M:%S"))

        info = {'fpr': fp_rate,
                'reads': n_reads,
                'basepairs': n_bp,
                'reads_written': written_reads,
                'basepairs_written': written_bp,
                'reads_skipped': n_skipped,
                'basepairs_skipped': bp_skipped,
                'reads_removed': n_reads - written_reads,
                'reads_trimmed': trimmed_reads,
                'basepairs_removed_or_trimmed': n_bp - written_bp
                }
        store_provenance_info(info, fname=base, format=args.summary_info)

def reads():
    infile = utils.get_test_data('valid-read-testing.fq')
    reads = ReadParser(infile)
    yield reads
    reads.close()

def test_iterator_identities():
    rparser = ReadParser(utils.get_test_data("test-abund-read-paired.fa"))

    assert rparser is rparser.__iter__()
    assert rparser is rparser.iter_reads()

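# A minimal usage sketch of the ReadParser behaviour exercised by the tests
# above. The input paths are hypothetical; only attributes the tests rely on
# (name, sequence, num_reads, iter_read_pairs) are shown.
def example_usage():
    # iterate single reads and count them
    rparser = ReadParser('reads.fq')  # hypothetical input file
    for read in rparser:
        print(read.name, len(read.sequence))
    print('total reads parsed:', rparser.num_reads)

    # iterate read pairs; unpaired reads raise an error in the default mode
    rparser = ReadParser('paired-reads.fa')  # hypothetical input file
    for read_1, read_2 in rparser.iter_read_pairs():
        assert read_1.name[:19] == read_2.name[:19]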