def test_normalize_by_median_dumpfrequency():
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(
            utils.get_temp_filename('test-{x}.fq'.format(x=x),
                                    tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = utils.runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out

def test_split_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data("paired.fa")

    ex_outfile1 = utils.get_test_data("paired.fa.1")
    ex_outfile2 = utils.get_test_data("paired.fa.2")

    # actual output files...
    outfile1 = utils.get_temp_filename("paired.fa.1")
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename("paired.fa.2", in_dir)

    script = scriptpath("split-paired-reads.py")
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

def test_filter_stoptags():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    stopfile = utils.get_temp_filename('stoptags', in_dir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # now, create a file with some stop tags in it --
    K = 18
    kh = khmer.new_hashbits(K, 1, 1)
    kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
    kh.save_stop_tags(stopfile)
    del kh

    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    args = ['-k', str(K), stopfile, infile, infile]
    utils.runscript(script, args, in_dir)

    # verify that the basic output file exists
    outfile = infile + '.stopfilt'
    assert os.path.exists(outfile), outfile

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs

def test_normalize_by_median_force():
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename(
        'test-good.fq', tempdir=os.path.dirname(corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'),
                    corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = utils.runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0
    assert os.path.exists(corrupt_infile + '.ct.failed')
    assert '*** Skipping' in err
    assert '** IOErrors' in err

def test_extract_long_sequences():
    script = scriptpath('extract-long-sequences.py')
    fq_infile = utils.get_temp_filename('test.fq')
    fa_infile = utils.get_temp_filename('test.fa')

    shutil.copyfile(utils.get_test_data('paired-mixed.fq'), fq_infile)
    shutil.copyfile(utils.get_test_data('paired-mixed.fa'), fa_infile)

    fq_outfile = fq_infile + '.keep.fq'
    fa_outfile = fa_infile + '.keep.fa'

    in_dir_fq = os.path.dirname(fq_infile)
    in_dir_fa = os.path.dirname(fa_infile)

    # pass the output paths themselves (not the literal strings
    # 'fq_outfile' / 'fa_outfile') and count the lines actually written
    # to each output file.
    args = [fq_infile, '-l', '10', '-o', fq_outfile]
    (status, out, err) = utils.runscript(script, args, in_dir_fq)

    countlines = sum(1 for line in open(fq_outfile))
    assert countlines == 44, countlines

    args = [fa_infile, '-l', '10', '-o', fa_outfile]
    (status, out, err) = utils.runscript(script, args, in_dir_fa)

    countlines = sum(1 for line in open(fa_outfile))
    assert countlines == 22, countlines

def test_count_overlap():
    seqfile1 = utils.get_temp_filename('test-overlap1.fa')
    in_dir = os.path.dirname(seqfile1)
    seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
    outfile = utils.get_temp_filename('overlap.out', in_dir)
    curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
    shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
    shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
    htfile = _make_graph(seqfile1, ksize=20)
    script = scriptpath('count-overlap.py')
    args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize',
            '10000000', htfile + '.pt', seqfile2, outfile]
    (status, out, err) = utils.runscript(script, args, in_dir)
    assert status == 0
    assert os.path.exists(outfile), outfile
    data = [x.strip() for x in open(outfile)]
    data = set(data)
    assert '# of unique k-mers in dataset2: 759047' in data
    assert '# of overlap unique k-mers: 245621' in data
    assert os.path.exists(curvefile), curvefile
    data = [x.strip() for x in open(curvefile)]
    data = set(data)
    assert '178633 1155' in data
    assert '496285 2970' in data
    assert '752053 238627' in data

def test_split_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired.fq')

    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fq.2', in_dir)

    script = scriptpath('split-paired-reads.py')
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

def test_extract_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fa.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fa.se', in_dir)

    script = scriptpath('extract-paired-reads.py')
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

def test_make_initial_stoptags():
    # gen input files using load-graph.py -t
    # should keep test_data directory size down
    # or something like that
    # this assumes (obv.) load-graph works properly
    bzinfile = utils.get_temp_filename('test-reads.fq.bz2')
    shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile)
    in_dir = os.path.dirname(bzinfile)

    genscript = scriptpath('load-graph.py')
    genscriptargs = ['-t', 'test-reads', 'test-reads.fq.bz2']
    utils.runscript(genscript, genscriptargs, in_dir)

    # test input file gen'd by load-graphs
    infile = utils.get_temp_filename('test-reads.pt')
    infile2 = utils.get_temp_filename('test-reads.tagset', in_dir)

    # get file to compare against
    ex_outfile = utils.get_test_data('test-reads.stoptags')

    # actual output file
    outfile1 = utils.get_temp_filename('test-reads.stoptags', in_dir)

    script = scriptpath('make-initial-stoptags.py')
    # make-initial-stoptags has weird file argument syntax
    # read the code before modifying
    args = ['test-reads']
    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile1), outfile1

def test_normalize_by_median_dumpfrequency():
    CUTOFF = "1"

    infiles = [utils.get_temp_filename("test-0.fq")]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(utils.get_temp_filename("test-{x}.fq".format(x=x),
                                               tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data("test-fastq-reads.fq"), infile)

    script = scriptpath("normalize-by-median.py")
    args = ["-d", "2", "-C", CUTOFF, "-k", "17"]
    args.extend(infiles)

    (status, out, err) = runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, "backup.ht"))
    test_good_read = "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
    test_good_read2 = "TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA"
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, "backup.ht"))
    assert out.count("Backup: Saving") == 2
    assert "Nothing" in out

def test_save_merge_from_disk(self):
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print divvy
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    ht.merge_subset_from_disk(outfile1)
    ht.merge_subset_from_disk(outfile2)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.

def test_save_load_merge(self):
    raise Exception("this test coredumps on lyorn. Disabled.")

    ht = khmer.new_hashbits(20, 4 ** 14 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print divvy
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    a = ht.load_subset_partitionmap(outfile1)
    b = ht.load_subset_partitionmap(outfile2)

    ht.merge_subset(a)
    ht.merge_subset(b)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.

def test_abund_dist_gz_bigcount():
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    script = scriptpath('load-into-counting.py')
    htfile = utils.get_temp_filename('test_ct')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = open(htfile, 'rb').read()

    f_out = gzip.open(outfile, 'wb')  # compress the created bigcount table
    f_out.write(data)
    f_out.close()

    # load the compressed bigcount table
    counting_hash = khmer.load_counting_hash(outfile)
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)

    # calculate abundance distribution for compressed bigcount table
    abundances = counting_hash.abundance_distribution(infile, tracking)

    # check if any abundance > 255 is present; if so, the gzipped
    # bigcount table was loaded correctly
    flag = False
    for _, i in enumerate(abundances):
        print _, i
        if _ > 255 and i > 0:
            flag = True
            break
    assert flag

def test_save_merge_from_disk_2(self):
    raise Exception("this test coredumps on lyorn. Disabled.")

    ht = khmer.new_hashbits(20, 4 ** 14 + 1)
    filename = utils.get_test_data('random-20-a.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    subset_size = total_reads / 2 + total_reads % 2
    divvy = ht.divide_tags_into_subsets(subset_size)

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(divvy[0], divvy[1])
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(divvy[1], 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    ht.merge_subset_from_disk(outfile1)
    ht.merge_subset_from_disk(outfile2)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.

def test_save_load_corrupted():
    lb_pre = LabelHash(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_fasta_and_tag_with_labels(filename)

    # save labels to a file
    savepath = utils.get_temp_filename('saved.labels')
    lb_pre.save_labels_and_tags(savepath)

    # trash the old LabelHash
    del lb_pre
    lb = LabelHash(20, 1e7, 4)

    # produce all possible truncated versions of this file
    data = open(savepath, 'rb').read()
    for i in range(len(data)):
        truncated = utils.get_temp_filename('trunc.labels')
        fp = open(truncated, 'wb')
        fp.write(data[:i])
        fp.close()

        try:
            lb.load_labels_and_tags(truncated)
            assert 0, "this should not succeed -- truncated file len %d" \
                % (i,)
        except IOError as err:
            print 'expected failure for', i, ': ', str(err)

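# The truncate-and-reload pattern above recurs in several save/load tests
# in this file (see also test_save_load_tagset_trunc below). A minimal
# reusable sketch of that pattern, assuming only the stdlib plus a loader
# callable that raises IOError on bad input -- the helper name and
# signature are illustrative, not part of the khmer API:

def _expect_ioerror_on_truncations(data, path, loader):
    """Write every strict prefix of `data` to `path` and assert that
    `loader(path)` raises IOError for each truncated copy."""
    for i in range(len(data)):
        fp = open(path, 'wb')
        fp.write(data[:i])  # i-byte truncation of the original file
        fp.close()
        try:
            loader(path)
            assert 0, "loading a %d-byte truncation should fail" % i
        except IOError as err:
            print 'expected failure for', i, ':', str(err)
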
def test_save_merge_from_disk(self):
    raise Exception("this test coredumps on lyorn. Disabled.")

    ht = khmer.new_hashbits(20, 4 ** 14 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print divvy
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    ht.merge_subset_from_disk(outfile1)
    ht.merge_subset_from_disk(outfile2)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.

def test_random_20_a_succ_IV_save(self):
    ht = khmer.new_hashbits(20, 4 ** 13 + 1)
    filename = utils.get_test_data('random-20-a.fa')

    savefile_ht = utils.get_temp_filename('ht')
    savefile_tags = utils.get_temp_filename('tags')
    outfile = filename + utils.get_temp_filename('out')

    total_reads, _ = ht.consume_fasta_and_tag(filename)

    ht.save(savefile_ht)
    ht.save_tagset(savefile_tags)

    del ht
    ht = khmer.new_hashbits(20, 4 ** 13 + 1)

    ht.load(savefile_ht)
    ht.load_tagset(savefile_tags)

    divvy = ht.divide_tags_into_subsets(1)
    divvy.append(0)

    subsets = []
    for i in range(len(divvy) - 1):
        x = ht.do_subset_partition(divvy[i], divvy[i + 1])
        subsets.append(x)

    for x in reversed(subsets):
        ht.merge_subset(x)

    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions

def test_save_load_merge_2(self):
    ht = khmer.new_hashbits(20, 4 ** 8 + 1)
    filename = utils.get_test_data('random-20-a.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    subset_size = total_reads // 2 + total_reads % 2
    divvy = ht.divide_tags_into_subsets(subset_size)

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(divvy[0], divvy[1])
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(divvy[1], 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    assert os.path.exists(outfile1)
    assert os.path.exists(outfile2)

    a = ht.load_subset_partitionmap(outfile1)
    b = ht.load_subset_partitionmap(outfile2)

    ht.merge_subset(a)
    ht.merge_subset(b)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.

def test_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress.
    in_file = open(savepath, 'rb')
    out_file = gzip.open(loadpath, 'wb')
    out_file.writelines(in_file)
    out_file.close()
    in_file.close()

    # load compressed hashtable.
    ht = khmer._new_counting_hash(12, sizes)
    ht.load(loadpath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)

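# test_abund_dist_gz_bigcount and test_load_gz above both hand-roll the
# same "save uncompressed, then recompress with gzip" step. A minimal
# sketch of that step as a helper, assuming only the stdlib gzip module
# (the helper name is illustrative, not khmer API):

import gzip


def _gzip_copy(src, dst):
    """Copy the file at `src` into a gzip-compressed file at `dst`."""
    in_file = open(src, 'rb')
    out_file = gzip.open(dst, 'wb')
    out_file.writelines(in_file)  # stream the raw bytes through gzip
    out_file.close()
    in_file.close()
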
def test_save_load_merge_on_graph():
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print divvy
    assert len(divvy) == 3
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    a = ht.load_partitionmap(outfile1)  # <-- this is different
    b = ht.load_subset_partitionmap(outfile2)

    ht.merge_subset(b)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.

def test_normalize_by_median_dumpfrequency():
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(utils.get_temp_filename('test-{x}.fq'.format(x=x),
                                               tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out

def test_extract_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fa.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fa.se', in_dir)

    script = scriptpath('extract-paired-reads.py')
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

def test_random_20_a_succ_IV_save(self):
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)
    filename = utils.get_test_data('random-20-a.fa')

    savefile_ht = utils.get_temp_filename('ht')
    savefile_tags = utils.get_temp_filename('tags')
    outfile = filename + utils.get_temp_filename('out')

    total_reads, _ = ht.consume_fasta_and_tag(filename)

    ht.save(savefile_ht)
    ht.save_tagset(savefile_tags)

    del ht
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)

    ht.load(savefile_ht)
    ht.load_tagset(savefile_tags)

    divvy = ht.divide_tags_into_subsets(1)
    divvy.append(0)

    subsets = []
    for i in range(len(divvy) - 1):
        x = ht.do_subset_partition(divvy[i], divvy[i + 1])
        subsets.append(x)

    for x in reversed(subsets):
        ht.merge_subset(x)

    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions

def test_save_load_merge(self):
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print divvy
    assert len(divvy) == 3
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    a = ht.load_subset_partitionmap(outfile1)
    b = ht.load_subset_partitionmap(outfile2)

    ht.merge_subset(a)
    ht.merge_subset(b)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.

def test_filter_stoptags():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    stopfile = utils.get_temp_filename('stoptags', in_dir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # now, create a file with some stop tags in it --
    K = 18
    kh = khmer.new_hashbits(K, 1, 1)
    kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
    kh.save_stop_tags(stopfile)
    del kh

    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    args = ['-k', str(K), stopfile, infile, infile]
    (status, out, err) = runscript(script, args, in_dir)
    print out
    print err
    assert status == 0

    # verify that the basic output file exists
    outfile = infile + '.stopfilt'
    assert os.path.exists(outfile), outfile

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs

def test_save_load_merge_2(self):
    raise Exception("this test coredumps on lyorn. Disabled.")

    ht = khmer.new_hashbits(20, 4 ** 14 + 1)
    filename = utils.get_test_data('random-20-a.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    subset_size = total_reads / 2 + total_reads % 2
    divvy = ht.divide_tags_into_subsets(subset_size)

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(divvy[0], divvy[1])
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(divvy[1], 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    a = ht.load_subset_partitionmap(outfile1)
    b = ht.load_subset_partitionmap(outfile2)

    ht.merge_subset(a)
    ht.merge_subset(b)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.

def test_count_overlap():
    seqfile1 = utils.get_temp_filename('test-overlap1.fa')
    in_dir = os.path.dirname(seqfile1)
    seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
    outfile = utils.get_temp_filename('overlap.out', in_dir)
    curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
    shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
    shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
    htfile = _make_graph(seqfile1, ksize=20)
    script = scriptpath('count-overlap.py')
    args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize',
            '10000000', htfile + '.pt', seqfile2, outfile]
    (status, out, err) = runscript(script, args, in_dir)
    assert status == 0
    assert os.path.exists(outfile), outfile
    data = [x.strip() for x in open(outfile)]
    data = set(data)
    assert '# of unique k-mers in dataset2: 759047' in data
    assert '# of overlap unique k-mers: 245621' in data
    assert os.path.exists(curvefile), curvefile
    data = [x.strip() for x in open(curvefile)]
    data = set(data)
    assert '178633 1155' in data
    assert '496285 2970' in data
    assert '752053 238627' in data

def test_split_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired.fq')

    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fq.2', in_dir)

    script = scriptpath('split-paired-reads.py')
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

def test_save_merge_from_disk_2(self):
    ht = khmer.new_hashbits(20, 4 ** 7 + 1)
    filename = utils.get_test_data('random-20-a.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    subset_size = total_reads // 2 + total_reads % 2
    divvy = ht.divide_tags_into_subsets(subset_size)

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(divvy[0], divvy[1])
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(divvy[1], 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    ht.merge_subset_from_disk(outfile1)
    ht.merge_subset_from_disk(outfile2)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions  # combined.

def test_extract_long_sequences():
    script = scriptpath('extract-long-sequences.py')
    fq_infile = utils.get_temp_filename('test.fq')
    fa_infile = utils.get_temp_filename('test.fa')

    shutil.copyfile(utils.get_test_data('paired-mixed.fq'), fq_infile)
    shutil.copyfile(utils.get_test_data('paired-mixed.fa'), fa_infile)

    fq_outfile = fq_infile + '.keep.fq'
    fa_outfile = fa_infile + '.keep.fa'

    in_dir_fq = os.path.dirname(fq_infile)
    in_dir_fa = os.path.dirname(fa_infile)

    # pass the output paths themselves (not the literal strings
    # 'fq_outfile' / 'fa_outfile') and count the lines actually written
    # to each output file.
    args = [fq_infile, '-l', '10', '-o', fq_outfile]
    (status, out, err) = runscript(script, args, in_dir_fq)

    countlines = sum(1 for line in open(fq_outfile))
    assert countlines == 44, countlines

    args = [fa_infile, '-l', '10', '-o', fa_outfile]
    (status, out, err) = runscript(script, args, in_dir_fa)

    countlines = sum(1 for line in open(fa_outfile))
    assert countlines == 22, countlines

def test_normalize_by_median_force():
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename(
        'test-good.fq', tempdir=os.path.dirname(corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'),
                    corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0
    assert os.path.exists(corrupt_infile + '.ct.failed')
    assert '*** Skipping' in err
    assert '** IOErrors' in err

def test_sweep_reads_fq():
    readfile = utils.get_temp_filename('reads.fa')
    contigfile = utils.get_temp_filename('contigs.fp')
    in_dir = os.path.dirname(contigfile)

    shutil.copyfile(utils.get_test_data('test-sweep-reads.fq'), readfile)
    shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)

    script = scriptpath('sweep-reads.py')
    args = ['-k', '25', '--prefix', 'test', '--label-by-pid',
            contigfile, readfile, 'junkfile.fa']

    status, out, err = utils.runscript(
        script, args, in_dir, fail_ok=True, sandbox=True)

    # check if the bad file was skipped without issue
    assert 'ERROR' in err, err
    assert 'skipping' in err, err

    out1 = os.path.join(in_dir, 'test_0.fq')
    out2 = os.path.join(in_dir, 'test_1.fq')
    mout = os.path.join(in_dir, 'test_multi.fq')
    oout = os.path.join(in_dir, 'test_orphaned.fq')

    assert os.path.exists(out1)
    assert os.path.exists(out2)
    assert os.path.exists(mout)
    assert os.path.exists(oout)
    print open(out1).read()

    print os.listdir(in_dir)

    seqs1 = set([r.name for r in screed.open(out1)])
    seqs2 = set([r.name for r in screed.open(out2)])
    seqsm = set([r.name for r in screed.open(mout)])
    seqso = set([r.name for r in screed.open(oout)])

    print seqs1
    print seqs2
    print seqsm
    print seqso
    assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
    assert seqs2 == set(['read3_p1\t1'])
    assert (seqsm == set(['read4_multi\t0\t1']) or
            seqsm == set(['read4_multi\t1\t0']))
    assert seqso == set(['read5_orphan'])

    seqs1 = set([r.quality for r in screed.open(out1)])
    seqs2 = set([r.quality for r in screed.open(out2)])
    seqsm = set([r.quality for r in screed.open(mout)])
    seqso = set([r.quality for r in screed.open(oout)])

def test_save_no_load(self):
    filename = utils.get_temp_filename('tst')
    rt = self.rt

    rt.set(0, True)
    rt.set(1, False)
    rt.set(2, True)
    rt.set(3, False)
    rt.set(4, True)

    rt.save(filename)
    rt2 = khmer.new_readmask(READTABLE_SIZE)  # no load!

    try:
        for i in range(5):
            if rt.get(i) != rt2.get(i):
                raise Exception  # supposed to happen; no load.
        assert 0
    except AssertionError:
        raise
    except Exception:
        pass

def test_save_no_load(self):
    filename = utils.get_temp_filename('save')
    mmt = self.mmt

    for i in range(0, MINMAXTABLE_SIZE):
        mmt.add_min(i, i)
        mmt.add_max(i, MINMAXTABLE_SIZE - i)

    mmt.save(filename)
    mmt2 = khmer.new_minmax(MINMAXTABLE_SIZE)  # no load!

    try:
        for i in range(0, MINMAXTABLE_SIZE):
            if mmt2.get_min(i) != mmt.get_min(i):
                raise Exception  # supposed to happen!
        assert 0
    except AssertionError:
        raise
    except Exception:
        pass

    try:
        for i in range(0, MINMAXTABLE_SIZE):
            if mmt2.get_max(i) != mmt.get_max(i):
                raise Exception  # supposed to happen!
        assert 0
    except AssertionError:
        raise
    except Exception:
        pass

def test_filter_limit_n(self):
    ht = khmer.new_hashtable(4, 4 ** 4)

    filename = utils.get_test_data('simple_3.fa')
    outname = utils.get_temp_filename('test_filter.out')

    (total_reads, n_consumed) = ht.consume_fasta(filename)
    assert total_reads == 2, total_reads

    (total_reads, n_seq_kept) = \
        khmer.filter_fasta_file_limit_n(ht, filename, total_reads,
                                        outname, 2, 7)
    assert total_reads == 2
    assert n_seq_kept == 1

    (total_reads, n_seq_kept) = \
        khmer.filter_fasta_file_limit_n(ht, filename, total_reads,
                                        outname, 2, 4)
    assert total_reads == 2
    assert n_seq_kept == 2

def test_save_load_tagset_trunc():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.add_tag('G' * 32)
    ht.save_tagset(outfile)
    ht.save_tagset('/tmp/goodversion-k32.tagset')

    # truncate tagset file...
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()

    fp = open(outfile, 'wb')
    fp.write(data[:26])
    fp.close()

    # try loading it...
    try:
        ht.load_tagset(outfile)
        assert 0, "this test should fail"
    except IOError:
        pass

def test_sweep_reads_3():
    infile = utils.get_temp_filename('seqs.fa')
    shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
    wdir = os.path.dirname(infile)
    script = scriptpath('sweep-reads.py')
    args = ['-m', '75', '-k', '20', '-l', '1', '--prefix',
            'test', '--label-by-group', '10', infile, infile]
    status, out, err = utils.runscript(script, args, wdir, sandbox=True)

    for i in xrange(10):
        p = os.path.join(wdir, 'test_{i}.fa'.format(i=i))
        print p, err, out
        assert os.path.exists(p)
        os.remove(p)

    counts_fn = os.path.join(wdir, 'test.counts.csv')
    with open(counts_fn) as cfp:
        for line in cfp:
            _, _, c = line.partition(',')
            assert int(c) in [9, 10]

    assert os.path.exists(counts_fn)
    assert os.path.exists(os.path.join(wdir, 'test.dist.txt'))
    assert not os.path.exists(os.path.join(wdir, 'test_multi.fa'))

def test_tiny_real_partitions():
    filename = utils.get_test_data('real-partition-tiny.fa')

    ht = khmer.new_hashbits(32, 8e1, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()
    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert len(parts) == 2, len(parts)
    assert len(set(parts)) == 1
    assert set(parts) != set(['0'])

# nose attribute, set at module level on the function object
test_tiny_real_partitions.runme = True

def test_abundance_dist_single_nobigcount():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('abundance-dist-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-b', infile, outfile]
    runscript(script, args, in_dir)

    fp = iter(open(outfile))
    line = fp.next().strip()
    assert line == '1 96 96 0.98', line
    line = fp.next().strip()
    assert line == '255 2 98 1.0', line

def test_abund(self):
    ht = khmer.new_hashtable(10, 4 ** 10)

    filename = utils.get_test_data('test-abund-read.fa')
    outname = utils.get_temp_filename('test_abund.out')

    ht.consume_fasta(filename)
    try:
        ht.consume_fasta()
        assert 0, "should fail"
    except TypeError as err:
        print str(err)
    try:
        ht.consume_fasta("nonexistent")
        assert 0, "should fail"
    except IOError as err:
        print str(err)
    ht.output_fasta_kmer_pos_freq(filename, outname)
    try:
        ht.output_fasta_kmer_pos_freq()
        assert 0, "should fail"
    except TypeError as err:
        print str(err)

    fd = open(outname, "r")

    output = fd.readlines()
    assert len(output) == 1

    output = output[0]
    output = output.strip().split()

    assert ['1'] * (114 - 10 + 1) == output

    fd.close()

def test_feature_extraction_kmer():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('example.mito.fasta.shreded.subset'),
                    infile)
    outfile = infile + '.kmer_vector'
    script = scriptpath('feature_extraction_kmer.py')
    mmp = os.path.abspath("../scripts/gm_parameters/par_11.modified")
    print mmp
    tmp = os.path.abspath("./")
    print tmp
    print in_dir
    args = ["--input", infile, "--outfile", outfile, "--taxid", "12345",
            "--label", "taxid"]
    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile), outfile
    print outfile
    data = [x.strip() for x in open(outfile)]
    print len(data)
    assert len(data) == 54
    assert data[1].startswith("12345\tgi|511782593|ref|NC_021399.1")
    assert data[0].endswith(
        "0.00300180108065\t0.00640384230538\t0.00380228136882")
    assert data[-1].startswith(
        "12345\tgi|511782593|ref|NC_021399.1||pos|304493..309493")
    assert data[-1].endswith(
        "0.00300180108065\t0.00500300180108\t0.00200120072043")
    utils.cleanup()

def test_feature_extraction_metamark():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('example.mito.fasta.shreded.subset'),
                    infile)
    outfile = infile + '.metamark_vector'
    script = scriptpath('feature_extraction_metamark.py')
    mmp = os.path.abspath("../scripts/gm_parameters/par_11.modified")
    print mmp
    tmp = os.path.abspath("./")
    print tmp
    print in_dir
    args = ["--input", infile, "--outfile", outfile, "--tmp", tmp,
            "--mmp", mmp, "--taxid", "12345"]
    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile), outfile
    print outfile
    data = [x.strip() for x in open(outfile)]
    print len(data)
    assert len(data) == 30
    assert data[1].startswith("12345\tgi|511782593|ref|NC_021399.1")
    assert data[0].endswith("0.018679\t0.016415\t0.016415")
    assert data[-1].startswith(
        "12345\tgi|511782593|ref|NC_021399.1||pos|295582..300582")
    assert data[-1].endswith("0.023325\t0.019296\t0.021310")
    utils.cleanup()

def test_save_load_tagset_trunc():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.add_tag('G' * 32)
    ht.save_tagset(outfile)

    # truncate tagset file...
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()

    for i in range(len(data)):
        fp = open(outfile, 'wb')
        fp.write(data[:i])
        fp.close()

        # try loading it...
        try:
            ht.load_tagset(outfile)
            assert 0, "this test should fail"
        except IOError as err:
            print str(err), i

def test_load_graph():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-t']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    (status, out, err) = utils.runscript(script, args)

    assert 'Total number of k-mers: 3959' in err, err

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x

def test_abundance_dist_single_nobigcount():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('abundance-dist-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-b', infile, outfile]
    utils.runscript(script, args, in_dir)

    fp = iter(open(outfile))
    line = fp.next().strip()
    assert line == '1 96 96 0.98', line
    line = fp.next().strip()
    assert line == '255 2 98 1.0', line

def test_load_graph():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    (status, out, err) = runscript(script, args)
    assert status == 0

    ht_file = outfile + '.ht'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x

def _DEBUG_make_graph(infilename, SIZE=1e7, N=2, K=20,
                      do_partition=False,
                      annotate_partitions=False,
                      stop_big_traverse=False):
    script = scriptpath('load-graph.py')
    args = ['-x', str(SIZE), '-N', str(N), '-k', str(K)]

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data(infilename)

    args.extend([outfile, infile])

    (status, out, err) = DEBUG_runscript(script, args)
    assert status == 0

    ht_file = outfile + '.ht'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    if do_partition:
        print ">>>> DEBUG: Partitioning <<<"
        script = scriptpath('partition-graph.py')
        args = [outfile]
        if stop_big_traverse:
            args.insert(0, '--no-big-traverse')
        (status, out, err) = DEBUG_runscript(script, args)
        print out
        print err
        assert status == 0

        print ">>>> DEBUG: Merging Partitions <<<"
        script = scriptpath('merge-partitions.py')
        args = [outfile, '-k', str(K)]
        (status, out, err) = DEBUG_runscript(script, args)
        print out
        print err
        assert status == 0

        final_pmap_file = outfile + '.pmap.merged'
        assert os.path.exists(final_pmap_file)

        if annotate_partitions:
            print ">>>> DEBUG: Annotating Partitions <<<"
            script = scriptpath('annotate-partitions.py')
            args = ["-k", str(K), outfile, infilename]

            in_dir = os.path.dirname(outfile)
            (status, out, err) = DEBUG_runscript(script, args, in_dir)
            assert status == 0

            baseinfile = os.path.basename(infilename)
            assert os.path.exists(os.path.join(in_dir,
                                               baseinfile + '.part'))

    return outfile

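# Example use of the driver above (a sketch; the input file and flag
# combination are assumptions based on other tests in this file, not a
# prescribed invocation):
#
#   outfile = _DEBUG_make_graph('random-20-a.fa', do_partition=True,
#                               annotate_partitions=True)
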
def test_check_file_status_kfile():
    fn = utils.get_temp_filename('thisfiledoesnotexist')
    check_file_status_exited = False
    try:
        check_input_files(fn, False)
    except SystemExit:
        check_file_status_exited = True
    assert check_file_status_exited