Example #1
def test_normalize_by_median_dumpfrequency():
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(
            utils.get_temp_filename('test-{x}.fq'.format(x=x), tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
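    # -d 2 asks normalize-by-median to dump its counting table ('backup.ct')
    # every two input files; with five inputs, the asserts below expect two saves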
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = utils.runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
Example #2
def test_split_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data("paired.fa")

    ex_outfile1 = utils.get_test_data("paired.fa.1")
    ex_outfile2 = utils.get_test_data("paired.fa.2")

    # actual output files...
    outfile1 = utils.get_temp_filename("paired.fa.1")
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename("paired.fa.2", in_dir)

    script = scriptpath("split-paired-reads.py")
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #3
def test_filter_stoptags():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    stopfile = utils.get_temp_filename('stoptags', in_dir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # now, create a file with some stop tags in it --
    K = 18
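    # a minimal hashbits table (size 1) is enough here; it only needs to carry the stop tag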
    kh = khmer.new_hashbits(K, 1, 1)
    kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
    kh.save_stop_tags(stopfile)
    del kh

    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    args = ['-k', str(K), stopfile, infile, infile]
    utils.runscript(script, args, in_dir)

    # verify that the basic output file exists
    outfile = infile + '.stopfilt'
    assert os.path.exists(outfile), outfile

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs
Example #4
def test_normalize_by_median_force():
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename(
        'test-good.fq', tempdir=os.path.dirname(corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'), corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = utils.runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0
    assert os.path.exists(corrupt_infile + '.ct.failed')
    assert '*** Skipping' in err
    assert '** IOErrors' in err
Example #5
def test_extract_long_sequences():

    script = scriptpath('extract-long-sequences.py')
    fq_infile = utils.get_temp_filename('test.fq')
    fa_infile = utils.get_temp_filename('test.fa')

    shutil.copyfile(utils.get_test_data('paired-mixed.fq'), fq_infile)
    shutil.copyfile(utils.get_test_data('paired-mixed.fa'), fa_infile)

    fq_outfile = fq_infile + '.keep.fq'
    fa_outfile = fa_infile + '.keep.fa'

    in_dir_fq = os.path.dirname(fq_infile)
    in_dir_fa = os.path.dirname(fa_infile)

    args = [fq_infile, '-l', '10', '-o', fq_outfile]
    (status, out, err) = utils.runscript(script, args, in_dir_fq)

    countlines = sum(1 for line in open(fq_infile))
    assert countlines == 44, countlines

    args = [fa_infile, '-l', '10', '-o', fa_outfile]
    (status, out, err) = utils.runscript(script, args, in_dir_fa)

    countlines = sum(1 for line in open(fa_infile))
    assert countlines == 22, countlines
Example #6
def test_count_overlap():
    seqfile1 = utils.get_temp_filename('test-overlap1.fa')
    in_dir = os.path.dirname(seqfile1)
    seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
    outfile = utils.get_temp_filename('overlap.out', in_dir)
    curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
    shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
    shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
    htfile = _make_graph(seqfile1, ksize=20)
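    # _make_graph is a test helper that builds a presence table from seqfile1,
    # written to disk as htfile + '.pt' (consumed by the script below)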
    script = scriptpath('count-overlap.py')
    args = [
        '--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000',
        htfile + '.pt', seqfile2, outfile
    ]
    (status, out, err) = utils.runscript(script, args, in_dir)
    assert status == 0
    assert os.path.exists(outfile), outfile
    data = [x.strip() for x in open(outfile)]
    data = set(data)
    assert '# of unique k-mers in dataset2: 759047' in data
    assert '# of overlap unique k-mers: 245621' in data
    assert os.path.exists(curvefile), curvefile
    data = [x.strip() for x in open(curvefile)]
    data = set(data)
    assert '178633 1155' in data
    assert '496285 2970' in data
    assert '752053 238627' in data
Example #7
def test_split_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired.fq')

    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fq.2', in_dir)

    script = scriptpath('split-paired-reads.py')
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0
Example #8
def test_extract_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fa.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fa.se', in_dir)

    script = scriptpath('extract-paired-reads.py')
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #9
def test_make_initial_stoptags():
    # gen input files using load-graph.py -t
    # should keep test_data directory size down
    # or something like that
    # this assumes (obv.) load-graph works properly
    bzinfile = utils.get_temp_filename('test-reads.fq.bz2')
    shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile)
    in_dir = os.path.dirname(bzinfile)

    genscript = scriptpath('load-graph.py')
    genscriptargs = ['-t', 'test-reads', 'test-reads.fq.bz2']
    utils.runscript(genscript, genscriptargs, in_dir)

    # test input file gen'd by load-graphs
    infile = utils.get_temp_filename('test-reads.pt')
    infile2 = utils.get_temp_filename('test-reads.tagset', in_dir)

    # get file to compare against
    ex_outfile = utils.get_test_data('test-reads.stoptags')

    # actual output file
    outfile1 = utils.get_temp_filename('test-reads.stoptags', in_dir)

    script = scriptpath('make-initial-stoptags.py')
    # make-initial-stoptags has weird file argument syntax
    # read the code before modifying
    args = ['test-reads']

    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile1), outfile1
Example #10
def test_normalize_by_median_dumpfrequency():
    CUTOFF = "1"

    infiles = [utils.get_temp_filename("test-0.fq")]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(utils.get_temp_filename("test-{x}.fq".format(x=x), tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data("test-fastq-reads.fq"), infile)

    script = scriptpath("normalize-by-median.py")
    args = ["-d", "2", "-C", CUTOFF, "-k", "17"]
    args.extend(infiles)

    (status, out, err) = runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, "backup.ht"))
    test_good_read = "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
    test_good_read2 = "TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA"
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, "backup.ht"))
    assert out.count("Backup: Saving") == 2
    assert "Nothing" in out
Example #11
    def test_save_merge_from_disk(self):
        ht = khmer.new_hashbits(20, 4 ** 4 + 1)
        filename = utils.get_test_data('test-graph2.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
        assert total_reads == 3, total_reads

        divvy = ht.divide_tags_into_subsets(1)
        print divvy
        (a, b, c) = divvy

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')

        x = ht.do_subset_partition(a, b)
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(b, 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        ht.merge_subset_from_disk(outfile1)
        ht.merge_subset_from_disk(outfile2)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions        # combined.
Example #12
    def test_save_load_merge(self):
        raise Exception("this test coredumps on lyorn.  Disabled.")
        ht = khmer.new_hashbits(20, 4**14+1)
        filename = utils.get_test_data('test-graph2.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
        assert total_reads == 3, total_reads

        divvy = ht.divide_tags_into_subsets(1)
        print divvy
        (a, b, c) = divvy

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')
        
        x = ht.do_subset_partition(a, b)
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(b, 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        a = ht.load_subset_partitionmap(outfile1)
        b = ht.load_subset_partitionmap(outfile2)

        ht.merge_subset(a)
        ht.merge_subset(b)
        
        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions        # combined.
Example #13
def test_abund_dist_gz_bigcount():
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    script = scriptpath('load-into-counting.py')
    htfile = utils.get_temp_filename('test_ct')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = open(htfile, 'rb').read()
    f_out = gzip.open(outfile, 'wb')  # compress the created bigcount table
    f_out.write(data)
    f_out.close()
    # load the compressed bigcount table
    counting_hash = khmer.load_counting_hash(outfile)
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)
    abundances = counting_hash.abundance_distribution(infile, tracking)
    # calculate abundance distribution for compressed bigcount table
    flag = False
    # check if abundance is > 255
    # if ok  gzipped bigcount was loaded correctly
    for _, i in enumerate(abundances):
        print _, i
        if _ > 255 and i > 0:
            flag = True
            break
    assert flag
Example #14
    def test_save_merge_from_disk_2(self):
        raise Exception("this test coredumps on lyorn.  Disabled.")
        ht = khmer.new_hashbits(20, 4**14 + 1)
        filename = utils.get_test_data('random-20-a.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)

        subset_size = total_reads / 2 + total_reads % 2
        divvy = ht.divide_tags_into_subsets(subset_size)

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')

        x = ht.do_subset_partition(divvy[0], divvy[1])
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(divvy[1], 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        ht.merge_subset_from_disk(outfile1)
        ht.merge_subset_from_disk(outfile2)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions  # combined.
Example #15
def test_save_load_corrupted():
    lb_pre = LabelHash(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_fasta_and_tag_with_labels(filename)

    # save labels to a file
    savepath = utils.get_temp_filename('saved.labels')
    lb_pre.save_labels_and_tags(savepath)

    # trash the old LabelHash
    del lb_pre

    lb = LabelHash(20, 1e7, 4)

    # produce all possible truncated versions of this file
    data = open(savepath, 'rb').read()
    for i in range(len(data)):
        truncated = utils.get_temp_filename('trunc.labels')
        fp = open(truncated, 'wb')
        fp.write(data[:i])
        fp.close()

        try:
            lb.load_labels_and_tags(truncated)
            assert 0, "this should not succeed -- truncated file len %d" % (i,)
        except IOError as err:
            print 'expected failure for', i, ': ', str(err)
Example #16
    def test_save_merge_from_disk(self):
        raise Exception("this test coredumps on lyorn.  Disabled.")
        ht = khmer.new_hashbits(20, 4**14 + 1)
        filename = utils.get_test_data('test-graph2.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
        assert total_reads == 3, total_reads

        divvy = ht.divide_tags_into_subsets(1)
        print divvy
        (a, b, c) = divvy

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')

        x = ht.do_subset_partition(a, b)
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(b, 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        ht.merge_subset_from_disk(outfile1)
        ht.merge_subset_from_disk(outfile2)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions  # combined.
Example #17
    def test_random_20_a_succ_IV_save(self):
        ht = khmer.new_hashbits(20, 4**13 + 1)
        filename = utils.get_test_data('random-20-a.fa')

        savefile_ht = utils.get_temp_filename('ht')
        savefile_tags = utils.get_temp_filename('tags')
        outfile = utils.get_temp_filename('out')

        total_reads, _ = ht.consume_fasta_and_tag(filename)

        ht.save(savefile_ht)
        ht.save_tagset(savefile_tags)

        del ht
        ht = khmer.new_hashbits(20, 4**13 + 1)

        ht.load(savefile_ht)
        ht.load_tagset(savefile_tags)

        divvy = ht.divide_tags_into_subsets(1)
        divvy.append(0)
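        # the appended 0 marks "end of tag range", so each (divvy[i], divvy[i + 1])
        # pair below describes one complete subset to partition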

        subsets = []
        for i in range(len(divvy) - 1):
            x = ht.do_subset_partition(divvy[i], divvy[i + 1])
            subsets.append(x)

        for x in reversed(subsets):
            ht.merge_subset(x)

        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions
Example #18
    def test_save_load_merge_2(self):
        ht = khmer.new_hashbits(20, 4**8 + 1)
        filename = utils.get_test_data('random-20-a.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)

        subset_size = total_reads // 2 + total_reads % 2
        divvy = ht.divide_tags_into_subsets(subset_size)

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')

        x = ht.do_subset_partition(divvy[0], divvy[1])
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(divvy[1], 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        assert os.path.exists(outfile1)
        assert os.path.exists(outfile2)
        a = ht.load_subset_partitionmap(outfile1)
        b = ht.load_subset_partitionmap(outfile2)

        ht.merge_subset(a)
        ht.merge_subset(b)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions  # combined.
Example #19
def test_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress.
    in_file = open(savepath, 'rb')
    out_file = gzip.open(loadpath, 'wb')
    out_file.writelines(in_file)
    out_file.close()
    in_file.close()

    # load compressed hashtable.
    ht = khmer._new_counting_hash(12, sizes)
    ht.load(loadpath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Example #20
def test_abund_dist_gz_bigcount():
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    script = scriptpath('load-into-counting.py')
    htfile = utils.get_temp_filename('test_ct')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = open(htfile, 'rb').read()
    f_out = gzip.open(outfile, 'wb')  # compress the created bigcount table
    f_out.write(data)
    f_out.close()
    # load the compressed bigcount table
    counting_hash = khmer.load_counting_hash(outfile)
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)
    abundances = counting_hash.abundance_distribution(infile, tracking)
    # calculate abundance distribution for compressed bigcount table
    flag = False
    # check if abundance is > 255
    # if ok  gzipped bigcount was loaded correctly
    for _, i in enumerate(abundances):
        print _, i
        if _ > 255 and i > 0:
            flag = True
            break
    assert flag
Example #21
def test_save_load_merge_on_graph():
    ht = khmer.new_hashbits(20, 4 ** 4 + 1)
    filename = utils.get_test_data('test-graph2.fa')

    (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
    assert total_reads == 3, total_reads

    divvy = ht.divide_tags_into_subsets(1)
    print divvy
    assert len(divvy) == 3
    (a, b, c) = divvy

    outfile1 = utils.get_temp_filename('x.pmap')
    outfile2 = utils.get_temp_filename('y.pmap')

    x = ht.do_subset_partition(a, b)
    ht.save_subset_partitionmap(x, outfile1)
    del x

    y = ht.do_subset_partition(b, 0)
    ht.save_subset_partitionmap(y, outfile2)
    del y

    a = ht.load_partitionmap(outfile1)  # <-- this is different
    b = ht.load_subset_partitionmap(outfile2)

    ht.merge_subset(b)

    outfile = utils.get_temp_filename('out.part')
    n_partitions = ht.output_partitions(filename, outfile)
    assert n_partitions == 1, n_partitions        # combined.
Example #22
def test_normalize_by_median_dumpfrequency():
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(utils.get_temp_filename('test-{x}.fq'.format(x=x),
                                               tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
Example #23
def test_extract_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fa.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fa.se', in_dir)

    script = scriptpath('extract-paired-reads.py')
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #24
def test_make_initial_stoptags():
    # gen input files using load-graph.py -t
    # should keep test_data directory size down
    # or something like that
    # this assumes (obv.) load-graph works properly
    bzinfile = utils.get_temp_filename('test-reads.fq.bz2')
    shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile)
    in_dir = os.path.dirname(bzinfile)

    genscript = scriptpath('load-graph.py')
    genscriptargs = ['-t', 'test-reads', 'test-reads.fq.bz2']
    utils.runscript(genscript, genscriptargs, in_dir)

    # test input file gen'd by load-graphs
    infile = utils.get_temp_filename('test-reads.pt')
    infile2 = utils.get_temp_filename('test-reads.tagset', in_dir)

    # get file to compare against
    ex_outfile = utils.get_test_data('test-reads.stoptags')

    # actual output file
    outfile1 = utils.get_temp_filename('test-reads.stoptags', in_dir)

    script = scriptpath('make-initial-stoptags.py')
    # make-initial-stoptags has weird file argument syntax
    # read the code before modifying
    args = ['test-reads']

    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile1), outfile1
Example #25
    def test_random_20_a_succ_IV_save(self):
        ht = khmer.new_hashbits(20, 4 ** 7 + 1)
        filename = utils.get_test_data('random-20-a.fa')

        savefile_ht = utils.get_temp_filename('ht')
        savefile_tags = utils.get_temp_filename('tags')
        outfile = utils.get_temp_filename('out')

        total_reads, _ = ht.consume_fasta_and_tag(filename)

        ht.save(savefile_ht)
        ht.save_tagset(savefile_tags)

        del ht
        ht = khmer.new_hashbits(20, 4 ** 7 + 1)

        ht.load(savefile_ht)
        ht.load_tagset(savefile_tags)

        divvy = ht.divide_tags_into_subsets(1)
        divvy.append(0)

        subsets = []
        for i in range(len(divvy) - 1):
            x = ht.do_subset_partition(divvy[i], divvy[i + 1])
            subsets.append(x)

        for x in reversed(subsets):
            ht.merge_subset(x)

        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions
Example #26
    def test_save_load_merge(self):
        ht = khmer.new_hashbits(20, 4**4 + 1)
        filename = utils.get_test_data('test-graph2.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
        assert total_reads == 3, total_reads

        divvy = ht.divide_tags_into_subsets(1)
        print divvy
        assert len(divvy) == 3
        (a, b, c) = divvy

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')

        x = ht.do_subset_partition(a, b)
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(b, 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        a = ht.load_subset_partitionmap(outfile1)
        b = ht.load_subset_partitionmap(outfile2)

        ht.merge_subset(a)
        ht.merge_subset(b)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions  # combined.
Example #27
def test_filter_stoptags():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    stopfile = utils.get_temp_filename('stoptags', in_dir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # now, create a file with some stop tags in it --
    K = 18
    kh = khmer.new_hashbits(K, 1, 1)
    kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
    kh.save_stop_tags(stopfile)
    del kh
    
    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    args = ['-k', str(K), stopfile, infile, infile]
    (status, out, err) = runscript(script, args, in_dir)
    print out
    print err
    assert status == 0

    # verify that the basic output file exists
    outfile = infile + '.stopfilt'
    assert os.path.exists(outfile), outfile

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs
Example #28
    def test_save_load_merge_2(self):
        raise Exception("this test coredumps on lyorn.  Disabled.")
        ht = khmer.new_hashbits(20, 4**14+1)
        filename = utils.get_test_data('random-20-a.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)

        subset_size = total_reads / 2 + total_reads % 2
        divvy = ht.divide_tags_into_subsets(subset_size)

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')
        
        x = ht.do_subset_partition(divvy[0], divvy[1])
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(divvy[1], 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        a = ht.load_subset_partitionmap(outfile1)
        b = ht.load_subset_partitionmap(outfile2)

        ht.merge_subset(a)
        ht.merge_subset(b)
        
        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions        # combined.
Example #29
def test_count_overlap():
    seqfile1 = utils.get_temp_filename('test-overlap1.fa')
    in_dir = os.path.dirname(seqfile1)
    seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
    outfile = utils.get_temp_filename('overlap.out', in_dir)
    curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
    shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
    shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
    htfile = _make_graph(seqfile1, ksize=20)
    script = scriptpath('count-overlap.py')
    args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000',
            htfile + '.pt', seqfile2, outfile]
    (status, out, err) = runscript(script, args, in_dir)
    assert status == 0
    assert os.path.exists(outfile), outfile
    data = [x.strip() for x in open(outfile)]
    data = set(data)
    assert '# of unique k-mers in dataset2: 759047' in data
    assert '# of overlap unique k-mers: 245621' in data
    assert os.path.exists(curvefile), curvefile
    data = [x.strip() for x in open(curvefile)]
    data = set(data)
    assert '178633 1155' in data
    assert '496285 2970' in data
    assert '752053 238627' in data
Example #30
def test_split_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired.fq')

    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fq.2', in_dir)

    script = scriptpath('split-paired-reads.py')
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0
Example #31
    def test_save_merge_from_disk_2(self):
        ht = khmer.new_hashbits(20, 4 ** 7 + 1)
        filename = utils.get_test_data('random-20-a.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)

        subset_size = total_reads // 2 + total_reads % 2
        divvy = ht.divide_tags_into_subsets(subset_size)

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')

        x = ht.do_subset_partition(divvy[0], divvy[1])
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(divvy[1], 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        ht.merge_subset_from_disk(outfile1)
        ht.merge_subset_from_disk(outfile2)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions        # combined.
Example #32
def test_extract_long_sequences():

    script = scriptpath('extract-long-sequences.py')
    fq_infile = utils.get_temp_filename('test.fq')
    fa_infile = utils.get_temp_filename('test.fa')

    shutil.copyfile(utils.get_test_data('paired-mixed.fq'), fq_infile)
    shutil.copyfile(utils.get_test_data('paired-mixed.fa'), fa_infile)

    fq_outfile = fq_infile + '.keep.fq'
    fa_outfile = fa_infile + '.keep.fa'

    in_dir_fq = os.path.dirname(fq_infile)
    in_dir_fa = os.path.dirname(fa_infile)

    args = [fq_infile, '-l', '10', '-o', fq_outfile]
    (status, out, err) = runscript(script, args, in_dir_fq)

    countlines = sum(1 for line in open(fq_infile))
    assert countlines == 44, countlines

    args = [fa_infile, '-l', '10', '-o', fa_outfile]
    (status, out, err) = runscript(script, args, in_dir_fa)

    countlines = sum(1 for line in open(fa_infile))
    assert countlines == 22, countlines
Example #33
def test_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._new_counting_hash(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress.
    in_file = open(savepath, 'rb')
    out_file = gzip.open(loadpath, 'wb')
    out_file.writelines(in_file)
    out_file.close()
    in_file.close()

    # load compressed hashtable.
    ht = khmer._new_counting_hash(12, sizes)
    ht.load(loadpath)

    tracking = khmer._new_hashbits(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._new_hashbits(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Example #34
def test_normalize_by_median_force():
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename('test-good.fq',
                                          tempdir=os.path.dirname(
                                              corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'), corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0
    assert os.path.exists(corrupt_infile + '.ct.failed')
    assert '*** Skipping' in err
    assert '** IOErrors' in err
Example #35
def test_sweep_reads_fq():
    readfile = utils.get_temp_filename('reads.fa')
    contigfile = utils.get_temp_filename('contigs.fp')
    in_dir = os.path.dirname(contigfile)

    shutil.copyfile(utils.get_test_data('test-sweep-reads.fq'), readfile)
    shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)

    script = scriptpath('sweep-reads.py')
    args = [
        '-k', '25', '--prefix', 'test', '--label-by-pid', contigfile, readfile,
        'junkfile.fa'
    ]
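    # 'junkfile.fa' deliberately does not exist; the run below should report and skip it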

    status, out, err = utils.runscript(script,
                                       args,
                                       in_dir,
                                       fail_ok=True,
                                       sandbox=True)

    # check if the bad file was skipped without issue
    assert 'ERROR' in err, err
    assert 'skipping' in err, err

    out1 = os.path.join(in_dir, 'test_0.fq')
    out2 = os.path.join(in_dir, 'test_1.fq')
    mout = os.path.join(in_dir, 'test_multi.fq')
    oout = os.path.join(in_dir, 'test_orphaned.fq')

    assert os.path.exists(out1)
    assert os.path.exists(out2)
    assert os.path.exists(mout)
    assert os.path.exists(oout)
    print open(out1).read()

    print os.listdir(in_dir)

    seqs1 = set([r.name for r in screed.open(out1)])
    seqs2 = set([r.name for r in screed.open(out2)])
    seqsm = set([r.name for r in screed.open(mout)])
    seqso = set([r.name for r in screed.open(oout)])

    print seqs1
    print seqs2
    print seqsm
    print seqso
    assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
    assert seqs2 == set(['read3_p1\t1'])
    assert (seqsm == set(['read4_multi\t0\t1'])
            or seqsm == set(['read4_multi\t1\t0']))
    assert seqso == set(['read5_orphan'])

    seqs1 = set([r.quality for r in screed.open(out1)])
    seqs2 = set([r.quality for r in screed.open(out2)])
    seqsm = set([r.quality for r in screed.open(mout)])
    seqso = set([r.quality for r in screed.open(oout)])
Example #36
def test_sweep_reads_fq():
    readfile = utils.get_temp_filename('reads.fa')
    contigfile = utils.get_temp_filename('contigs.fp')
    in_dir = os.path.dirname(contigfile)

    shutil.copyfile(utils.get_test_data('test-sweep-reads.fq'), readfile)
    shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)

    script = scriptpath('sweep-reads.py')
    args = ['-k', '25', '--prefix', 'test', '--label-by-pid',
            contigfile, readfile, 'junkfile.fa']

    status, out, err = utils.runscript(
        script, args, in_dir, fail_ok=True, sandbox=True)

    # check if the bad file was skipped without issue
    assert 'ERROR' in err, err
    assert 'skipping' in err, err

    out1 = os.path.join(in_dir, 'test_0.fq')
    out2 = os.path.join(in_dir, 'test_1.fq')
    mout = os.path.join(in_dir, 'test_multi.fq')
    oout = os.path.join(in_dir, 'test_orphaned.fq')

    assert os.path.exists(out1)
    assert os.path.exists(out2)
    assert os.path.exists(mout)
    assert os.path.exists(oout)
    print open(out1).read()

    print os.listdir(in_dir)

    seqs1 = set([r.name for r in screed.open(out1)])
    seqs2 = set([r.name for r in screed.open(out2)])
    seqsm = set([r.name for r in screed.open(mout)])
    seqso = set([r.name for r in screed.open(oout)])

    print seqs1
    print seqs2
    print seqsm
    print seqso
    assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
    assert seqs2 == set(['read3_p1\t1'])
    assert (seqsm == set(['read4_multi\t0\t1']) or
            seqsm == set(['read4_multi\t1\t0']))
    assert seqso == set(['read5_orphan'])

    seqs1 = set([r.quality for r in screed.open(out1)])
    seqs2 = set([r.quality for r in screed.open(out2)])
    seqsm = set([r.quality for r in screed.open(mout)])
    seqso = set([r.quality for r in screed.open(oout)])
Example #37
    def test_save_no_load(self):
        filename = utils.get_temp_filename('tst')

        rt = self.rt

        rt.set(0, True)
        rt.set(1, False)
        rt.set(2, True)
        rt.set(3, False)
        rt.set(4, True)

        rt.save(filename)

        rt2 = khmer.new_readmask(READTABLE_SIZE)
        # no load!

        try:
            for i in range(5):
                if rt.get(i) != rt2.get(i):
                    raise Exception  # supposed to happen; no load.
            assert 0
        except AssertionError:
            raise
        except Exception:
            pass
Example #38
    def test_save_no_load(self):
        filename = utils.get_temp_filename('save')

        mmt = self.mmt

        for i in range(0, MINMAXTABLE_SIZE):
            mmt.add_min(i, i)
            mmt.add_max(i, MINMAXTABLE_SIZE - i)

        mmt.save(filename)

        mmt2 = khmer.new_minmax(MINMAXTABLE_SIZE)
        # no load!

        try:
            for i in range(0, MINMAXTABLE_SIZE):
                if mmt2.get_min(i) != mmt.get_min(i):
                    raise Exception  # supposed to happen!

            assert 0
        except AssertionError:
            raise
        except Exception:
            pass

        try:
            for i in range(0, MINMAXTABLE_SIZE):
                if mmt2.get_max(i) != mmt.get_max(i):
                    raise Exception  # supposed to happen!
            assert 0
        except AssertionError:
            raise
        except Exception:
            pass
Example #39
    def test_filter_limit_n(self):
        ht = khmer.new_hashtable(4, 4**4)

        filename = utils.get_test_data('simple_3.fa')
        outname = utils.get_temp_filename('test_filter.out')

        (total_reads, n_consumed) = ht.consume_fasta(filename)
        assert total_reads == 2, total_reads

        (total_reads, n_seq_kept) = \
            khmer.filter_fasta_file_limit_n(ht, filename,
                                            total_reads,
                                            outname, 2,
                                            7)

        assert total_reads == 2
        assert n_seq_kept == 1


        (total_reads, n_seq_kept) = \
            khmer.filter_fasta_file_limit_n(ht, filename,
                                            total_reads,
                                            outname, 2,
                                            4)

        assert total_reads == 2
        assert n_seq_kept == 2
Example #40
def test_save_load_tagset_trunc():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.add_tag('G' * 32)
    ht.save_tagset(outfile)
    ht.save_tagset('/tmp/goodversion-k32.tagset')

    # truncate tagset file...
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()

    fp = open(outfile, 'wb')
    fp.write(data[:26])
    fp.close()

    # try loading it...
    try:
        ht.load_tagset(outfile)
        assert 0, "this test should fail"
    except IOError:
        pass
Example #41
def test_sweep_reads_3():

    infile = utils.get_temp_filename('seqs.fa')
    shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
    wdir = os.path.dirname(infile)
    script = scriptpath('sweep-reads.py')
    args = ['-m', '75', '-k', '20', '-l', '1', '--prefix',
            'test', '--label-by-group', '10', infile, infile]
    status, out, err = utils.runscript(script, args, wdir, sandbox=True)

    for i in xrange(10):
        p = os.path.join(wdir, 'test_{i}.fa'.format(i=i))
        print p, err, out
        assert os.path.exists(p)
        os.remove(p)

    counts_fn = os.path.join(wdir, 'test.counts.csv')
    with open(counts_fn) as cfp:
        for line in cfp:
            _, _, c = line.partition(',')
            assert int(c) in [9, 10]

    assert os.path.exists(counts_fn)
    assert os.path.exists(os.path.join(wdir, 'test.dist.txt'))
    assert not os.path.exists(os.path.join(wdir, 'test_multi.fa'))
Example #42
def test_save_load_tagset_trunc():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.add_tag('G' * 32)
    ht.save_tagset(outfile)
    ht.save_tagset('/tmp/goodversion-k32.tagset')

    # truncate tagset file...
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()

    fp = open(outfile, 'wb')
    fp.write(data[:26])
    fp.close()

    # try loading it...
    try:
        ht.load_tagset(outfile)
        assert 0, "this test should fail"
    except IOError:
        pass
Example #43
def test_tiny_real_partitions():
    filename = utils.get_test_data('real-partition-tiny.fa')

    ht = khmer.new_hashbits(32, 8e1, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
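    # (0, 0) spans the entire tag range, partitioning the whole graph at once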
    ht.merge_subset(subset)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()

    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert len(parts) == 2, len(parts)
    assert len(set(parts)) == 1
    assert set(parts) != set(['0'])

    test_tiny_real_partitions.runme = True
Example #44
def test_abundance_dist_single_nobigcount():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('abundance-dist-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-b', infile, outfile]
    runscript(script, args, in_dir)

    fp = iter(open(outfile))
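    # output columns: abundance, count, cumulative count, cumulative fraction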
    line = fp.next().strip()
    assert line == '1 96 96 0.98', line
    line = fp.next().strip()
    assert line == '255 2 98 1.0', line
Example #45
    def test_abund(self):
        ht = khmer.new_hashtable(10, 4**10)

        filename = utils.get_test_data('test-abund-read.fa')
        outname = utils.get_temp_filename('test_abund.out')

        ht.consume_fasta(filename)
        try:
            ht.consume_fasta()
            assert 0, "should fail"
        except TypeError as err:
            print str(err)
        try:
            ht.consume_fasta("nonexistent")
            assert 0, "should fail"
        except IOError as err:
            print str(err)
        ht.output_fasta_kmer_pos_freq(filename, outname)
        try:
            ht.output_fasta_kmer_pos_freq()
            assert 0, "should fail"
        except TypeError as err:
            print str(err)

        fd = open(outname, "r")

        output = fd.readlines()
        assert len(output) == 1

        output = output[0]
        output = output.strip().split()

        assert ['1'] * (114 - 10 + 1) == output

        fd.close()
Example #46
    def test_abund(self):
        ht = khmer.new_hashtable(10, 4 ** 10)

        filename = utils.get_test_data('test-abund-read.fa')
        outname = utils.get_temp_filename('test_abund.out')

        ht.consume_fasta(filename)
        try:
            ht.consume_fasta()
            assert 0, "should fail"
        except TypeError as err:
            print str(err)
        try:
            ht.consume_fasta("nonexistent")
            assert 0, "should fail"
        except IOError as err:
            print str(err)
        ht.output_fasta_kmer_pos_freq(filename, outname)
        try:
            ht.output_fasta_kmer_pos_freq()
            assert 0, "should fail"
        except TypeError as err:
            print str(err)

        fd = open(outname, "r")

        output = fd.readlines()
        assert len(output) == 1

        output = output[0]
        output = output.strip().split()

        assert ['1'] * (114 - 10 + 1) == output

        fd.close()
Example #47
def test_feature_extraction_kmer():

    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('example.mito.fasta.shreded.subset'),
                    infile)

    outfile = infile + '.kmer_vector'
    script = scriptpath('feature_extraction_kmer.py')
    mmp = os.path.abspath("../scripts/gm_parameters/par_11.modified")
    print mmp
    tmp = os.path.abspath("./")
    print tmp
    print in_dir
    args = [
        "--input", infile, "--outfile", outfile, "--taxid", "12345", "--label",
        "taxid"
    ]
    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile), outfile
    print outfile
    data = [x.strip() for x in open(outfile)]
    print len(data)
    assert len(data) == 54
    assert data[1].startswith("12345\tgi|511782593|ref|NC_021399.1")
    assert data[0].endswith(
        "0.00300180108065\t0.00640384230538\t0.00380228136882")
    assert data[-1].startswith(
        "12345\tgi|511782593|ref|NC_021399.1||pos|304493..309493")
    assert data[-1].endswith(
        "0.00300180108065\t0.00500300180108\t0.00200120072043")
    utils.cleanup()
Example #48
def test_feature_extraction_metamark():

    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('example.mito.fasta.shreded.subset'),
                    infile)

    outfile = infile + '.metamark_vector'
    script = scriptpath('feature_extraction_metamark.py')
    mmp = os.path.abspath("../scripts/gm_parameters/par_11.modified")
    print mmp
    tmp = os.path.abspath("./")
    print tmp
    print in_dir
    args = [
        "--input", infile, "--outfile", outfile, "--tmp", tmp, "--mmp", mmp,
        "--taxid", "12345"
    ]
    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile), outfile
    print outfile
    data = [x.strip() for x in open(outfile)]
    print len(data)
    assert len(data) == 30
    assert data[1].startswith("12345\tgi|511782593|ref|NC_021399.1")
    assert data[0].endswith("0.018679\t0.016415\t0.016415")
    assert data[-1].startswith(
        "12345\tgi|511782593|ref|NC_021399.1||pos|295582..300582")
    assert data[-1].endswith("0.023325\t0.019296\t0.021310")
    utils.cleanup()
Example #49
def test_save_load_tagset_trunc():
    ht = khmer.new_hashbits(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.add_tag('G' * 32)
    ht.save_tagset(outfile)

    # truncate tagset file...
    fp = open(outfile, 'rb')
    data = fp.read()
    fp.close()

    for i in range(len(data)):
        fp = open(outfile, 'wb')
        fp.write(data[:i])
        fp.close()

        # try loading it...
        try:
            ht.load_tagset(outfile)
            assert 0, "this test should fail"
        except IOError as err:
            print str(err), i
Example #50
    def test_filter_limit_n(self):
        ht = khmer.new_hashtable(4, 4**4)

        filename = utils.get_test_data('simple_3.fa')
        outname = utils.get_temp_filename('test_filter.out')

        (total_reads, n_consumed) = ht.consume_fasta(filename)
        assert total_reads == 2, total_reads

        (total_reads, n_seq_kept) = \
            khmer.filter_fasta_file_limit_n(ht, filename,
                                            total_reads,
                                            outname, 2,
                                            7)


        assert total_reads == 2
        assert n_seq_kept == 1

        (total_reads, n_seq_kept) = \
            khmer.filter_fasta_file_limit_n(ht, filename,
                                            total_reads,
                                            outname, 2,
                                            4)

        assert total_reads == 2
        assert n_seq_kept == 2
Example #51
    def test_save_no_load(self):
        filename = utils.get_temp_filename('save')

        mmt = self.mmt

        for i in range(0, MINMAXTABLE_SIZE):
            mmt.add_min(i, i)
            mmt.add_max(i, MINMAXTABLE_SIZE-i)

        mmt.save(filename)

        mmt2 = khmer.new_minmax(MINMAXTABLE_SIZE)
        # no load!

        try:
            for i in range(0, MINMAXTABLE_SIZE):
                if mmt2.get_min(i) != mmt.get_min(i):
                    raise Exception         # supposed to happen!

            assert 0
        except AssertionError:
            raise
        except Exception:
            pass

        try:
            for i in range(0, MINMAXTABLE_SIZE):
                if mmt2.get_max(i) != mmt.get_max(i):
                    raise Exception         # supposed to happen!
            assert 0
        except AssertionError:
            raise
        except Exception:
            pass
Example #52
def test_tiny_real_partitions():
    filename = utils.get_test_data('real-partition-tiny.fa')

    ht = khmer.new_hashbits(32, 8e1, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()

    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert len(parts) == 2, len(parts)
    assert len(set(parts)) == 1
    assert set(parts) != set(['0'])

    test_tiny_real_partitions.runme = True
Example #53
    def test_save_no_load(self):
        filename = utils.get_temp_filename('tst')

        rt = self.rt
        
        rt.set(0, True)
        rt.set(1, False)
        rt.set(2, True)
        rt.set(3, False)
        rt.set(4, True)

        rt.save(filename)
        
        rt2 = khmer.new_readmask(READTABLE_SIZE)
        # no load!

        try:
            for i in range(5):
                if rt.get(i) != rt2.get(i):
                    raise Exception     # supposed to happen; no load.
            assert 0
        except AssertionError:
            raise
        except Exception:
            pass
Example #54
def test_load_graph():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-t']
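    # '-t' presumably enables tagset output in this version of load-graph;
    # the test checks for <outfile>.tagset below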

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    (status, out, err) = utils.runscript(script, args)

    assert 'Total number of k-mers: 3959' in err, err

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x
Example #55
def test_abundance_dist_single_nobigcount():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('abundance-dist-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-b', infile, outfile]
    utils.runscript(script, args, in_dir)

    fp = iter(open(outfile))
    line = fp.next().strip()
    assert line == '1 96 96 0.98', line
    line = fp.next().strip()
    assert line == '255 2 98 1.0', line
Example #56
def test_load_graph():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    (status, out, err) = runscript(script, args)
    assert status == 0

    ht_file = outfile + '.ht'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x
Example #57
def test_sweep_reads_3():

    infile = utils.get_temp_filename('seqs.fa')
    shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
    wdir = os.path.dirname(infile)
    script = scriptpath('sweep-reads.py')
    args = ['-m', '75', '-k', '20', '-l', '1', '--prefix',
            'test', '--label-by-group', '10', infile, infile]
    status, out, err = utils.runscript(script, args, wdir, sandbox=True)

    for i in xrange(10):
        p = os.path.join(wdir, 'test_{i}.fa'.format(i=i))
        print p, err, out
        assert os.path.exists(p)
        os.remove(p)

    counts_fn = os.path.join(wdir, 'test.counts.csv')
    with open(counts_fn) as cfp:
        for line in cfp:
            _, _, c = line.partition(',')
            assert int(c) in [9, 10]

    assert os.path.exists(counts_fn)
    assert os.path.exists(os.path.join(wdir, 'test.dist.txt'))
    assert not os.path.exists(os.path.join(wdir, 'test_multi.fa'))
Example #58
def _DEBUG_make_graph(infilename,
                      SIZE=1e7,
                      N=2,
                      K=20,
                      do_partition=False,
                      annotate_partitions=False,
                      stop_big_traverse=False):
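    # debug helper driving the full pipeline: load-graph, then optionally
    # partition-graph, merge-partitions, and annotate-partitions; returns the output prefix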
    script = scriptpath('load-graph.py')
    args = ['-x', str(SIZE), '-N', str(N), '-k', str(K)]

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data(infilename)

    args.extend([outfile, infile])

    (status, out, err) = DEBUG_runscript(script, args)
    assert status == 0

    ht_file = outfile + '.ht'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    if do_partition:
        print ">>>> DEBUG: Partitioning <<<"
        script = scriptpath('partition-graph.py')
        args = [outfile]
        if stop_big_traverse:
            args.insert(0, '--no-big-traverse')
        (status, out, err) = DEBUG_runscript(script, args)
        print out
        print err
        assert status == 0

        print ">>>> DEBUG: Merging Partitions <<<"
        script = scriptpath('merge-partitions.py')
        args = [outfile, '-k', str(K)]
        (status, out, err) = DEBUG_runscript(script, args)
        print out
        print err
        assert status == 0

        final_pmap_file = outfile + '.pmap.merged'
        assert os.path.exists(final_pmap_file)

        if annotate_partitions:
            print ">>>> DEBUG: Annotating Partitions <<<"
            script = scriptpath('annotate-partitions.py')
            args = ["-k", str(K), outfile, infilename]

            in_dir = os.path.dirname(outfile)
            (status, out, err) = DEBUG_runscript(script, args, in_dir)
            assert status == 0

            baseinfile = os.path.basename(infilename)
            assert os.path.exists(os.path.join(in_dir, baseinfile + '.part'))

    return outfile
Example #59
def test_check_file_status_kfile():
    fn = utils.get_temp_filename('thisfiledoesnotexist')
    check_file_status_exited = False
    try:
        check_input_files(fn, False)
    except SystemExit:
        check_file_status_exited = True
    assert check_file_status_exited