Example #1
0
def test_extract_partitions_output_unassigned():
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = _make_graph(
        seqfile, do_partition=True, annotate_partitions=True)
    in_dir = os.path.dirname(graphbase)

    # get the final part file
    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    # ok, now run extract-partitions.
    script = scriptpath('extract-partitions.py')
    args = ['-U', 'extracted', partfile]

    utils.runscript(script, args, in_dir)

    distfile = os.path.join(in_dir, 'extracted.dist')
    groupfile = os.path.join(in_dir, 'extracted.group0000.fa')
    unassigned_file = os.path.join(in_dir, 'extracted.unassigned.fa')
    assert os.path.exists(distfile)
    assert os.path.exists(groupfile)
    assert os.path.exists(unassigned_file)

    dist = open(distfile).readline()
    assert dist.strip() == '99 1 1 99'

    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    assert len(parts) == 99, len(parts)
    parts = set(parts)
    assert len(parts) == 1, len(parts)
Example #2
0
def test_make_initial_stoptags():
    # gen input files using load-graph.py -t
    # should keep test_data directory size down
    # or something like that
    # this assumes (obv.) load-graph works properly
    bzinfile = utils.get_temp_filename('test-reads.fq.bz2')
    shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile)
    in_dir = os.path.dirname(bzinfile)

    genscript = scriptpath('load-graph.py')
    genscriptargs = ['-t', 'test-reads', 'test-reads.fq.bz2']
    utils.runscript(genscript, genscriptargs, in_dir)

    # test input file gen'd by load-graphs
    infile = utils.get_temp_filename('test-reads.pt')
    infile2 = utils.get_temp_filename('test-reads.tagset', in_dir)

    # get file to compare against
    ex_outfile = utils.get_test_data('test-reads.stoptags')

    # actual output file
    outfile1 = utils.get_temp_filename('test-reads.stoptags', in_dir)

    script = scriptpath('make-initial-stoptags.py')
    # make-initial-stoptags has weird file argument syntax
    # read the code before modifying
    args = ['test-reads']

    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile1), outfile1
Example #3
0
def test_extract_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fa.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fa.se', in_dir)

    script = scriptpath('extract-paired-reads.py')
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #4
0
def test_feature_extraction_metamark():

    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('example.mito.fasta.shreded.subset'),
                    infile)

    outfile = infile + '.metamark_vector'
    script = scriptpath('feature_extraction_metamark.py')
    mmp = os.path.abspath("../scripts/gm_parameters/par_11.modified")
    print mmp
    tmp = os.path.abspath("./")
    print tmp
    print in_dir
    args = [
        "--input", infile, "--outfile", outfile, "--tmp", tmp, "--mmp", mmp,
        "--taxid", "12345"
    ]
    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile), outfile
    print outfile
    data = [x.strip() for x in open(outfile)]
    print len(data)
    assert len(data) == 30
    assert data[1].startswith("12345\tgi|511782593|ref|NC_021399.1") == True
    assert data[0].endswith("0.018679\t0.016415\t0.016415") == True
    assert data[-1].startswith(
        "12345\tgi|511782593|ref|NC_021399.1||pos|295582..300582") == True
    assert data[-1].endswith("0.023325\t0.019296\t0.021310") == True
    utils.cleanup()
Example #5
0
def test_filter_stoptags():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    stopfile = utils.get_temp_filename('stoptags', in_dir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # now, create a file with some stop tags in it --
    K = 18
    kh = khmer.new_hashbits(K, 1, 1)
    kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
    kh.save_stop_tags(stopfile)
    del kh

    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    args = ['-k', str(K), stopfile, infile, infile]
    utils.runscript(script, args, in_dir)

    # verify that the basic output file exists
    outfile = infile + '.stopfilt'
    assert os.path.exists(outfile), outfile

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs
Example #6
0
def test_extract_partitions_output_unassigned():
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = _make_graph(seqfile,
                            do_partition=True,
                            annotate_partitions=True)
    in_dir = os.path.dirname(graphbase)

    # get the final part file
    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    # ok, now run extract-partitions.
    script = scriptpath('extract-partitions.py')
    args = ['-U', 'extracted', partfile]

    utils.runscript(script, args, in_dir)

    distfile = os.path.join(in_dir, 'extracted.dist')
    groupfile = os.path.join(in_dir, 'extracted.group0000.fa')
    unassigned_file = os.path.join(in_dir, 'extracted.unassigned.fa')
    assert os.path.exists(distfile)
    assert os.path.exists(groupfile)
    assert os.path.exists(unassigned_file)

    dist = open(distfile).readline()
    assert dist.strip() == '99 1 1 99'

    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    assert len(parts) == 99, len(parts)
    parts = set(parts)
    assert len(parts) == 1, len(parts)
Example #7
0
def test_abund_dist_gz_bigcount():
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    script = scriptpath('load-into-counting.py')
    htfile = utils.get_temp_filename('test_ct')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = open(htfile, 'rb').read()
    f_out = gzip.open(outfile, 'wb')  # compress the created bigcount table
    f_out.write(data)
    f_out.close()
    # load the compressed bigcount table
    counting_hash = khmer.load_counting_hash(outfile)
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)
    abundances = counting_hash.abundance_distribution(infile, tracking)
    # calculate abundance distribution for compressed bigcount table
    flag = False
    # check if abundance is > 255
    # if ok  gzipped bigcount was loaded correctly
    for _, i in enumerate(abundances):
        print _, i
        if _ > 255 and i > 0:
            flag = True
            break
    assert flag
Example #8
0
def test_split_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired.fq')

    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fq.2', in_dir)

    script = scriptpath('split-paired-reads.py')
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0
Example #9
0
def test_extract_long_sequences():

    script = scriptpath('extract-long-sequences.py')
    fq_infile = utils.get_temp_filename('test.fq')
    fa_infile = utils.get_temp_filename('test.fa')

    shutil.copyfile(utils.get_test_data('paired-mixed.fq'), fq_infile)
    shutil.copyfile(utils.get_test_data('paired-mixed.fa'), fa_infile)

    fq_outfile = fq_infile + '.keep.fq'
    fa_outfile = fa_infile + '.keep.fa'

    in_dir_fq = os.path.dirname(fq_infile)
    in_dir_fa = os.path.dirname(fa_infile)

    args = [fq_infile, '-l', '10', '-o', 'fq_outfile']
    (status, out, err) = utils.runscript(script, args, in_dir_fa)

    countlines = sum(1 for line in open(fq_infile))
    assert countlines == 44, countlines

    args = [fa_infile, '-l', '10', '-o', 'fa_outfile']
    (status, out, err) = utils.runscript(script, args, in_dir_fa)

    countlines = sum(1 for line in open(fa_infile))
    assert countlines == 22, countlines
Example #10
0
def test_partition_graph_nojoin_stoptags():
    # test with stoptags
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))

    # add in some stop tags
    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
    stoptags_file = graphbase + '.stoptags'
    ht.save_stop_tags(stoptags_file)
    del ht

    # run script with stoptags option
    script = scriptpath('partition-graph.py')
    args = ['--stoptags', stoptags_file, graphbase]

    utils.runscript(script, args)

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(20)]
    utils.runscript(script, args)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.load_tagset(graphbase + '.tagset')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (2, 0), x          # should be 2 partitions
Example #11
0
def test_extract_long_sequences():

    script = scriptpath('extract-long-sequences.py')
    fq_infile = utils.get_temp_filename('test.fq')
    fa_infile = utils.get_temp_filename('test.fa')

    shutil.copyfile(utils.get_test_data('paired-mixed.fq'), fq_infile)
    shutil.copyfile(utils.get_test_data('paired-mixed.fa'), fa_infile)

    fq_outfile = fq_infile + '.keep.fq'
    fa_outfile = fa_infile + '.keep.fa'

    in_dir_fq = os.path.dirname(fq_infile)
    in_dir_fa = os.path.dirname(fa_infile)

    args = [fq_infile, '-l', '10', '-o', 'fq_outfile']
    (status, out, err) = utils.runscript(script, args, in_dir_fa)

    countlines = sum(1 for line in open(fq_infile))
    assert countlines == 44, countlines

    args = [fa_infile, '-l', '10', '-o', 'fa_outfile']
    (status, out, err) = utils.runscript(script, args, in_dir_fa)

    countlines = sum(1 for line in open(fa_infile))
    assert countlines == 22, countlines
Example #12
0
def test_partition_graph_nojoin_stoptags():
    # test with stoptags
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))

    # add in some stop tags
    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
    stoptags_file = graphbase + '.stoptags'
    ht.save_stop_tags(stoptags_file)
    del ht

    # run script with stoptags option
    script = scriptpath('partition-graph.py')
    args = ['--stoptags', stoptags_file, graphbase]

    utils.runscript(script, args)

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(20)]
    utils.runscript(script, args)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.load_tagset(graphbase + '.tagset')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (2, 0), x  # should be 2 partitions
Example #13
0
def test_make_initial_stoptags():
    # gen input files using load-graph.py -t
    # should keep test_data directory size down
    # or something like that
    # this assumes (obv.) load-graph works properly
    bzinfile = utils.get_temp_filename('test-reads.fq.bz2')
    shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile)
    in_dir = os.path.dirname(bzinfile)

    genscript = scriptpath('load-graph.py')
    genscriptargs = ['-t', 'test-reads', 'test-reads.fq.bz2']
    utils.runscript(genscript, genscriptargs, in_dir)

    # test input file gen'd by load-graphs
    infile = utils.get_temp_filename('test-reads.pt')
    infile2 = utils.get_temp_filename('test-reads.tagset', in_dir)

    # get file to compare against
    ex_outfile = utils.get_test_data('test-reads.stoptags')

    # actual output file
    outfile1 = utils.get_temp_filename('test-reads.stoptags', in_dir)

    script = scriptpath('make-initial-stoptags.py')
    # make-initial-stoptags has weird file argument syntax
    # read the code before modifying
    args = ['test-reads']

    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile1), outfile1
Example #14
0
def test_abund_dist_gz_bigcount():
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    script = scriptpath('load-into-counting.py')
    htfile = utils.get_temp_filename('test_ct')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = open(htfile, 'rb').read()
    f_out = gzip.open(outfile, 'wb')  # compress the created bigcount table
    f_out.write(data)
    f_out.close()
    # load the compressed bigcount table
    counting_hash = khmer.load_counting_hash(outfile)
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)
    abundances = counting_hash.abundance_distribution(infile, tracking)
    # calculate abundance distribution for compressed bigcount table
    flag = False
    # check if abundance is > 255
    # if ok  gzipped bigcount was loaded correctly
    for _, i in enumerate(abundances):
        print _, i
        if _ > 255 and i > 0:
            flag = True
            break
    assert flag
Example #15
0
def test_split_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired.fq')

    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fq.2', in_dir)

    script = scriptpath('split-paired-reads.py')
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0
Example #16
0
def test_feature_extraction_kmer():

    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('example.mito.fasta.shreded.subset'),
                    infile)

    outfile = infile + '.kmer_vector'
    script = scriptpath('feature_extraction_kmer.py')
    mmp = os.path.abspath("../scripts/gm_parameters/par_11.modified")
    print mmp
    tmp = os.path.abspath("./")
    print tmp
    print in_dir
    args = [
        "--input", infile, "--outfile", outfile, "--taxid", "12345", "--label",
        "taxid"
    ]
    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile), outfile
    print outfile
    data = [x.strip() for x in open(outfile)]
    print len(data)
    assert len(data) == 54
    assert data[1].startswith("12345\tgi|511782593|ref|NC_021399.1") == True
    assert data[0].endswith(
        "0.00300180108065\t0.00640384230538\t0.00380228136882") == True
    assert data[-1].startswith(
        "12345\tgi|511782593|ref|NC_021399.1||pos|304493..309493") == True
    assert data[-1].endswith(
        "0.00300180108065\t0.00500300180108\t0.00200120072043") == True
    utils.cleanup()
Example #17
0
def test_filter_stoptags():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    stopfile = utils.get_temp_filename('stoptags', in_dir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # now, create a file with some stop tags in it --
    K = 18
    kh = khmer.new_hashbits(K, 1, 1)
    kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
    kh.save_stop_tags(stopfile)
    del kh

    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    args = ['-k', str(K), stopfile, infile, infile]
    utils.runscript(script, args, in_dir)

    # verify that the basic output file exists
    outfile = infile + '.stopfilt'
    assert os.path.exists(outfile), outfile

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs
Example #18
0
def test_extract_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fa.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fa.se', in_dir)

    script = scriptpath('extract-paired-reads.py')
    args = [infile]

    utils.runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #19
0
def test_sample_reads_randomly_S():
    infile = utils.get_temp_filename('test.fq')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('sample-reads-randomly.py')

    # fix random number seed for reproducibility
    args = ['-N', '10', '-R', '1', '-S', '3']

    badargs = list(args)
    badargs.extend(['-o', 'test', 'test.fq', 'test.fq'])
    (status, out, err) = utils.runscript(script, badargs, in_dir, fail_ok=True)
    assert status == -1, (status, out, err)

    args.append('test.fq')

    utils.runscript(script, args, in_dir)

    outfile = infile + '.subset.0'
    assert os.path.exists(outfile), outfile

    seqs = set([r.name for r in screed.open(outfile)])
    print seqs
    assert seqs == set([
        '895:1:1:1298:13380', '895:1:1:1347:3237', '895:1:1:1295:6189',
        '895:1:1:1342:11001', '895:1:1:1252:19493', '895:1:1:1318:10532',
        '895:1:1:1314:10430', '895:1:1:1347:8723', '895:1:1:1381:4958',
        '895:1:1:1338:6614'
    ])

    outfile = infile + '.subset.1'
    assert os.path.exists(outfile), outfile

    seqs = set([r.name for r in screed.open(outfile)])
    print seqs
    assert seqs == set([
        '895:1:1:1384:20217', '895:1:1:1347:3237', '895:1:1:1348:18672',
        '895:1:1:1290:11501', '895:1:1:1386:7536', '895:1:1:1373:13994',
        '895:1:1:1355:13535', '895:1:1:1303:6251', '895:1:1:1381:4958',
        '895:1:1:1338:6614'
    ])

    outfile = infile + '.subset.2'
    assert os.path.exists(outfile), outfile

    seqs = set([r.name for r in screed.open(outfile)])
    print seqs
    assert seqs == set([
        '895:1:1:1326:7273', '895:1:1:1384:20217', '895:1:1:1347:3237',
        '895:1:1:1353:6642', '895:1:1:1340:19387', '895:1:1:1252:19493',
        '895:1:1:1381:7062', '895:1:1:1383:3089', '895:1:1:1342:20695',
        '895:1:1:1303:6251'
    ])
Example #20
0
def test_normalize_by_median_impaired():
    CUTOFF = '1'

    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-impaired.fa'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-C', CUTOFF, '-p', '-k', '17', infile]
    utils.runscript(script, args, in_dir, fail_ok=True)
Example #21
0
def test_normalize_by_median_impaired():
    CUTOFF = '1'

    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-impaired.fa'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-C', CUTOFF, '-p', '-k', '17', infile]
    utils.runscript(script, args, in_dir, fail_ok=True)
Example #22
0
def test_sample_reads_randomly_S():
    infile = utils.get_temp_filename('test.fq')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('sample-reads-randomly.py')

    # fix random number seed for reproducibility
    args = ['-N', '10', '-R', '1', '-S', '3']

    badargs = list(args)
    badargs.extend(['-o', 'test', 'test.fq', 'test.fq'])
    (status, out, err) = utils.runscript(script, badargs, in_dir, fail_ok=True)
    assert status == -1, (status, out, err)

    args.append('test.fq')

    utils.runscript(script, args, in_dir)

    outfile = infile + '.subset.0'
    assert os.path.exists(outfile), outfile

    seqs = set([r.name for r in screed.open(outfile)])
    print seqs
    assert seqs == set(['895:1:1:1298:13380', '895:1:1:1347:3237',
                        '895:1:1:1295:6189', '895:1:1:1342:11001',
                        '895:1:1:1252:19493', '895:1:1:1318:10532',
                        '895:1:1:1314:10430', '895:1:1:1347:8723',
                        '895:1:1:1381:4958', '895:1:1:1338:6614'])

    outfile = infile + '.subset.1'
    assert os.path.exists(outfile), outfile

    seqs = set([r.name for r in screed.open(outfile)])
    print seqs
    assert seqs == set(['895:1:1:1384:20217', '895:1:1:1347:3237',
                        '895:1:1:1348:18672', '895:1:1:1290:11501',
                        '895:1:1:1386:7536', '895:1:1:1373:13994',
                        '895:1:1:1355:13535', '895:1:1:1303:6251',
                        '895:1:1:1381:4958', '895:1:1:1338:6614'])

    outfile = infile + '.subset.2'
    assert os.path.exists(outfile), outfile

    seqs = set([r.name for r in screed.open(outfile)])
    print seqs
    assert seqs == set(['895:1:1:1326:7273', '895:1:1:1384:20217',
                        '895:1:1:1347:3237', '895:1:1:1353:6642',
                        '895:1:1:1340:19387', '895:1:1:1252:19493',
                        '895:1:1:1381:7062', '895:1:1:1383:3089',
                        '895:1:1:1342:20695', '895:1:1:1303:6251'])
Example #23
0
def test_trim_low_abund_1_duplicate_filename_err():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    args = ["-k", "17", "-x", "1e7", "-N", "2", '-C', '1', infile, infile]
    try:
        utils.runscript('trim-low-abund.py', args, in_dir, sandbox=True)
        raise Exception("should not reach this")
    except AssertionError:
        # an error should be raised by passing 'infile' twice.
        pass
Example #24
0
def test_trim_low_abund_1_duplicate_filename_err():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    args = ["-k", "17", "-x", "1e7", "-N", "2", '-C', '1', infile, infile]
    try:
        utils.runscript('trim-low-abund.py', args, in_dir, sandbox=True)
        raise Exception("should not reach this")
    except AssertionError:
        # an error should be raised by passing 'infile' twice.
        pass
Example #25
0
def test_normalize_by_median_empty():
    CUTOFF = '1'

    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-empty.fa'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-C', CUTOFF, '-k', '17', infile]
    utils.runscript(script, args, in_dir)

    outfile = infile + '.keep'
    assert os.path.exists(outfile), outfile
Example #26
0
def test_normalize_by_median_empty():
    CUTOFF = '1'

    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-empty.fa'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-C', CUTOFF, '-k', '17', infile]
    utils.runscript(script, args, in_dir)

    outfile = infile + '.keep'
    assert os.path.exists(outfile), outfile
Example #27
0
def test_trim_low_abund_4_retain_low_abund():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    args = ["-k", "17", "-x", "1e7", "-N", "2", '-V', infile]
    utils.runscript('trim-low-abund.py', args, in_dir, sandbox=True)

    outfile = infile + '.abundtrim'
    assert os.path.exists(outfile), outfile

    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 2, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs
Example #28
0
def test_trim_low_abund_4_retain_low_abund():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    args = ["-k", "17", "-x", "1e7", "-N", "2", '-V', infile]
    utils.runscript('trim-low-abund.py', args, in_dir, sandbox=True)

    outfile = infile + '.abundtrim'
    assert os.path.exists(outfile), outfile

    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 2, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs
Example #29
0
def _make_counting(infilename, SIZE=1e7, N=2, K=20, BIGCOUNT=True):
    script = scriptpath('load-into-counting.py')
    args = ['-x', str(SIZE), '-N', str(N), '-k', str(K)]

    if not BIGCOUNT:
        args.append('-b')

    outfile = utils.get_temp_filename('out.kh')

    args.extend([outfile, infilename])

    utils.runscript(script, args)
    assert os.path.exists(outfile)

    return outfile
Example #30
0
def _make_counting(infilename, SIZE=1e7, N=2, K=20, BIGCOUNT=True):
    script = scriptpath('load-into-counting.py')
    args = ['-x', str(SIZE), '-N', str(N), '-k', str(K)]

    if not BIGCOUNT:
        args.append('-b')

    outfile = utils.get_temp_filename('out.kh')

    args.extend([outfile, infilename])

    utils.runscript(script, args)
    assert os.path.exists(outfile)

    return outfile
def test_sweep_reads_3():

    infile = utils.get_temp_filename('seqs.fa')
    shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
    wdir = os.path.dirname(infile)
    script = scriptpath('sweep-reads.py')
    args = ['-m', '75', '-k', '20', '-l', '1', '--prefix',
            'test', '--label-by-group', '10', infile, infile]
    status, out, err = utils.runscript(script, args, wdir, sandbox=True)

    for i in xrange(10):
        p = os.path.join(wdir, 'test_{i}.fa'.format(i=i))
        print p, err, out
        assert os.path.exists(p)
        os.remove(p)

    counts_fn = os.path.join(wdir, 'test.counts.csv')
    with open(counts_fn) as cfp:
        for line in cfp:
            _, _, c = line.partition(',')
            assert int(c) in [9, 10]

    assert os.path.exists(counts_fn)
    assert os.path.exists(os.path.join(wdir, 'test.dist.txt'))
    assert not os.path.exists(os.path.join(wdir, 'test_multi.fa'))
Example #32
0
def test_load_graph():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-t']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    (status, out, err) = utils.runscript(script, args)

    assert 'Total number of k-mers: 3959' in err, err

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x
Example #33
0
def test_normalize_by_median_dumpfrequency():
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(
            utils.get_temp_filename('test-{x}.fq'.format(x=x), tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = utils.runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
Example #34
0
def test_do_partition():
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = utils.get_temp_filename('out')
    in_dir = os.path.dirname(graphbase)

    script = scriptpath('do-partition.py')
    args = ["-k", "20", graphbase, seqfile]

    utils.runscript(script, args, in_dir)

    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    parts = set(parts)
    assert '2' in parts
    assert len(parts) == 1
Example #35
0
def test_count_overlap():
    seqfile1 = utils.get_temp_filename('test-overlap1.fa')
    in_dir = os.path.dirname(seqfile1)
    seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
    outfile = utils.get_temp_filename('overlap.out', in_dir)
    curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
    shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
    shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
    htfile = _make_graph(seqfile1, ksize=20)
    script = scriptpath('count-overlap.py')
    args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000',
            htfile + '.pt', seqfile2, outfile]
    (status, out, err) = utils.runscript(script, args, in_dir)
    assert status == 0
    assert os.path.exists(outfile), outfile
    data = [x.strip() for x in open(outfile)]
    data = set(data)
    assert '# of unique k-mers in dataset2: 759047' in data
    assert '# of overlap unique k-mers: 245621' in data
    assert os.path.exists(curvefile), curvefile
    data = [x.strip() for x in open(curvefile)]
    data = set(data)
    assert '178633 1155' in data
    assert '496285 2970' in data
    assert '752053 238627' in data
Example #36
0
def test_normalize_by_median_force():
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename(
        'test-good.fq', tempdir=os.path.dirname(corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'), corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = utils.runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0
    assert os.path.exists(corrupt_infile + '.ct.failed')
    assert '*** Skipping' in err
    assert '** IOErrors' in err
Example #37
0
def test_count_overlap():
    seqfile1 = utils.get_temp_filename('test-overlap1.fa')
    in_dir = os.path.dirname(seqfile1)
    seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
    outfile = utils.get_temp_filename('overlap.out', in_dir)
    curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
    shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
    shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
    htfile = _make_graph(seqfile1, ksize=20)
    script = scriptpath('count-overlap.py')
    args = [
        '--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000',
        htfile + '.pt', seqfile2, outfile
    ]
    (status, out, err) = utils.runscript(script, args, in_dir)
    assert status == 0
    assert os.path.exists(outfile), outfile
    data = [x.strip() for x in open(outfile)]
    data = set(data)
    assert '# of unique k-mers in dataset2: 759047' in data
    assert '# of overlap unique k-mers: 245621' in data
    assert os.path.exists(curvefile), curvefile
    data = [x.strip() for x in open(curvefile)]
    data = set(data)
    assert '178633 1155' in data
    assert '496285 2970' in data
    assert '752053 238627' in data
Example #38
0
def test_normalize_by_median_force():
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename('test-good.fq',
                                          tempdir=os.path.dirname(
                                              corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'), corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = utils.runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0
    assert os.path.exists(corrupt_infile + '.ct.failed')
    assert '*** Skipping' in err
    assert '** IOErrors' in err
Example #39
0
def test_load_graph():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-t']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    (status, out, err) = utils.runscript(script, args)

    assert 'Total number of k-mers: 3959' in err, err

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x
Example #40
0
def test_normalize_by_median_dumpfrequency():
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(utils.get_temp_filename('test-{x}.fq'.format(x=x),
                                               tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = utils.runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
Example #41
0
def test_sweep_reads_3():

    infile = utils.get_temp_filename('seqs.fa')
    shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
    wdir = os.path.dirname(infile)
    script = scriptpath('sweep-reads.py')
    args = ['-m', '75', '-k', '20', '-l', '1', '--prefix',
            'test', '--label-by-group', '10', infile, infile]
    status, out, err = utils.runscript(script, args, wdir, sandbox=True)

    for i in xrange(10):
        p = os.path.join(wdir, 'test_{i}.fa'.format(i=i))
        print p, err, out
        assert os.path.exists(p)
        os.remove(p)

    counts_fn = os.path.join(wdir, 'test.counts.csv')
    with open(counts_fn) as cfp:
        for line in cfp:
            _, _, c = line.partition(',')
            assert int(c) in [9, 10]

    assert os.path.exists(counts_fn)
    assert os.path.exists(os.path.join(wdir, 'test.dist.txt'))
    assert not os.path.exists(os.path.join(wdir, 'test_multi.fa'))
Example #42
0
def test_do_partition():
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = utils.get_temp_filename('out')
    in_dir = os.path.dirname(graphbase)

    script = scriptpath('do-partition.py')
    args = ["-k", "20", graphbase, seqfile]

    utils.runscript(script, args, in_dir)

    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    parts = set(parts)
    assert '2' in parts
    assert len(parts) == 1
Example #43
0
def test_abundance_dist_single_nobigcount():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('abundance-dist-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-b', infile, outfile]
    utils.runscript(script, args, in_dir)

    fp = iter(open(outfile))
    line = fp.next().strip()
    assert line == '1 96 96 0.98', line
    line = fp.next().strip()
    assert line == '255 2 98 1.0', line
Example #44
0
def test_normalize_by_median_indent():
    infile = utils.get_test_data('paired-mixed.fa.pe')
    hashfile = utils.get_test_data('normC20k20.kh')
    script = scriptpath('normalize-by-median.py')
    args = ['--loadtable', hashfile, infile]
    (status, out, err) = utils.runscript(script, args)
    assert status == 0, (out, err)
    print(out, err)
Example #45
0
def test_extract_partitions_no_groups():
    empty_file = utils.get_temp_filename('empty-file')
    basefile = utils.get_test_data('empty-file')

    shutil.copyfile(basefile, empty_file)
    in_dir = os.path.dirname(empty_file)

    # ok, now run extract-partitions.
    script = scriptpath('extract-partitions.py')
    args = ['extracted', empty_file]

    utils.runscript(script, args, in_dir, fail_ok=True)

    # No group files should be created
    groupfile = os.path.join(in_dir, 'extracted.group0000.fa')

    assert not os.path.exists(groupfile)
Example #46
0
def test_filter_abund_2():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    counting_ht = _make_counting(infile, K=17)

    script = scriptpath('filter-abund.py')
    args = ['-C', '1', counting_ht, infile, infile]
    utils.runscript(script, args, in_dir)

    outfile = infile + '.abundfilt'
    assert os.path.exists(outfile), outfile

    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 2, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs
Example #47
0
def test_load_graph_multithread():
    script = scriptpath('load-graph.py')

    outfile = utils.get_temp_filename('test')
    infile = utils.get_test_data('test-reads.fa')

    args = ['-N', '4', '-x', '1e9', '-T', '8', outfile, infile]

    (status, out, err) = utils.runscript(script, args)
Example #48
0
def test_count_median():
    infile = utils.get_temp_filename('test.fa')
    outfile = infile + '.counts'

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    counting_ht = _make_counting(infile, K=8)

    script = scriptpath('count-median.py')
    args = [counting_ht, infile, outfile]
    utils.runscript(script, args)

    assert os.path.exists(outfile), outfile

    data = [x.strip() for x in open(outfile)]
    data = set(data)
    assert len(data) == 2, data
    assert 'seq 1001 1001.0 0.0 18' in data
    assert '895:1:37:17593:9954/1 1 103.803741455 303.702941895 114' in data
Example #49
0
def test_abundance_dist_nobigcount():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    htfile = _make_counting(infile, K=17, BIGCOUNT=False)

    script = scriptpath('abundance-dist.py')
    args = ['-z', htfile, infile, outfile]
    utils.runscript(script, args, in_dir)

    fp = iter(open(outfile))
    line = fp.next().strip()
    assert line == '1 96 96 0.98', line
    line = fp.next().strip()
    assert line == '255 2 98 1.0', line
Example #50
0
def test_load_graph_no_tags():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-n']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    utils.runscript(script, args)

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert not os.path.exists(tagset_file), tagset_file

    assert khmer.load_hashbits(ht_file)