Example 1
def main():
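    # Report per-sequence k-mer abundance statistics from a saved countgraph:
    # for each read, write name, median, average, stddev, and length as CSV.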
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer countgraph from', htfile, file=sys.stderr)
    countgraph = Countgraph.load(htfile)
    ksize = countgraph.ksize()
    print('writing to', output.name, file=sys.stderr)

    output = csv.writer(output)
    # write headers:
    output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'A')

        if ksize <= len(seq):
            medn, ave, stdev = countgraph.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            output.writerow([record.name, medn, ave, stdev, len(seq)])
Example 2
def test_abund_dist_gz_bigcount_compressed_first():
    infile = utils.copy_test_data('test-abund-read-2.fa')
    script = 'load-into-counting.py'
    htfile = utils.get_temp_filename('test_ct.gz')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = gzip.open(htfile, 'rb').read()  # read compressed bigcount table

    outfile = utils.get_temp_filename('test_ct')
    f_out = open(outfile, 'wb')  # output the bigcount table
    f_out.write(data)
    f_out.close()
    # load the compressed bigcount table
    try:
        countgraph = Countgraph.load(outfile)
    except OSError as err:
        assert 0, 'Should not produce OSError: ' + str(err)

    assert countgraph.n_occupied() != 0
    hashsizes = countgraph.hashsizes()
    kmer_size = countgraph.ksize()
    tracking = khmer.Nodegraph(kmer_size, 1, 1, primes=hashsizes)
    abundances = countgraph.abundance_distribution(infile, tracking)
    # calculate abundance distribution for compressed bigcount table
    flag = False
    # check if abundance is > 255
    # if ok  gzipped bigcount was loaded correctly
    for _, i in enumerate(abundances):
        print(_, i)
        if _ > 255 and i > 0:
            flag = True
            break
    assert flag
Example 3
def test_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer.Countgraph(12, 1, 1, primes=sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    # compress.
    in_file = open(savepath, 'rb')
    out_file = gzip.open(loadpath, 'wb')
    out_file.writelines(in_file)
    out_file.close()
    in_file.close()

    # load compressed hashtable.
    try:
        ht = Countgraph.load(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer.Nodegraph(12, 1, 1, primes=sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer.Nodegraph(12, 1, 1, primes=sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Example 4
def main():
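    # Load a countgraph from argv[1] and, for each remaining input file,
    # trim reads below the abundance CUTOFF, writing results to <name>.below.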
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)

    print('making hashtable')
    ht = Countgraph.load(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'

        outfp = open(outfile, 'w')

        paired_iter = broken_paired_reader(ReadParser(infile), min_length=K,
                                           force_single=True)
        for n, is_pair, read1, read2 in paired_iter:
            name = read1.name
            seq = read1.sequence
            if 'N' in seq:
                continue  # skip reads containing ambiguous bases

            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

            if trim_at >= K:
                write_record(screed.Record(name=name, sequence=trim_seq), outfp)
Example 5
def test_load_into_counting_1():
    in1 = utils.get_test_data('test-abund-read-2.fa')
    out1 = utils.get_temp_filename('out.ct')

    cmd = """
       cat {in1} |
       {scripts}/load-into-counting.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """

    cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
    print(cmd)

    run_shell_cmd(cmd)
    assert os.path.exists(out1)
    Countgraph.load(out1)
Example 6
def test_badload():
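    # Countgraph.load() requires a filename; calling it with no arguments
    # should raise a TypeError.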

    try:
        countgraph = Countgraph.load()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
Example 7
def main():
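    # count-kmers.py: emit a CSV of every distinct k-mer in the input
    # sequences together with its count in the loaded countgraph.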
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print('hashtable from', args.input_count_graph_filename,
          file=sys.stderr)
    countgraph = Countgraph.load(
        args.input_count_graph_filename)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer.Nodegraph(  # pylint: disable=protected-access
        kmer_size, 1, 1, primes=hashsizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            seq = record.sequence.replace('N', 'A')
            for i in range(len(seq) - kmer_size + 1):
                kmer = seq[i:i+kmer_size]
                if not tracking.get(kmer):
                    tracking.count(kmer)
                    writer.writerow([kmer, str(countgraph.get(kmer))])

    print('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()), file=sys.stderr)
Example 8
def test_counting_gz_file_type_check():
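    # File type check: loading this gzipped file as a Countgraph should raise OSError.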
    inpath = utils.get_test_data('goodversion-k12.ht.gz')

    try:
        kh = Countgraph.load(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
Example 9
def test_load_gz_notexist_should_fail():
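    # Loading a nonexistent gzipped countgraph file should raise OSError.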
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    try:
        hi = Countgraph.load(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
Example 10
def test_load_notexist_should_fail():
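    # Loading a nonexistent countgraph file should raise OSError.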
    savepath = utils.get_temp_filename('tempnodegraphsave0.htable')

    try:
        hi = Countgraph.load(savepath)
        assert 0, "load should fail"
    except OSError:
        pass
Example 11
def test_load_truncated():
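    # Every truncated prefix of a saved countgraph should fail to load with OSError.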
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    hi = khmer.Countgraph(12, 200, 3)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    data = open(savepath, 'rb').read()
    for i in range(len(data)):
        fp = open(truncpath, 'wb')
        fp.write(data[:i])
        fp.close()

        try:
            Countgraph.load(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
Example 12
def main():
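    # Correct reads against a precomputed counting table: align each read to
    # the countgraph with ReadAligner and write the (possibly corrected) sequence.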
    parser = khmer_args.build_counting_args(
        "Correct reads against an already-computed table",
        citations=['counting', 'SeqAn'])

    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
    parser.add_argument('-o',
                        '--output',
                        dest='output_file',
                        help="output file for histogram; defaults to "
                        "<first filename>.corr in cwd.",
                        type=khFileType('w'),
                        default=None)

    parser.add_argument('counts_table')
    parser.add_argument('readfile')

    args = parser.parse_args()

    print('loading counts')
    ht = Countgraph.load(args.counts_table)

    aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    if not corrfp:
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')

    n_corrected = 0
    for n, read in enumerate(screed.open(args.readfile)):
        if n % 10000 == 0:
            print('...', n, n_corrected, file=sys.stderr)
        seq = read.sequence.replace('N', 'A')

        # build the alignment...
        score, graph_alignment, read_alignment, truncated = \
            aligner.align(seq)

        if not truncated:
            graph_seq = graph_alignment.replace("-", "")
            if graph_seq != seq:
                n_corrected += 1

            seq = graph_seq

        corrfp.write(output_single(read, seq))
Example 13
def test_save_load_large(ctfile):
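    # Save/load round trip with a very large table; occupied bin count must survive.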
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    orig = khmer.Countgraph(12, 2**31, 1)
    orig.consume_seqfile(inpath)
    orig.save(savepath)

    loaded = Countgraph.load(savepath)

    orig_count = orig.n_occupied()
    loaded_count = loaded.n_occupied()
    assert orig_count == 3966, orig_count
    assert loaded_count == orig_count, loaded_count
Example 14
def test_save_load_occupied(ctfile):
    print('working with', ctfile)
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    orig = khmer.Countgraph(12, 1e5, 4)
    orig.consume_seqfile(inpath)
    orig.save(savepath)

    loaded = Countgraph.load(savepath)

    orig_count = orig.n_occupied()
    loaded_count = loaded.n_occupied()
    assert orig_count == 3886, orig_count
    assert loaded_count == orig_count, loaded_count
Example 15
def test_maxcount_with_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.Countgraph(4, 4 ** 4, 4)
    kh.set_use_bigcount(True)

    for _ in range(0, 1000):
        kh.count('AAAA')
        c = kh.get('AAAA')

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = Countgraph.load(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    c = kh.get('AAAA')
    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
Example 16
def test_nobigcount_save():
    kh = khmer.Countgraph(4, 4 ** 4, 4)
    # kh.set_use_bigcount(False) <-- this is the default

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = Countgraph.load(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # set_use_bigcount should still be False after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for _ in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == MAX_COUNT
Example 17
def test_load_gz_truncated_should_fail():
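    # A gzip-saved countgraph truncated to its first 1000 bytes should fail to load.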
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    hi = khmer.Countgraph(12, 1000, 2)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    fp = open(savepath, 'rb')
    data = fp.read()
    fp.close()

    fp = open(savepath, 'wb')
    fp.write(data[:1000])
    fp.close()

    try:
        hi = Countgraph.load(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
Example 18
def test_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.Countgraph(4, 4 ** 4, 4)
    kh.set_use_bigcount(True)

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = Countgraph.load(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    # set_use_bigcount should still be True after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for _ in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == 1000
Example 19
def test_save_load_gz():
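    # Saving directly to a .gz path and reloading should preserve the abundance distribution.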
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave2.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    hi = khmer.Countgraph(12, 1, 1, primes=sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    try:
        ht = Countgraph.load(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    tracking = khmer.Nodegraph(12, 1, 1, primes=sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer.Nodegraph(12, 1, 1, primes=sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Example 20
def main():
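    # correct-reads.py: two-pass streaming error correction. Pass 1 consumes
    # low-coverage reads and sets them aside; pass 2 revisits and corrects them.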
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    tablesize = calculate_graphsize(args, 'countgraph')

    if args.savegraph:
        check_space_for_graph(args.savegraph, tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print('loading k-mer countgraph from', args.loadgraph, file=sys.stderr)
        ct = Countgraph.load(args.loadgraph)
    else:
        print('making k-mer countgraph', file=sys.stderr)
        ct = create_countgraph(args, multiplier=8 / (9. + 0.3))
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; use -T to change location' %
          tempdir,
          file=sys.stderr)

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...',
                      n,
                      filename,
                      save_pass2,
                      n_reads,
                      n_bp,
                      written_reads,
                      written_bp,
                      file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print(('second pass: looking at sequences kept aside in %s') %
              pass2filename,
              file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(
                screed.open(pass2filename, parse_description=False)):
            if n % 10000 == 0:
                print('... x 2',
                      n,
                      pass2filename,
                      written_reads,
                      written_bp,
                      file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:  # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (
        n_reads,
        n_bp,
    ), file=sys.stderr)
    print('wrote %d reads, %d bp' % (
        written_reads,
        written_bp,
    ),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' %
          (save_pass2_total, n_passes),
          file=sys.stderr)
    print('removed %d reads and corrected %d reads (%.2f%%)' %
          (n_reads - written_reads, corrected_reads, percent_reads_corrected),
          file=sys.stderr)
    print('removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov),
              file=sys.stderr)
        print(('skipped %d reads/%d bases because of low coverage') %
              (skipped_n, skipped_bp),
              file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.corr', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
Example 21
def main():
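    # Two-pass low-abundance trimming (trim-low-abund style): load or build a
    # countgraph, trim reads via Trimmer, and write output to *.abundtrim or -o.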
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    graphtype = 'countgraph' if not args.small_count else 'smallcountgraph'
    report_on_config(args, graphtype=graphtype)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, graphtype)
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        if args.small_count:
            ct = SmallCountgraph.load(args.loadgraph)
        else:
            ct = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info(
        'created temporary directory {temp};\n'
        'use -T to change location',
        temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        paired_iter = broken_paired_reader(ReadParser(filename),
                                           min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info(
                    "... {filename} {n_saved} {n_reads} {n_bp} "
                    "{w_reads} {w_bp}",
                    filename=filename,
                    n_saved=trimmer.n_saved,
                    n_reads=trimmer.n_reads,
                    n_bp=trimmer.n_bp,
                    w_reads=written_reads,
                    w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename,
                 kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        read_parser = ReadParser(pass2filename)
        paired_iter = broken_paired_reader(read_parser,
                                           min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename,
                         c=trimmer.n_saved,
                         d=trimmer.n_reads,
                         e=trimmer.n_bp,
                         f=written_reads,
                         g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        read_parser.close()

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    try:
        log_info('removing temp directory & contents ({temp})', temp=tempdir)
        shutil.rmtree(tempdir)
    except OSError as oe:
        log_info(
            'WARNING: unable to remove {temp} (probably an NFS issue); '
            'please remove manually',
            temp=tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total,
             np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads,
             t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0,
             bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped,
                 p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped,
                 bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.output is None:
        log_info('output in *.abundtrim')
    elif args.output.name == 1:
        log_info('output streamed to stdout')
    elif args.output.name:
        log_info('output in {}'.format(args.output.name))

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)

    if args.summary_info is not None:
        # note that when streaming to stdout the name of args.output will
        # be set to 1
        if args.output is not None and args.output.name != 1:
            base = args.output.name
        # no explicit name or stdout stream -> use a default name
        else:
            base = 'trim-low-abund-{}'.format(
                time.strftime("%Y-%m-%dT%H:%M:%S"))

        info = {
            'fpr': fp_rate,
            'reads': n_reads,
            'basepairs': n_bp,
            'reads_written': written_reads,
            'basepairs_written': written_bp,
            'reads_skipped': n_skipped,
            'basepairs_skipped': bp_skipped,
            'reads_removed': n_reads - written_reads,
            'reads_trimmed': trimmed_reads,
            'basepairs_removed_or_trimmed': n_bp - written_bp
        }
        store_provenance_info(info, fname=base, format=args.summary_info)
Example 22
 def _get_contaminants(self):
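     # Build a countgraph over the read k-mers (with bigcount enabled) and report
     # over-represented k-mers as candidate contaminants, matching them against
     # known contaminant sequences when available.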
     from khmer import Countgraph, khmer_args
     # assuming all sequences are same length
     n_win = self._read_length - self.kmer_size + 1
     tablesize = self.n_reads * n_win
     countgraph = Countgraph(
         self.kmer_size, tablesize, khmer_args.DEFAULT_N_TABLES)
     countgraph.set_use_bigcount(True)
     
     for seq in self._read_sequences:
         countgraph.consume_and_tag(seq)
     
     n_expected = math.ceil(tablesize / float(4**self.kmer_size))
     min_count = n_expected * self.overrep_cutoff
     if min_count >= 2**16:
         raise ValueError(
             "The minimum count for an over-represented k-kmer {} is "
             "greater than the max khmer count (2^16)".format(min_count))
 
     candidates = {}
     
     for tag in countgraph.get_tagset():
         count = countgraph.get(tag)
         if count >= min_count:
             candidates[tag] = count
     
     if self.known_contaminants:
         matches = []
         seen = set()
         
         def match(kmer):
             """Returns the frequency of `kmer` in `candidates`.
             """
             freq = candidates.get(kmer, 0)
             if freq > 0:
                 seen.add(kmer)
             return freq
         
         for seq, names in self.known_contaminants.iter_sequences():
             seqlen = len(seq)
             if seqlen < self.kmer_size:
                 print("Cannot check {}; sequence is shorter than {}".format(
                     list(names)[0], self.kmer_size))
                 continue
             
             n_kmers = seqlen - self.kmer_size + 1
             num_matches = 0
             match_counts = []
             for idx in range(n_kmers):
                 kmer = seq[idx:(idx + self.kmer_size)]
                 kmer_count = max(
                     match(kmer),
                     match(reverse_complement(kmer))
                 )
                 if kmer_count > 0:
                     num_matches += 1
                     match_counts.append(kmer_count)
             
             if num_matches > 0:
                 # not sure what the correct metric is to use here
                 overall_count = sum(match_counts) / float(n_kmers)
                 matches.append(Match(
                     seq, count=overall_count / float(tablesize), 
                     names=names, match_frac=float(num_matches) / n_kmers))
         
         # Add remaining tags
         for tag in set(candidates.keys()) - seen:
             matches.append(Match(
                 tag, count=candidates[tag] / float(tablesize)))
     
     else:
         matches = [
             Match(tag, count=count / float(tablesize))
             for tag, count in candidates.items()]
     
     return matches
Example 23
def main():
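    # Compute the k-mer abundance distribution of the input sequences against a
    # loaded countgraph and write it as a cumulative histogram in CSV form.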
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = [args.input_count_graph_filename, args.input_sequence_filename]
    for infile in infiles:
        check_input_files(infile, False)

    log_info('Loading counting graph from {graph}',
             graph=args.input_count_graph_filename)
    countgraph = Countgraph.load(args.input_count_graph_filename)

    if not countgraph.get_use_bigcount() and args.bigcount:
        log_warn("WARNING: The loaded graph has bigcount DISABLED while "
                 "bigcount reporting is ENABLED--counts higher than 255 will "
                 "not be reported.")

    countgraph.set_use_bigcount(args.bigcount)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer.Nodegraph(  # pylint: disable=protected-access
        kmer_size, 1, 1, primes=hashsizes)

    log_info('K: {ksize}', ksize=kmer_size)
    log_info('outputting to {output}', output=args.output_histogram_filename)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        pass
    elif os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            log_error('ERROR: {output} exists; not squashing.',
                      output=args.output_histogram_filename)
            sys.exit(1)

        log_info('** squashing existing file {output}',
                 output=args.output_histogram_filename)

    log_info('preparing hist...')
    abundances = countgraph.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        countgraph_fp = sys.stdout
    else:
        countgraph_fp = open(args.output_histogram_filename, 'w')
    countgraph_fp_csv = csv.writer(countgraph_fp)
    # write headers:
    countgraph_fp_csv.writerow(
        ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break
Example 24
def test_load_empty_files(ext):
    # Check empty files, compressed or not
    fname = utils.get_test_data('empty-file' + ext)
    with pytest.raises(OSError):
        Countgraph.load(fname)
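The ext argument is supplied by pytest parametrisation in the original test module. A self-contained sketch of the same idea, using a freshly created zero-byte file instead of the shipped test data (the extension list and file setup are assumptions):

import pytest
from khmer import Countgraph


@pytest.mark.parametrize('ext', ['', '.gz'])
def test_load_empty_file_sketch(tmp_path, ext):
    # a zero-byte file should be rejected whether or not it looks compressed
    fname = tmp_path / ('empty-file' + ext)
    fname.touch()
    with pytest.raises(OSError):
        Countgraph.load(str(fname))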
Example No. 42
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
Example No. 44
def main():  # pylint: disable=too-many-branches,too-many-statements
    start_time = time.time()
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph1 = Countgraph.load(args.loadgraph)

    # load second counting table.
    if args.loadgraph2:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph2)
        countgraph2 = Countgraph.load(args.loadgraph2)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    # if a single output file was requested, open it once up front;
    # otherwise a per-input '.keep' file is opened inside the loop below.
    outfp = None
    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip,
                                args.bzip)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        screed_iter = clean_input_reads(screed.open(filename))
        reader = broken_paired_reader(screed_iter,
                                      min_length=args.ksize,
                                      force_single=force_single,
                                      require_paired=require_paired)

        # actually do diginorm
        for _, is_paired, read0, read1 in reader:
            for record in snarf(is_paired, read0, read1, countgraph1,
                                countgraph2):
                if record is not None:
                    write_record(record, outfp)

    print("--- %s seconds ---" % (time.time() - start_time))
Example No. 45
def main():
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    tablesize = calculate_graphsize(args, 'countgraph')

    if args.savegraph:
        check_space_for_graph(args.savegraph, tablesize,
                              args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print('loading k-mer countgraph from', args.loadgraph, file=sys.stderr)
        ct = Countgraph.load(args.loadgraph)
    else:
        print('making k-mer countgraph', file=sys.stderr)
        ct = create_countgraph(args, multiplier=8 / (9. + 0.3))
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; use -T to change location'
          % tempdir, file=sys.stderr)

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # correct!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s'
              % (filename, save_pass2, n, pass2filename), file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print(('second pass: looking at sequences kept aside in %s') %
              pass2filename, file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename, written_reads,
                      written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' %
          (save_pass2_total, n_passes), file=sys.stderr)
    print('removed %d reads and corrected %d reads (%.2f%%)' %
          (n_reads - written_reads, corrected_reads, percent_reads_corrected),
          file=sys.stderr)
    print('removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr)
        print(('skipped %d reads/%d bases because of low coverage')
              % (skipped_n, skipped_bp), file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.corr', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
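correct_sequence() and fix_quality() are imported from elsewhere in the original script and not shown here. fix_quality() exists because correction can change the sequence length, leaving the FASTQ quality string out of sync; an illustrative version (the padding character is an assumption):

def fix_quality_sketch(read):
    # pad or truncate the quality string to match the corrected sequence
    if len(read.quality) < len(read.sequence):
        read.quality += 'I' * (len(read.sequence) - len(read.quality))
    else:
        read.quality = read.quality[:len(read.sequence)]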