Esempio n. 1
0
def test_readaligner_load():
    """Round-trip aligner parameters: file -> getters -> constructor kwargs.

    An aligner is built with ``filename=`` pointing at the
    ``readaligner-default.json`` test data file; its scoring matrix and
    transition probabilities are spot-checked, then fed into a second
    aligner, which must report exactly the same values back.
    """
    countgraph = khmer.Countgraph(32, 1048576, 1)
    json_path = utils.get_test_data('readaligner-default.json')
    a_aligner = khmer.ReadAligner(countgraph, 0, 0, filename=json_path)

    a_scoring_matrix = a_aligner.get_scoring_matrix()
    a_transition_probabilities = a_aligner.get_transition_probabilities()

    # Spot-check the first entry of each parameter set.
    first_score = a_scoring_matrix[0]
    first_transition = a_transition_probabilities[0][0]
    assert first_score == -0.06642736173897607, first_score
    assert first_transition == -0.021973842014145723, first_transition

    # Exercise the first aligner before cloning its parameters.
    for known_seq in ht_seqs:
        countgraph.consume(known_seq)
    for query in queries:
        a_aligner.align(query['seq'])

    b_aligner = khmer.ReadAligner(
        countgraph, 0, 0,
        transition_probabilities=a_transition_probabilities,
        scoring_matrix=a_scoring_matrix)
    b_scoring_matrix = b_aligner.get_scoring_matrix()
    b_transition_probabilities = b_aligner.get_transition_probabilities()

    assert b_scoring_matrix == a_scoring_matrix, (
        a_scoring_matrix, b_scoring_matrix)
    assert b_transition_probabilities == a_transition_probabilities, (
        a_transition_probabilities, b_transition_probabilities)
Esempio n. 2
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    args = parser.parse_args()

    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.ReadAligner(ct, 5, 1.0)

    for record in screed.open(args.ref):
        s = record.sequence
        s = s.replace('N', 'A')

        score, graph_alignment, read_alignment, truncated = \
               aligner.align(s)

        assert not truncated

        g = graph_alignment.replace('-', '')
        r = read_alignment.replace('-', '')

        print record.name
        for kstart in range(0, len(g) - ct.ksize() + 1):
            kmer = g[kstart:kstart + ct.ksize()]
            print kstart, ct.get(kmer)
Esempio n. 3
0
def test_readalign_new(query):
    """Run ``check_query`` for one query against a graph built from ht_seqs."""
    countgraph = khmer.Countgraph(32, 1048576, 1)
    aligner = khmer.ReadAligner(countgraph, 1, 0)

    # The aligner is created first; the graph is populated afterwards.
    for known_seq in ht_seqs:
        countgraph.consume(known_seq)

    check_query(aligner, query)
Esempio n. 4
0
def test_readalign_new():
    """Generator-style test yielding one ``check_query`` case per query.

    Currently disabled: the leading ``return`` ends the generator before it
    yields anything.  The rest of the body is kept so the test can be
    re-enabled by deleting that line (the ``yield`` below is also what
    keeps this function a generator).
    """
    return  # @CTB
    countgraph = khmer.Countgraph(32, 1048576, 1)
    aligner = khmer.ReadAligner(countgraph, 1, 0)
    for known_seq in ht_seqs:
        countgraph.consume(known_seq)

    for case in queries:
        # Give the yielded case a readable name when one is provided.
        if "description" in case:
            check_query.description = case["description"]
        yield check_query, aligner, case
Esempio n. 5
0
def test_alignnocov():
    """A read consumed once by the graph must align to itself unchanged."""
    graph = khmer.new_counting_hash(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.ReadAligner(graph, 0, 0)

    # One sequence is consumed 20 times; the read itself only once.
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    graph.consume("ACCTAGGTTCGACATGTACC")

    alignment = aligner.align(read)
    score, graph_aln, read_aln, trunc = alignment

    # should be the same
    eq_(read_aln, 'ACCTAGGTTCGACATGTACC')
    eq_(graph_aln, 'ACCTAGGTTCGACATGTACC')
Esempio n. 6
0
def main():
    """Correct reads against an already-computed k-mer count table.

    Loads the count table named by ``counts_table``, aligns every read in
    ``readfile`` to it and writes each read out via ``output_single``.
    When the alignment is not truncated, the read's sequence is replaced by
    the gap-stripped graph alignment; otherwise the ('N' -> 'A' cleaned)
    read sequence is written.

    Output goes to ``-o/--output`` when given, else to
    ``<basename of readfile>.corr`` in the current directory.
    """
    parser = khmer_args.build_counting_args(
        "Correct reads against an already-computed table",
        citations=['counting', 'SeqAn'])

    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
    # The help text used to say "histogram"; this file receives the
    # corrected reads.
    parser.add_argument('-o',
                        '--output',
                        dest='output_file',
                        help="output file for corrected reads; defaults to "
                        "<first filename>.corr in cwd.",
                        type=khFileType('w'),
                        default=None)

    parser.add_argument('counts_table')
    parser.add_argument('readfile')

    args = parser.parse_args()

    print('loading counts')
    ht = Countgraph.load(args.counts_table)

    aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    # Only a file opened here is closed here; a handle supplied through -o
    # belongs to the argument parser / caller.
    opened_here = False
    if not corrfp:
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')
        opened_here = True

    n_corrected = 0
    try:
        for n, read in enumerate(screed.open(args.readfile)):
            if n % 10000 == 0:
                print('...', n, n_corrected, file=sys.stderr)
            seq = read.sequence.replace('N', 'A')

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(seq)

            if not truncated:
                graph_seq = graph_alignment.replace("-", "")
                if graph_seq != seq:
                    n_corrected += 1

                seq = graph_seq

            corrfp.write(output_single(read, seq))
    finally:
        # Flush and release the output file even if alignment fails midway.
        if opened_here:
            corrfp.close()
Esempio n. 7
0
def test_align_middle():
    """``align`` on a read the graph has consumed: exact, untruncated."""
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "TCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(graph, trusted_cov_cutoff=0, bits_theta=0)

    # A longer sequence is consumed 20 times, then the read once.
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    graph.consume(read)

    alignment = aligner.align(read)
    _, graph_aln, read_aln, truncated = alignment

    # should be the same
    eq_(read_aln, read)
    eq_(graph_aln, read)
    assert not truncated
Esempio n. 8
0
def test_align_fwd_middle():
    """``align_forward`` on a consumed read: exact, untruncated alignment."""
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "TCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(graph, 0, 0)

    # A longer sequence is consumed 20 times, then the read once.
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    graph.consume(read)

    # align_forward returns five values here; the last one is not checked.
    result = aligner.align_forward(read)
    score, graph_aln, read_aln, truncated, _ = result

    # should be the same
    eq_(read_aln, read)
    eq_(graph_aln, read)
    assert not truncated
Esempio n. 9
0
def test_readalign_new():
    ch = khmer.new_counting_hash(32, 1048576, 1)
    aligner = khmer.ReadAligner(ch, 1, 0)
    for seq in ht_seqs:
        ch.consume(seq)

    for query in queries:
        score, graphAlign, readAlign, trunc = aligner.align(query["seq"])
        print graphAlign
        print readAlign
        eq_(graphAlign, query["graph_aln"])
        eq_(readAlign, query["read_aln"])
        eq_(trunc, query["truncated"])
Esempio n. 10
0
def test_alignnocov():
    """A read consumed once must align to itself, untruncated."""
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.ReadAligner(graph, 0, 0)

    # One sequence is consumed 20 times; the read itself only once.
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    graph.consume("ACCTAGGTTCGACATGTACC")

    alignment = aligner.align(read)
    _, graph_aln, read_aln, truncated = alignment

    # should be the same
    eq_(read_aln, 'ACCTAGGTTCGACATGTACC')
    eq_(graph_aln, 'ACCTAGGTTCGACATGTACC')
    assert not truncated
Esempio n. 11
0
def test_readalign():
    """Align a read that differs from the graph; check both output strings."""
    graph = khmer.new_counting_hash(10, 1048576, 1)
    aligner = khmer.ReadAligner(graph, 1, 0)

    # The first sequence is consumed 20 times, the second only once.
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    read = "ACCTAGGTTCGACATGTACC"
    #                      ^^            ^  ^

    graph.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")

    alignment = aligner.align(read)
    score, graph_aln, read_aln, trunc = alignment

    eq_(read_aln, 'ACCTAGGTTCGACATGTACc')
    eq_(graph_aln, 'AGCTAGGTTCGACAAGTCC-')
Esempio n. 12
0
def test_align_nothing():
    """A read never consumed by the graph: truncated, empty alignments."""
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "ACCAAGGCTCGAGATTTACC"
    aligner = khmer.ReadAligner(graph, 0, 0)

    # Only this other sequence is ever added to the graph.
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")

    alignment = aligner.align(read)
    score, graph_aln, read_aln, truncated = alignment

    print(score, graph_aln, read_aln)

    assert truncated
    assert len(graph_aln) == 0
    assert len(read_aln) == 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    parser.add_argument('--trusted', type=int, default=5)
    parser.add_argument('--variants-out',
                        type=str,
                        default='variants.txt',
                        dest='variants_out')
    args = parser.parse_args()

    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.ReadAligner(ct, args.trusted, 1.0)

    for record in screed.open(args.ref):
        seq = record.sequence
        seq = seq.replace('N', 'A')

        score, alignment = align_long(ct, aligner, seq)

        g = alignment.g
        r = alignment.r

        m, n = alignment.compare()
        print record.name, m, n, n - m, "%.3f%%" % (float(m) / n * 100)
        for start in range(0, len(alignment), 60):
            print start
            print alignment[start:start + 60]

        gidx = AlignmentIndex(alignment)
        fp = open(args.variants_out, 'w')

        for gi, a, b in alignment.variants():
            kmer = ''
            pos = gi
            while len(kmer) < ct.ksize() and pos < len(alignment.g):
                ch = alignment.g[pos]
                pos += 1
                if ch in '=-':
                    continue
                kmer += ch

            if alignment.covs[gi]:
                print >> fp, gi, a, b, gidx.get_ri(
                    gi), kmer, alignment.covs[gi]

        if 0:
            print len(seq), alignment.refseqlen()
            gidx._sanityCheck(seq)
Esempio n. 14
0
def test_readalign():
    """Disabled alignment test: the leading ``return`` skips the whole body.

    The remaining statements are kept so the test can be re-enabled by
    deleting that line.
    """
    return  # @CTB
    graph = khmer.Countgraph(10, 1048576, 1)
    aligner = khmer.ReadAligner(graph, 1, 0)

    # The first sequence is consumed 20 times, the second only once.
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    read = "ACCTAGGTTCGACATGTACC"
    #                      ^^            ^  ^

    graph.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")

    alignment = aligner.align(read)
    score, graph_aln, read_aln, _ = alignment

    eq_(read_aln, 'ACCTAGGTTCGACATGTACC')
    eq_(graph_aln, 'AGCTAGGTTCGACAAGTCCT')
Esempio n. 15
0
    def process_fn(record):
        """Return ``(name, sequence)`` for *record*, corrected when aligned.

        When the second value returned by ``align`` is non-empty, the
        sequence is replaced by the first value with '-' removed; otherwise
        the 'N' -> 'A' cleaned input sequence is returned.
        """
        # read_aligner is probably not threadsafe?
        # A new aligner is therefore built on every call, using ht, C and
        # max_error_region from the enclosing scope.
        aligner = khmer.ReadAligner(ht, 1, C, max_error_region)

        name, seq = record['name'], record['sequence']
        seq = seq.replace('N', 'A')

        # presumably graph-vs-read and read-vs-graph alignment strings,
        # going by the original variable names -- TODO confirm.
        graph_aln, read_aln = aligner.align(seq)

        if len(read_aln) > 0:
            seq = graph_aln.replace('-', '')

        return name, seq
Esempio n. 16
0
def test_align_fwd_middle_trunc_2():
    """``align_forward`` with a read prefix absent from the graph.

    Only ``read[12:]`` is consumed, so the read's first k-mer is not in the
    graph; the expected result is empty alignments flagged as truncated.
    """
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "GGGGGGGGGGGGTCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(graph, 0, 0)

    for _ in range(20):
        graph.consume("AAAAAAAAAAAATCGACAAGTCCTTGACAGAT")

    # omit prefix from graph
    suffix = read[12:]
    graph.consume(suffix)

    result = aligner.align_forward(read)
    _, graph_aln, read_aln, truncated, _ = result

    # this will fail, because align_forward chooses the first kmer as the
    # seed.
    assert not read_aln
    assert not graph_aln
    assert truncated
Esempio n. 17
0
def test_simple_readalign():
    """Align a read that differs from the graph; check both output strings."""
    graph = khmer.CountingHash(10, 1048576, 1)
    aligner = khmer.ReadAligner(graph, 2, 0)

    # The first sequence is consumed 20 times, the second only once.
    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACATGTCCTTGACAGAT")
    read = "ACCTAGGTTCGACAAGTACC"
    #                      ^^            ^  ^
    graph.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")
    # CCCGGGCCTTTGTCGAACCTTTTTAAAAGC

    alignment = aligner.align(read)
    score, graph_aln, read_aln, trunc = alignment

#                        AGCTAGGTTCGACAAGT CCT
#                        ACCTAGGTTCGACAAGTaCC
#                        --CTAGGTTCGACATGT-CC
    eq_(graph_aln, 'AGCTAGGTTCGACATGTCC-')
    eq_(read_aln, 'ACCTAGGTTCGACAAGTACc')
Esempio n. 18
0
def main():
    parser = build_counting_args()
    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print >> sys.stderr, 'file with ht: %s' % counting_ht

    print >> sys.stderr, 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    aligner = khmer.ReadAligner(
        ht, args.trusted_cov, args.theta
    )  # counting hash, trusted kmer coverage cutoff, bits theta (threshold value for terminating unproductive alignemnts)

    ### the filtering loop
    for infile in infiles:
        print >> sys.stderr, 'aligning', infile
        for n, record in enumerate(screed.open(infile)):

            name = record['name']
            seq = record['sequence'].upper()
            print >> sys.stderr, name
            print >> sys.stderr, seq

            score, graph_alignment, read_alignment, truncated = aligner.align(
                seq)
            print >> sys.stderr, score
            print >> sys.stderr, graph_alignment
            print >> sys.stderr, read_alignment
            print >> sys.stderr, truncated
            print ">{0}\n{1}".format(name, graph_alignment)
Esempio n. 19
0
def main():
    hash_filename = sys.argv[1]
    input_filename = sys.argv[2]
    output_filename = sys.argv[3]
    max_error_region = int(sys.argv[4])

    C = 20  # 20

    corrected = 0
    uncorrected = 0

    outfp = open(output_filename, 'w')

    ht = khmer.load_counting_hash(hash_filename)
    aligner = khmer.ReadAligner(ht, 1, C, max_error_region)

    K = ht.ksize()

    for n, record in enumerate(screed.open(input_filename)):
        if n % 1000 == 0:
            print n

        seq = record.sequence
        seq_name = record.name

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            corrected += 1
            outfp.write('>%s\n%s\n' % (seq_name, graph_seq))
        else:
            uncorrected += 1
            outfp.write('>%s\n%s\n' % (seq_name, seq))


    print 'corrected', corrected
    print 'uncorrected', uncorrected

    outfp.close()
Esempio n. 20
0
def test_align_fwd_middle_trunc():
    """``align_forward`` with a read suffix absent from the graph.

    Only ``read[:-5]`` is consumed, so the alignment is expected to cover
    exactly that prefix and to be flagged as truncated.
    """
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "TCGACAAGTCCTTGACAGATGGGGGG"
    aligner = khmer.ReadAligner(graph, 0, 0)

    for _ in range(20):
        graph.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")

    # omit suffix from graph
    known_prefix = read[:-5]
    graph.consume(known_prefix)

    result = aligner.align_forward(read)
    score, graph_aln, read_aln, truncated, _ = result

    # should not be the same...
    neq_(read_aln, read)
    neq_(graph_aln, read)

    eq_(read_aln, known_prefix)
    eq_(graph_aln, known_prefix)

    # ...but truncated
    assert truncated
Esempio n. 21
0
def test_align_middle_trunc_2():
    """``align`` with a read prefix absent from the graph.

    Only ``read[12:]`` is consumed, so the alignment is expected to cover
    exactly that suffix and to be flagged as truncated.
    """
    graph = khmer.Countgraph(10, 1048576, 1)
    read = "GGGGGGGGGGGGTCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(graph, 0, 0)

    for _ in range(20):
        graph.consume("AAAAAAAAAAAATCGACAAGTCCTTGACAGAT")

    # omit prefix from graph
    known_suffix = read[12:]
    graph.consume(known_suffix)

    alignment = aligner.align(read)
    _, graph_aln, read_aln, truncated = alignment

    # here, the alignment must start not at the beginning
    print(read_aln)
    print(graph_aln)

    eq_(read_aln, known_suffix)
    eq_(graph_aln, known_suffix)

    # ...but truncated
    assert truncated
Esempio n. 22
0
def test_align_fwd_covs_1():
    """Check the coverage list returned as ``align_forward``'s fifth value.

    The read is consumed 19 times; a variant differing only in its first
    base is consumed once.  The first coverage value must be 19, the
    minimum over ``covs[1:-K]`` must be 20, and the maximum must be 20.
    """
    K = 10

    graph = khmer.Countgraph(K, 1048576, 1)
    read = "GTCGACAAGTCCTTGACAGAT"
    aligner = khmer.ReadAligner(graph, 0, 0)

    for _ in range(19):
        graph.consume(read)

    graph.consume("CTCGACAAGTCCTTGACAGAT")
    #              ^
    result = aligner.align_forward(read)
    score, g, r, is_t, covs = result

    # Print the graph count of every k-mer of the read on one line.
    last_start = len(read) - K
    for offset in range(0, last_start + 1):
        kmer_count = graph.get(read[offset:offset + K])
        print(kmer_count, end=' ')
    print('')

    assert len(covs) == len(read)
    assert covs[0] == 19
    assert min(covs[1:-K]) == 20, covs
    assert max(covs) == 20, covs
def test_2():
    ct = khmer.new_counting_hash(20, 1.1e6, 4)
    ct.consume_fasta('simple-haplo-reads.fa.keep')
    aligner = khmer.ReadAligner(ct, 5, 1.0)

    seq = "".join("""GTCCTGGCGGTCCCCATTCA
    CTGCCATTGCCCCAAGCATGTTGGGGCGAGACCCTAGCGCATCTATTGACGATAGTCTAAATCGGCGAATTACGTAGCT
    GTAGGAAGTCACATGTGCTAAATATCAG
    TGATTCGCATCTTTCACCGCCGTACCAAGTGGAACCGGGGCCACCGCGTGTGTTATAACCTAT
    """.strip().split())
    
    seq = list(screed.open('simplefoo.fa'))[0].sequence

    score, alignment = align_long(ct, aligner, seq)

    print len(seq), alignment.refseqlen()

    for start in range(0, len(alignment), 60):
        print alignment[start:start+60]
    
    gidx = AlignmentIndex(alignment)
    gidx._sanityCheck(seq)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    args = parser.parse_args()

    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.ReadAligner(ct, 5, 1.0)

    for record in screed.open(args.ref):
        s = record.sequence
        s = s.replace('N', 'A')

        score, graph_alignment, read_alignment, truncated = \
               aligner.align(s)

        #assert not truncated

        g = graph_alignment  #.replace('-', '')
        r = read_alignment  #.replace('-', '')

        line1 = []
        line2 = []
        line3 = []
        for n, (a, b) in enumerate(zip(g, r)):
            line1.append(a)
            line3.append(b)
            if a != b:
                line2.append(' ')
            else:
                line2.append('|')

        print '::', record.name, score, truncated
        for start in range(0, len(line1), 60):
            print "".join(line1[start:start + 60])
            print "".join(line2[start:start + 60])
            print "".join(line3[start:start + 60])
            print '--'
Esempio n. 25
0
def main():
    parser = build_counting_args()
    parser.add_argument("-t",
                        "--trusted-cutoff",
                        dest="trusted_cutoff",
                        type=int,
                        default=3)
    parser.add_argument(
        "--bits-theta",
        help=
        "Tuning parameter controlling trade off of speed vs alignment sensitivity",
        default=1.0,
        type=float,
        dest="bits_theta")
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_tables
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_tablesize
        print >> sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (
            args.n_tables * args.min_tablesize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out != None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            ##
            score, graph_alignment, read_alignment, truncated = aligner.align(
                record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                if False:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                #if mincount < DESIRED_COVERAGE:
                #    keep = True
                #    seq = graph_seq
                #else:
                #    assert not keep

            if details_out != None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread:      {6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n"
                    .format(score, graph_alignment, read_alignment, truncated,
                            keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        if total:
            print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
            total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Esempio n. 26
0
def main():
    """Keep reads whose corrected sequence is still low-abundance.

    For every input file, each read of at least K bases is aligned against
    the count graph.  A read is kept when its alignment is truncated, or
    when the minimum k-mer count of the gap-stripped graph alignment is
    below ``--normalize-to``.  Kept reads have their cleaned sequence
    (upper-cased, 'N' -> 'A') consumed into the graph, and the original
    record sequence is written as FASTA to ``<basename>.keepvar``.

    Optional outputs: a per-read details file (``--details-out``) and the
    saved graph (``-s/--savehash``).
    """
    parser = build_counting_args()
    parser.add_argument("-t",
                        "--trusted-cutoff",
                        dest="trusted_cutoff",
                        type=int,
                        default=3)
    # The trailing space after "controlling" was missing, so the two
    # adjacent literals rendered as "controllingtrade off".
    parser.add_argument("--bits-theta",
                        help="Tuning parameter controlling "
                        "trade off of speed vs alignment sensitivity",
                        default=1.0,
                        type=float,
                        dest="bits_theta")
    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base cutoff on abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_tables, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % \
            args.max_tablesize, file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % \
            (args.n_tables * args.max_tablesize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    filenames = args.input_filenames

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_countgraph(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.Countgraph(K, HT_SIZE, N_HT)

    aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta)

    details_out = None
    if args.details_out is not None:
        details_out = open(args.details_out, "w")

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepvar'

        # The context manager closes (and flushes) each output file as
        # soon as its input is done; it used to be left open.
        with open(output_name, 'w') as outfp:
            for n, record in enumerate(screed.open(input_filename)):
                if n > 0 and n % 10000 == 0:
                    print('... kept', total - discarded, 'of', total,
                          ', or',
                          int(100. - discarded / float(total) * 100.), '%')
                    print('... in file', input_filename)

                total += 1

                # Reads shorter than K are skipped without being counted
                # as discarded.
                if len(record.sequence) < K:
                    continue

                seq = record.sequence.upper().replace('N', 'A')

                # build the alignment...
                # NOTE(review): the raw record.sequence is aligned, not
                # the cleaned `seq` -- confirm this is intended.
                score, graph_alignment, read_alignment, truncated = \
                    aligner.align(record.sequence)

                # next, decide whether or not to keep it.
                keep = False
                if truncated:
                    keep = True  # keep all truncated alignments - why?
                else:
                    # build a better sequence -- this is the corrected one.
                    graph_seq = graph_alignment.replace("-", "")

                    # get the minimum count for this new sequence
                    mincount = ht.get_min_count(graph_seq)
                    if mincount < args.normalize_to:
                        keep = True

                if details_out is not None:
                    details_out.write(
                        "+{7}\t{0:0.2f}\t{3}\t{4}\nread:      "
                        "{6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n"
                        "".format(score, graph_alignment, read_alignment,
                                  truncated, keep, seq, record.sequence,
                                  record.name))

                if keep:
                    # The cleaned sequence feeds the graph; the original
                    # record sequence is what gets written out.
                    ht.consume(seq)
                    outfp.write('>%s\n%s\n' % (record.name, record.sequence))
                else:
                    discarded += 1

        if total:
            print('DONE with', input_filename,
                  '; kept', total - discarded, 'of', total, 'or',
                  int(100. - discarded / float(total) * 100.), '%')
        print('output in', output_name)

    # Release the details file now that every input has been processed.
    if details_out is not None:
        details_out.close()

    if args.savehash:
        print('Saving hashfile through', input_filename)
        print('...saving to', args.savehash)
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht, args.force, max_false_pos=.2)
    print('fp rate estimated to be %1.3f' % fp_rate)
Esempio n. 27
0
def main():
    """Two-pass, streaming read correction against a k-mer countgraph.

    First pass, per input file: a read (or read pair) whose median k-mer
    count is below ``args.normalize_to`` is consumed into the countgraph
    and set aside in a temporary ``.pass2`` file; every other read is
    corrected via ``correct_sequence`` and written to the ``.corr`` output
    (or to ``args.out`` when given).

    Second pass: the reads set aside are re-examined against the now
    fuller countgraph.  With ``args.variable_coverage`` set, reads still
    below the limit are written unchanged; all others are corrected.
    Single reads that do not align are not written at all.

    Statistics go to stderr; the countgraph is saved when
    ``args.savegraph`` is set.  Python 2 syntax (print statements).
    """
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        check_space_for_graph(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print >>sys.stderr, 'loading k-mer countgraph from', args.loadgraph
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print >>sys.stderr, 'making k-mer countgraph'
        ct = khmer.new_countgraph(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
                        'use -T to change location' % tempdir

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        # corrfp stays open: the second pass appends to the same handle.
        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                # If either mate is still low-coverage, both go to pass 2.
                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    # Pairs are always written together, aligned or not.
                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    # A single read that does not align is dropped.
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
            % (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print >>sys.stderr, ('second pass: looking at sequences kept aside '
                             'in %s') % pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

        # Close the per-input .corr file opened in the first pass (it was
        # previously never closed).  A caller-supplied args.out is shared
        # by all inputs and is left open for its owner.
        if args.out is None:
            corrfp.close()

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print >>sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,)
    print >>sys.stderr, 'wrote %d reads, %d bp' % (written_reads, written_bp,)
    print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
        (save_pass2_total, n_passes)
    print >>sys.stderr, 'removed %d reads and corrected %d reads (%.2f%%)' % \
        (n_reads - written_reads, corrected_reads, percent_reads_corrected)
    print >>sys.stderr, 'removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
            (n_reads - skipped_n, percent_reads_hicov)
        # The space after "low" was missing ("lowcoverage").
        print >>sys.stderr, ('skipped %d reads/%d bases because of low '
                             'coverage') % (skipped_n, skipped_bp)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    print >>sys.stderr, 'output in *.corr'

    if args.savegraph:
        print >>sys.stderr, "Saving k-mer countgraph to", args.savegraph
        ct.save(args.savegraph)
Esempio n. 28
0
def test_graph_attribute():
    """A ReadAligner's ``graph`` attribute is the graph it was built from."""
    countgraph = khmer.Countgraph(10, 1048576, 1)
    # Identity, not equality: the aligner must hand back the same object.
    assert khmer.ReadAligner(countgraph, 0, 0).graph is countgraph
Esempio n. 29
0
def _corrected_sequence(aligner, sequence):
    """Return the graph's version of *sequence*, or None if truncated.

    'N' bases are replaced with 'A' before aligning.  The result is the
    graph side of the alignment with gap characters ('-') removed.
    """
    masked = sequence.replace('N', 'A')
    _, graph_alignment, _, truncated = aligner.align(masked)
    if truncated:
        return None
    return graph_alignment.replace('-', '')


def main():
    """Correct reads by aligning them to a k-mer counting hash, in two passes.

    Pass 1, per input file: every read is aligned against the hash built so
    far.  A read whose alignment is not truncated and whose corrected
    sequence has a minimum k-mer count below ``--normalize-to`` is written,
    corrected, to ``<basename>.corr`` in the current directory.  Every other
    read is loaded into the hash and set aside, unchanged, in a ``.pass2``
    file inside a temporary directory.

    Pass 2: the set-aside reads are aligned again and appended to the same
    ``.corr`` file; a read whose alignment is truncated is written as-is.
    Each ``.pass2`` file, and finally the temporary directory, is removed.

    Defaults for ``--ksize``, ``--n_hashes`` and ``--hashsize`` can be
    overridden with the KHMER_KSIZE, KHMER_N_HASHES and KHMER_MIN_HASHSIZE
    environment variables.  Progress and totals are printed to stdout.
    """
    parser = argparse.ArgumentParser(description='XXX')

    # argparse applies ``type`` to string defaults, so values read from the
    # environment are converted the same way as command-line values.
    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE)

    parser.add_argument('--ksize',
                        '-k',
                        type=int,
                        dest='ksize',
                        default=env_ksize,
                        help='k-mer size to use')
    parser.add_argument('--n_hashes',
                        '-N',
                        type=int,
                        dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize',
                        '-x',
                        type=float,
                        dest='min_hashsize',
                        default=env_hashsize,
                        help='lower bound on hashsize to use')

    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)

    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base cutoff on median k-mer abundance of this',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('--tempdir',
                        '-T',
                        type=str,
                        dest='tempdir',
                        default='./')

    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    print('making hashtable')
    ht = khmer.new_counting_hash(args.ksize, args.min_hashsize, args.n_hashes)

    aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; use -T to change location' %
          tempdir)

    # Running totals across all input files.
    save_pass2 = 0
    n_aligned = 0
    n_corrected = 0
    total_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        basename = os.path.basename(filename)
        pass2filename = os.path.join(tempdir, basename + '.pass2')
        corrfilename = basename + '.corr'

        pass2list.append((filename, pass2filename, corrfilename))

        # Per-file counters for the summary line; counting explicitly keeps
        # the summary correct (and defined) even when the file has no reads.
        file_reads = 0
        file_kept_aside = 0

        with open(pass2filename, 'w') as pass2fp, \
                open(corrfilename, 'w') as corrfp:
            for n, read in enumerate(screed.open(filename)):
                total_reads += 1
                file_reads += 1

                if n % 10000 == 0:
                    print('... %s %s %s %s %s %s' %
                          (n, filename, n_aligned, n_corrected, save_pass2,
                           total_reads))

                corrected = _corrected_sequence(aligner, read.sequence)

                # next, decide whether or not to keep it.
                output_corrected = False
                if corrected is not None:
                    n_aligned += 1
                    if corrected != read.sequence:
                        n_corrected += 1

                    # get the minimum count for this new sequence
                    mincount = ht.get_min_count(corrected)
                    output_corrected = mincount < args.normalize_to

                if output_corrected:
                    corrfp.write(output_single(read, corrected))
                else:
                    # consume & save the original read => pass2.
                    ht.consume(read.sequence)
                    pass2fp.write(output_single(read, read.sequence))
                    save_pass2 += 1
                    file_kept_aside += 1

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, file_kept_aside, file_reads, pass2filename))
        print('aligned %d of %d reads so far' % (n_aligned, total_reads))
        print('changed %d of %d reads so far' % (n_corrected, total_reads))

    # NOTE(review): reads that aligned in pass 1 but were still set aside are
    # counted in n_aligned / n_corrected again below, so the final totals can
    # exceed total_reads.  Kept as in the original; confirm this is intended.
    for _, pass2filename, corrfilename in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename)

        # Open the output once per file rather than once per read.
        with open(corrfilename, 'a') as corrfp:
            for n, read in enumerate(screed.open(pass2filename)):
                if n % 10000 == 0:
                    print('... x 2 %s %s %s %s %s' %
                          (n, pass2filename, n_aligned, n_corrected,
                           total_reads))

                corrected = _corrected_sequence(aligner, read.sequence)
                if corrected is None:  # no good alignment; output original
                    corrected = read.sequence
                else:
                    n_aligned += 1
                    if corrected != read.sequence:
                        n_corrected += 1

                corrfp.write(output_single(read, corrected))

        print('removing %s' % pass2filename)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir)
    shutil.rmtree(tempdir)

    print('Aligned %d of %d total' % (n_aligned, total_reads))
    print('Changed %d of %d total' % (n_corrected, total_reads))
def main():
    """Map reads onto reference sequences through a tagged k-mer graph.

    Takes two positional command-line arguments, ``reference`` and
    ``readfile``.  Every reference record is kept in memory and loaded into
    a counting hash with tagging; for each tag the (record name, position)
    pairs it was seen at are remembered.  Each read is then aligned to the
    graph, the tags found in its graph alignment select candidate reference
    regions, and each region is aligned back against a small graph built
    from the read alone.  For every region a ``Read ... aligns to ...`` line
    and the trimmed alignment are printed to stdout.

    Reads whose graph alignment is truncated are reported on stderr and
    skipped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('reference')
    parser.add_argument('readfile')
    args = parser.parse_args()

    # hard-coded graph parameters -- presumably (k-mer size, table size,
    # number of tables); confirm against the khmer API.
    ct = khmer.new_counting_hash(21, 1e7, 4)

    tags_to_positions = {}  # tag -> list of (record name, position)
    references = {}         # record name -> full sequence
    for record in screed.open(args.reference):
        # store for later retrieval - in memory, for now.
        references[record.name] = record.sequence

        # load into graph & tag
        ct.consume_and_tag(record.sequence)

        # track positions in reference by tag
        tagposns = ct.get_tags_and_positions(record.sequence)
        for pos, tag in tagposns:
            x = tags_to_positions.get(tag, [])
            x.append((record.name, pos))
            tags_to_positions[tag] = x

    # now, walk through the reads and map to graph
    aligner = khmer.ReadAligner(ct, 0, 1.0)
    for read in screed.open(args.readfile):

        # align to graph, where possible; N bases are replaced with A first
        readseq = read.sequence.replace('N', 'A')
        score, g, r, truncated = aligner.align(readseq)
        if truncated:
            print >> sys.stderr, "IGNORING read", read.name
            continue

        # find locations in reference where read alignment overlaps a tag
        refseq = g.replace('-', '')
        ptags = ct.get_tags_and_positions(refseq)
        # aborts the run if the graph alignment contains no tag at all
        assert len(ptags)

        # every tag is looked up directly, so a tag that was not recorded
        # while loading the reference raises KeyError here
        refposns = []
        for pos, tag in ptags:
            refposns.extend(tags_to_positions[tag])

        # extract the larger region, remap read to get exact positions
        # (turn_locations_into_regions is defined outside this block; it is
        # consumed here as an iterable of (ref, start, end) triples)
        regions = turn_locations_into_regions(refposns)
        for (ref, start, end) in regions:

            # pull out reference region, widened by REGIONSIZE / 2 on each
            # side and clamped to the bounds of the reference sequence
            # (REGIONSIZE is defined outside this block)
            referenceseq = references[ref]
            start = max(start - REGIONSIZE / 2, 0)
            end = min(end + REGIONSIZE / 2, len(referenceseq))
            regionseq = referenceseq[start:end]

            # align region back to read: a fresh graph holding only this
            # read, against which the reference region is aligned
            nct = khmer.new_counting_hash(21, 1e5, 4)
            nct.consume(readseq)
            naligner = khmer.ReadAligner(nct, 1, 1.0)
            score, galign = graphAlignment.align_long(nct, naligner, regionseq)

            # n: index of the first alignment column whose first element is
            # not '=' (left at the last index if no such column exists)
            for n, (a, b) in enumerate(galign):
                if a != '=':
                    break

            # o: one past the last column whose first element is not '='
            # NOTE(review): there is no lower bound on o -- if every column
            # starts with '=', this loop keeps walking below index 0.
            o = len(galign)
            while 1:
                (a, b) = galign[o - 1]
                if a != '=':
                    break
                o -= 1

            # after trimming both ends, no '=' may remain in the .g part
            if '=' in galign[n:o].g:
                assert 0

            # NOTE(review): gidx is not used in the code visible here
            gidx = graphAlignment.AlignmentIndex(galign)

            print 'Read %s aligns to %s[%s:%s]' % (read.name, ref, start + n,
                                                   start + o)
            print galign[n:o]