Example #1
0
def test_alignerrorregion():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "AAAAAGTTCGAAAAAGGCACG"
    aligner = khmer.new_readaligner(ch, 1, 20, 11)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACTATTAAAAAAGTTCGAAAAAGGCACGGG")
    graphAlign, readAlign = aligner.align(read)

    assert readAlign == ''
    assert graphAlign == ''
Example #2
0
def test_alignnocov():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.new_readaligner(ch)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACCTAGGTTCGACATGTACC")
    graphAlign, readAlign = aligner.align(read)

    # should be the same
    assert readAlign == 'ACCTAGGTTCGACATGTACC'
    assert graphAlign == 'ACCTAGGTTCGACATGTACC'
Example #3
0
def test_alignnocov():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.new_readaligner(ch, 0, 0)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACCTAGGTTCGACATGTACC")
    score, graphAlign, readAlign, trunc = aligner.align(read)

    # should be the same
    eq_(readAlign, 'ACCTAGGTTCGACATGTACC')
    eq_(graphAlign, 'ACCTAGGTTCGACATGTACC')
Example #4
0
def test_alignnocov():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    read = "ACCTAGGTTCGACATGTACC"
    aligner = khmer.new_readaligner(ch, 0, 0)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    ch.consume("ACCTAGGTTCGACATGTACC")
    score, graphAlign, readAlign, trunc = aligner.align(read)

    # should be the same
    eq_(readAlign, 'ACCTAGGTTCGACATGTACC')
    eq_(graphAlign, 'ACCTAGGTTCGACATGTACC')
Example #5
0
def test_readalign_new():
    ch = khmer.new_counting_hash(32, 1048576, 1)
    aligner = khmer.new_readaligner(ch, 1, 0)
    for seq in ht_seqs:
        ch.consume(seq)

    for query in queries:
        score, graphAlign, readAlign, trunc = aligner.align(query["seq"])
        print graphAlign
        print readAlign
        eq_(graphAlign, query["graph_aln"])
        eq_(readAlign, query["read_aln"])
        eq_(trunc, query["truncated"])
Example #6
0
def test_readalign_new():
    ch = khmer.new_counting_hash(32, 1048576, 1)
    aligner = khmer.new_readaligner(ch, 1, 0)
    for seq in ht_seqs:
        ch.consume(seq)

    for query in queries:
        score, graphAlign, readAlign, trunc = aligner.align(query["seq"])
        print graphAlign
        print readAlign
        eq_(graphAlign, query["graph_aln"])
        eq_(readAlign, query["read_aln"])
        eq_(trunc, query["truncated"])
Example #7
0
def test_readalign():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    aligner = khmer.new_readaligner(ch, 1, 20)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    read = "ACCTAGGTTCGACATGTACC"
    #                      ^^            ^  ^

    ch.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")

    graphAlign, readAlign = aligner.align(read)

    assert readAlign == 'ACCTAGGTTCGACATGTACC'
    assert graphAlign == 'AGCTAGGTTCGACAAGT-CC', graphAlign
Example #8
0
def test_readalign():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    aligner = khmer.new_readaligner(ch, 1, 0)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    read = "ACCTAGGTTCGACATGTACC"
    #                      ^^            ^  ^

    ch.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")

    score, graphAlign, readAlign, trunc = aligner.align(read)

    eq_(readAlign, 'ACCTAGGTTCGACATGTACc')
    eq_(graphAlign, 'AGCTAGGTTCGACAAGTCC-')
Example #9
0
def test_readalign():
    ch = khmer.new_counting_hash(10, 1048576, 1)
    aligner = khmer.new_readaligner(ch, 1, 0)
    for i in range(20):
        ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
    read = "ACCTAGGTTCGACATGTACC"
    #                      ^^            ^  ^

    ch.consume("GCTTTTAAAAAGGTTCGACAAAGGCCCGGG")

    score, graphAlign, readAlign, trunc = aligner.align(read)

    eq_(readAlign, 'ACCTAGGTTCGACATGTACc')
    eq_(graphAlign, 'AGCTAGGTTCGACAAGTCC-')
Example #10
0
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record['name']
        seq = record['sequence']

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq
Example #11
0
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record["name"]
        seq = record["sequence"]

        seq = seq.replace("N", "A")

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace("-", "")
            seq = graph_seq

        return name, seq
Example #12
0
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record['name']
        seq = record['sequence']

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq
Example #13
0
def main():
    parser = build_counting_args()
    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print >> sys.stderr, 'file with ht: %s' % counting_ht

    print >> sys.stderr, 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    aligner = khmer.new_readaligner(
        ht, args.trusted_cov, args.theta
    )  # counting hash, trusted kmer coverage cutoff, bits theta (threshold value for terminating unproductive alignemnts)

    ### the filtering loop
    for infile in infiles:
        print >> sys.stderr, 'aligning', infile
        for n, record in enumerate(screed.open(infile)):

            name = record['name']
            seq = record['sequence'].upper()
            print >> sys.stderr, name
            print >> sys.stderr, seq

            score, graph_alignment, read_alignment, truncated = aligner.align(
                seq)
            print >> sys.stderr, score
            print >> sys.stderr, graph_alignment
            print >> sys.stderr, read_alignment
            print >> sys.stderr, truncated
            print ">{0}\n{1}".format(name, graph_alignment)
Example #14
0
def main():
    hash_filename = sys.argv[1]
    input_filename = sys.argv[2]
    output_filename = sys.argv[3]
    max_error_region = int(sys.argv[4])

    C = 20  # 20

    corrected = 0
    uncorrected = 0

    outfp = open(output_filename, 'w')

    ht = khmer.load_counting_hash(hash_filename)
    aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

    K = ht.ksize()

    for n, record in enumerate(screed.open(input_filename)):
        if n % 1000 == 0:
            print n

        seq = record.sequence
        seq_name = record.name

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            corrected += 1
            outfp.write('>%s\n%s\n' % (seq_name, graph_seq))
        else:
            uncorrected += 1
            outfp.write('>%s\n%s\n' % (seq_name, seq))


    print 'corrected', corrected
    print 'uncorrected', uncorrected

    outfp.close()
Example #15
0
def main():
    parser = build_counting_args()
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int, default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print >>sys.stderr, 'file with ht: %s' % counting_ht

    print >>sys.stderr, 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    aligner = khmer.new_readaligner(ht, args.trusted_cov, args.theta) # counting hash, trusted kmer coverage cutoff, bits theta (threshold value for terminating unproductive alignemnts)
    
    ### the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'aligning', infile
        for n, record in enumerate(screed.open(infile)):

            name = record['name']
            seq = record['sequence'].upper()
            print >>sys.stderr, name
            print >>sys.stderr, seq

            score, graph_alignment, read_alignment, truncated = aligner.align(seq)
            print >>sys.stderr, score
            print >>sys.stderr, graph_alignment
            print >>sys.stderr, read_alignment
            print >>sys.stderr, truncated
            print ">{0}\n{1}".format(name, graph_alignment)
Example #16
0
def main():
    parser = build_counting_args()
    parser.add_argument("-t",
                        "--trusted-cutoff",
                        dest="trusted_cutoff",
                        type=int,
                        default=3)
    parser.add_argument(
        "--bits-theta",
        help=
        "Tuning parameter controlling trade off of speed vs alignment sensitivity",
        default=1.0,
        type=float,
        dest="bits_theta")
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >> sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (
            args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out != None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            ##
            score, graph_alignment, read_alignment, truncated = aligner.align(
                record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                if False:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                #if mincount < DESIRED_COVERAGE:
                #    keep = True
                #    seq = graph_seq
                #else:
                #    assert not keep

            if details_out != None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread:      {6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n"
                    .format(score, graph_alignment, read_alignment, truncated,
                            keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
            total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example #17
0
def main():
    parser = argparse.ArgumentParser(description='XXX')

    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE)

    parser.add_argument('--ksize',
                        '-k',
                        type=int,
                        dest='ksize',
                        default=env_ksize,
                        help='k-mer size to use')
    parser.add_argument('--n_hashes',
                        '-N',
                        type=int,
                        dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize',
                        '-x',
                        type=float,
                        dest='min_hashsize',
                        default=env_hashsize,
                        help='lower bound on hashsize to use')

    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)

    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base cutoff on median k-mer abundance of this',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('--tempdir',
                        '-T',
                        type=str,
                        dest='tempdir',
                        default='./')

    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    NORMALIZE_LIMIT = args.normalize_to

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cov, args.bits_theta)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print 'created temporary directory %s; use -T to change location' % tempdir

    ###

    save_pass2 = 0
    n_aligned = 0
    n_corrected = 0
    total_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        corrfilename = os.path.basename(filename) + '.corr'

        pass2list.append((filename, pass2filename, corrfilename))

        pass2fp = open(pass2filename, 'w')
        corrfp = open(corrfilename, 'w')

        for n, read in enumerate(screed.open(filename)):
            total_reads += 1

            if n % 10000 == 0:
                print '...', n, filename, n_aligned, n_corrected, save_pass2, \
                      total_reads
            seq = read.sequence.replace('N', 'A')

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(read.sequence)

            # next, decide whether or to keep it.
            output_corrected = False
            if not truncated:
                n_aligned += 1

                # build a better sequence -- this is the corrected one.
                if True:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                corrected = graph_seq
                if graph_seq != read.sequence:
                    n_corrected += 1

                # get the minimum count for this new sequence
                mincount = ht.get_min_count(graph_seq)
                if mincount < args.normalize_to:
                    output_corrected = True

            # has this portion of the graph saturated? if not,
            # consume & save => pass2.
            if output_corrected:
                corrfp.write(output_single(read, corrected))
            else:  # uncorrected...
                ht.consume(read.sequence)
                pass2fp.write(output_single(read, read.sequence))
                save_pass2 += 1

        pass2fp.close()
        corrfp.close()

        print '%s: kept aside %d of %d from first pass, in %s' % \
              (filename, save_pass2, n, filename)
        print 'aligned %d of %d reads so far' % (n_aligned, total_reads)
        print 'changed %d of %d reads so far' % (n_corrected, total_reads)

    for orig_filename, pass2filename, corrfilename in pass2list:
        print 'second pass: looking at sequences kept aside in %s' % \
              pass2filename
        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print '... x 2', n, pass2filename, n_aligned, n_corrected, \
                      total_reads

            corrfp = open(corrfilename, 'a')

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(read.sequence)

            if truncated:  # no good alignment; output original
                corrected = read.sequence
            else:
                n_aligned += 1
                # build a better sequence -- this is the corrected one.
                if True:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                corrected = graph_seq
                if corrected != read.sequence:
                    n_corrected += 1

            corrfp.write(output_single(read, corrected))

        print 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    print 'Aligned %d of %d total' % (n_aligned, total_reads)
    print 'Changed %d of %d total' % (n_corrected, total_reads)
Example #18
0
def main():
    parser = build_counting_args()
    parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff", type=int, default=3)
    parser.add_argument("--bits-theta", help="Tuning parameter controlling trade off of speed vs alignment sensitivity", default=1.0, type=float, dest="bits_theta")
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (
            args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)
            
    if args.details_out != None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            ##
            score, graph_alignment, read_alignment, truncated = aligner.align(record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                if False:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                #if mincount < DESIRED_COVERAGE:
                #    keep = True
                #    seq = graph_seq
                #else:
                #    assert not keep

            if details_out != None:
                details_out.write("+{7}\t{0:0.2f}\t{3}\t{4}\nread:      {6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n".format(score, graph_alignment, read_alignment, truncated, keep, seq, record.sequence, record.name))


            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
            total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example #19
0
def main():
    parser = argparse.ArgumentParser(description='XXX')

    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE)

    parser.add_argument('--ksize', '-k', type=int, dest='ksize',
                        default=env_ksize,
                        help='k-mer size to use')
    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize',
                        default=env_hashsize,
                        help='lower bound on hashsize to use')

    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=2)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)

    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='base cutoff on median k-mer abundance of this',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('--tempdir', '-T', type=str, dest='tempdir',
                        default='./')

    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    NORMALIZE_LIMIT = args.normalize_to

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cov, args.bits_theta)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print 'created temporary directory %s; use -T to change location' % tempdir

    ###

    save_pass2 = 0
    n_aligned = 0
    n_corrected = 0
    total_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        corrfilename = os.path.basename(filename) + '.corr'

        pass2list.append((filename, pass2filename, corrfilename))

        pass2fp = open(pass2filename, 'w')
        corrfp = open(corrfilename, 'w')

        for n, read in enumerate(screed.open(filename)):
            total_reads += 1

            if n % 10000 == 0:
                print '...', n, filename, n_aligned, n_corrected, save_pass2, \
                      total_reads
            seq = read.sequence.replace('N', 'A')

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(read.sequence)

            # next, decide whether or to keep it.
            output_corrected = False
            if not truncated:
                n_aligned += 1

                # build a better sequence -- this is the corrected one.
                if True:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                corrected = graph_seq
                if graph_seq != read.sequence:
                    n_corrected += 1

                # get the minimum count for this new sequence
                mincount = ht.get_min_count(graph_seq)
                if mincount < args.normalize_to:
                    output_corrected = True

            # has this portion of the graph saturated? if not,
            # consume & save => pass2.
            if output_corrected:
                corrfp.write(output_single(read, corrected))
            else:  # uncorrected...
                ht.consume(read.sequence)
                pass2fp.write(output_single(read, read.sequence))
                save_pass2 += 1

        pass2fp.close()
        corrfp.close()

        print '%s: kept aside %d of %d from first pass, in %s' % \
              (filename, save_pass2, n, filename)
        print 'aligned %d of %d reads so far' % (n_aligned, total_reads)
        print 'changed %d of %d reads so far' % (n_corrected, total_reads)

    for orig_filename, pass2filename, corrfilename in pass2list:
        print 'second pass: looking at sequences kept aside in %s' % \
              pass2filename
        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print '... x 2', n, pass2filename, n_aligned, n_corrected, \
                      total_reads

            corrfp = open(corrfilename, 'a')

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(read.sequence)

            if truncated:               # no good alignment; output original
                corrected = read.sequence
            else:
                n_aligned += 1
                # build a better sequence -- this is the corrected one.
                if True:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                corrected = graph_seq
                if corrected != read.sequence:
                    n_corrected += 1

            corrfp.write(output_single(read, corrected))

        print 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    print 'Aligned %d of %d total' % (n_aligned, total_reads)
    print 'Changed %d of %d total' % (n_corrected, total_reads)
Example #20
0
import math

hash_filename = sys.argv[1]
input_filename = sys.argv[2]
output_filename = sys.argv[3]
max_error_region = int(sys.argv[4])

C = 20  # 20

corrected = 0
uncorrected = 0

outfp = open(output_filename, 'w')

ht = khmer.load_counting_hash(hash_filename)
aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

K = ht.ksize()

for n, record in enumerate(screed.open(input_filename)):
    if n % 1000 == 0:
        print n

    seq = record.sequence
    seq_name = record.name

    seq = seq.replace('N', 'A')

    grXreAlign, reXgrAlign = aligner.align(seq)

    if len(reXgrAlign) > 0:
Example #21
0
File: ec.py Project: jiarong/khmer
import math

hash_filename = sys.argv[1]
input_filename = sys.argv[2]
output_filename = sys.argv[3]
max_error_region = int(sys.argv[4])

C = 20  # 20

corrected = 0
uncorrected = 0

outfp = open(output_filename, "w")

ht = khmer.load_counting_hash(hash_filename)
aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

K = ht.ksize()

for n, record in enumerate(screed.open(input_filename)):
    if n % 1000 == 0:
        print n

    seq = record.sequence
    seq_name = record.name

    seq = seq.replace("N", "A")

    grXreAlign, reXgrAlign = aligner.align(seq)

    if len(reXgrAlign) > 0: