Example #1
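# Example #1 tallies, for every read, the triple of median k-mer counts taken
# from three counting tables, then writes out the dense count cube.  Assumes
# the usual preamble for these scripts: import argparse, khmer, screed.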
def main():
    parser = argparse.ArgumentParser(description="Get reads coverage matrix")
    
    parser.add_argument('hashname1')
    parser.add_argument('hashname2')
    parser.add_argument('hashname3')
    parser.add_argument('file1')
    parser.add_argument('file2')
    parser.add_argument('file3')
    parser.add_argument('output')

    args = parser.parse_args()
    hashname1 = args.hashname1
    hashname2 = args.hashname2
    hashname3 = args.hashname3
    output = args.output
    file1 = args.file1
    file2 = args.file2
    file3 = args.file3
    outfp = open(output, 'w')

    print 'hashtable from', hashname1
    ht1 = khmer.load_counting_hash(hashname1)
    print 'hashtable from', hashname2
    ht2 = khmer.load_counting_hash(hashname2)
    print 'hashtable from', hashname3
    ht3 = khmer.load_counting_hash(hashname3)
    matrix = {}

    set_x = set()
    set_y = set()
    set_z = set()
    
    for file_n in [file1, file2, file3]:
        print 'reading reads file', file_n
        for n, record in enumerate(screed.open(file_n)):
            if n > 0 and n % 100000 == 0:  # every 100000 reads
                print '...', n, file_n
            seq = record.sequence.replace('N', 'A')
            med1, _, _ = ht1.get_median_count(seq)
            set_x.add(med1)
            med2, _, _ = ht2.get_median_count(seq)
            set_y.add(med2)
            med3, _, _ = ht3.get_median_count(seq)
            set_z.add(med3)
            key = str(med1)+'-'+str(med2)+'-'+str(med3)
            matrix[key] = matrix.get(key,0) + 1


    # note: +1 so the ranges include the largest observed median values
    for x in range(max(set_x) + 1):
        for y in range(max(set_y) + 1):
            for z in range(max(set_z) + 1):
                key = '%d-%d-%d' % (x, y, z)
                outfp.write('%s %d\n' % (key, matrix.get(key, 0)))

    outfp.close()
Example #2
def test_normalize_by_median_dumpfrequency():
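    # normalize-by-median.py is run over five input files with '-d 2', which
    # (per the assertions below) saves a backup counting table twice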
    CUTOFF = "1"

    infiles = [utils.get_temp_filename("test-0.fq")]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(utils.get_temp_filename("test-{x}.fq".format(x=x),
                                                tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data("test-fastq-reads.fq"), infile)

    script = scriptpath("normalize-by-median.py")
    args = ["-d", "2", "-C", CUTOFF, "-k", "17"]
    args.extend(infiles)

    (status, out, err) = runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, "backup.ht"))
    test_good_read = "CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT"
    test_good_read2 = "TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA"
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, "backup.ht"))
    assert out.count("Backup: Saving") == 2
    assert "Nothing" in out
Example #3
def main():
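    # Look up each input line as a single k-mer in the counting table, dump
    # high-abundance entries and a cumulative count histogram, then plot.
    # hist() and savefig() are assumed to come from matplotlib's pylab.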

    hashfile = sys.argv[1]
    filename = sys.argv[2]
    figure = sys.argv[3]

    ht = khmer.load_counting_hash(hashfile)

    outabund = open(os.path.basename(filename) + '.counts', 'w')

    counts = []
    d = {}
    for sequence in open(filename):
        sequence = sequence.strip()

        count = ht.get(sequence)
        counts.append(count)
        d[count] = d.get(count, 0) + 1

        if count > 1000:
            print >> outabund, sequence, count

    outfp = open(figure + '.countshist', 'w')
    sofar = 0
    sofar_cumu = 0
    for k in sorted(d.keys()):
        sofar += d[k]
        sofar_cumu += k * d[k]
        print >> outfp, k, d[k], sofar, sofar_cumu

    hist(counts, normed=True, cumulative=True, bins=100, range=(1, 1000))
    savefig(figure)
Example #4
def test_normalize_by_median_dumpfrequency():
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(
            utils.get_temp_filename('test-{x}.fq'.format(x=x), tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = utils.runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
Example #5
def test_normalize_by_median_force():
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename('test-good.fq',
                                          tempdir=os.path.dirname(
                                              corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'), corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0
    assert os.path.exists(corrupt_infile + '.ct.failed')
    assert '*** Skipping' in err
    assert '** IOErrors' in err
Example #6
def main():
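    # CUTOFF, WORKER_THREADS, and GROUPSIZE are assumed to be module-level
    # constants in the original script (not shown in this excerpt)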
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print 'file with ht: %s' % counting_ht
    print '-- settings:'
    print 'N THREADS', WORKER_THREADS
    print '--'

    print 'making hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.below'

        outfp = open(outfile, 'w')

        def process_fn(record, ht=ht):
            name = record['name']
            seq = record['sequence']
            if 'N' in seq:
                return None, None

            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

            if trim_at >= K:
                return name, trim_seq

            return None, None

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

        tsp.start(verbose_fasta_iter(infile), outfp)
Example #7
def main():
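    # Annotate every record with its median k-mer count ('kmed') from the
    # counting table; output goes to <basename>.kannot in FASTA format.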
    parser = argparse.ArgumentParser()

    parser.add_argument('hashname')
    parser.add_argument('datafiles', nargs='+')

    args = parser.parse_args()
    hashfile = args.hashname
    datafiles = args.datafiles

    print 'loading counting hash'
    ht = khmer.load_counting_hash(hashfile)
    print 'loaded.'

    for datafile in datafiles:
        print 'annotating', datafile

        outfile = os.path.basename(datafile) + '.kannot'
        outfp = open(outfile, 'w')

        for n, record in enumerate(screed.open(datafile)):
            if n % 1000 == 0:
                print '...', n
            med, _, _ = ht.get_median_count(record.sequence)
            outfp.write('>%s kmed=%d\n%s\n' % (record.name, med,
                                               record.sequence))
Example #8
def test_normalize_by_median_dumpfrequency():
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(utils.get_temp_filename('test-{x}.fq'.format(x=x),
                                               tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
Example #9
def main():
    parser = argparse.ArgumentParser(description="Output k-mer abundance distribution.")
    
    parser.add_argument('hashname')
    parser.add_argument('seqfile')
    #parser.add_argument('histout')

    args = parser.parse_args()
    hashfile = args.hashname
    seqfile = args.seqfile
    #histout = args.histout

    fp = open(seqfile.split('.fa')[0] + '.cov.fa', 'w')

    print 'hashtable from', hashfile
    ht = khmer.load_counting_hash(hashfile)

    hist = {}

    for n, record in enumerate(screed.open(seqfile)):
        if n > 0 and n % 100000 == 0:
            print '...', n

        seq = record.sequence.replace('N', 'A')
        med, _, _ = ht.get_median_count(seq)

        print >>fp, '>%s_[cov=%f]' % (record.name, med)
        print >>fp, '%s' % record.sequence
Example #10
def test_abund_dist_gz_bigcount():
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    script = 'load-into-counting.py'
    htfile = utils.get_temp_filename('test_ct')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = open(htfile, 'rb').read()
    f_out = gzip.open(outfile, 'wb')  # compress the created bigcount table
    f_out.write(data)
    f_out.close()
    # load the compressed bigcount table
    try:
        counting_hash = khmer.load_counting_hash(outfile)
    except IOError as err:
        assert 0, 'Should not produce IOError: ' + str(err)
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)
    abundances = counting_hash.abundance_distribution(infile, tracking)
    # calculate abundance distribution for compressed bigcount table
    flag = False
    # check whether any abundance is > 255;
    # if so, the gzipped bigcount table was loaded correctly
    for _, i in enumerate(abundances):
        print(_, i)
        if _ > 255 and i > 0:
            flag = True
            break
    assert flag
Example #11
def main():
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    htfile = args.ctfile
    input_filename = args.input
    output_filename = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print 'loading k-mer counting table from', htfile
    htable = khmer.load_counting_hash(htfile)
    ksize = htable.ksize()

    print 'writing to', output_filename
    output = open(output_filename, 'w')

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        if ksize <= len(seq):
            medn, ave, stdev = htable.get_median_count(seq)
            print >> output, record.name, medn, ave, stdev, len(seq)
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    args = parser.parse_args()

    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.ReadAligner(ct, 5, 1.0)
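    # align each reference sequence to the graph, then print the count of
    # every k-mer along the gap-free graph alignment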

    for record in screed.open(args.ref):
        s = record.sequence
        s = s.replace('N', 'A')

        score, graph_alignment, read_alignment, truncated = \
               aligner.align(s)

        assert not truncated

        g = graph_alignment.replace('-', '')
        r = read_alignment.replace('-', '')

        print record.name
        for kstart in range(0, len(g) - ct.ksize() + 1):
            kmer = g[kstart:kstart + ct.ksize()]
            print kstart, ct.get(kmer)
Example #13
def test_normalize_by_median_force():
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename(
        'test-good.fq', tempdir=os.path.dirname(corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'), corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = utils.runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0
    assert os.path.exists(corrupt_infile + '.ct.failed')
    assert '*** Skipping' in err
    assert '** IOErrors' in err
Example #14
def main():
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    htfile = args.ctfile
    input_filename = args.input
    output_filename = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print 'loading k-mer counting table from', htfile
    htable = khmer.load_counting_hash(htfile)
    ksize = htable.ksize()

    print 'writing to', output_filename
    output = open(output_filename, 'w')

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        if ksize <= len(seq):
            medn, ave, stdev = htable.get_median_count(seq)
            print >> output, record.name, medn, ave, stdev, len(seq)
Example #15
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("hashname")
    parser.add_argument("datafiles", nargs="+")

    args = parser.parse_args()
    hashfile = args.hashname
    datafiles = args.datafiles

    print "loading counting hash"
    ht = khmer.load_counting_hash(hashfile)
    print "loaded."

    for datafile in datafiles:
        print "annotating", datafile

        outfile = os.path.basename(datafile) + ".kannot"
        outfp = open(outfile, "w")

        for n, record in enumerate(screed.open(datafile)):
            if n % 1000 == 0:
                print "...", n
            med, _, _ = ht.get_median_count(record.sequence)
            outfp.write(">%s kmed=%d\n%s\n" % (record.name, med, record.sequence))
Example #16
def main():
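    # Classify reads by aligning them against a labeled countgraph; 'load'
    # here is presumably pickle.load, reading the pickled list of names.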
    parser = argparse.ArgumentParser()
    parser.add_argument('index')
    parser.add_argument('reads')
    args = parser.parse_args()

    print >>sys.stderr, "Loading graph & labels"
    cg = khmer.load_counting_hash(args.index + '.graph')
    lh = khmer._LabelHash(cg)
    lh.load_labels_and_tags(args.index + '.labels')
    fp = open(args.index + '.list', 'rb')
    names = load(fp)
    fp.close()

    print >>sys.stderr, 'loaded %d references' % (len(names),)
    aligner = khmer.ReadAligner(cg, 1, 1.0)

    # run through all the queries, align, and use alignments to look up
    # the label.
    for record in screed.open(args.reads):
        # build alignments against cg
        _, ga, ra, truncated = aligner.align(record.sequence)

        # now grab the tags associated with the alignment
        ga = ga.replace('-', '')
        labels = lh.sweep_label_neighborhood(ga)

        # retrieve the labels associated with the tags
        matches = set([ names[i] for i in labels ])

        # print out the matches.
        print record.name, len(matches), ", ".join(matches)
Example #17
def main():
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")

    parser.add_argument('hashname')
    parser.add_argument('seqfile')
    parser.add_argument('histout')

    args = parser.parse_args()
    hashfile = args.hashname
    seqfile = args.seqfile
    histout = args.histout

    outfp = open(histout, 'w')

    print 'hashtable from', hashfile
    ht = khmer.load_counting_hash(hashfile)

    hist = {}

    for n, record in enumerate(screed.open(seqfile)):
        if n > 0 and n % 100000 == 0:
            print '...', n

        seq = record.sequence.replace('N', 'A')
        med, _, _ = ht.get_median_count(seq)

        hist[med] = hist.get(med, 0) + 1

    maxk = max(hist.keys())

    for i in range(maxk + 1):
        outfp.write('%d %d\n' % (i, hist.get(i, 0)))
    outfp.close()
Example #18
def main():
    parser = argparse.ArgumentParser(
        description='Count k-mers summary stats for sequences')

    parser.add_argument('htfile')
    parser.add_argument('input')
    parser.add_argument('output')

    args = parser.parse_args()

    htfile = args.htfile
    input_filename = args.input
    output_filename = args.output

    print 'loading counting hash from', htfile
    ht = khmer.load_counting_hash(htfile)
    K = ht.ksize()

    print 'writing to', output_filename
    output = open(output_filename, 'w')
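    # get_median_count() returns (median, average, stddev) of the k-mer
    # counts in a sequence; one line is written per record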

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        if K <= len(seq):
            a, b, c = ht.get_median_count(seq)
            print >>output, record.name, a, b, c, len(seq)
Example #19
def main():
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print('hashtable from', args.input_counting_table_filename,
          file=sys.stderr)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
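    # 'tracking' is a presence/absence table sized like the counting table;
    # it ensures each unique k-mer is written only once in the loop below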
    tracking = khmer._Hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            seq = record.sequence.replace('N', 'A')
            for i in range(len(seq) - kmer_size + 1):
                kmer = seq[i:i+kmer_size]
                if not tracking.get(kmer):
                    tracking.count(kmer)
                    writer.writerow([kmer, str(counting_hash.get(kmer))])

    print('Total number of unique k-mers: {0}'.format(
        counting_hash.n_unique_kmers()), file=sys.stderr)
Example #20
def main():
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print('hashtable from',
          args.input_counting_table_filename,
          file=sys.stderr)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._Hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            seq = record.sequence.replace('N', 'A')
            for i in range(len(seq) - kmer_size + 1):
                kmer = seq[i:i + kmer_size]
                if not tracking.get(kmer):
                    tracking.count(kmer)
                    writer.writerow([kmer, str(counting_hash.get(kmer))])

    print('Total number of unique k-mers: {0}'.format(
        counting_hash.n_unique_kmers()),
          file=sys.stderr)
Example #21
def main():
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")

    parser.add_argument('hashname')
    parser.add_argument('seqfile')
    parser.add_argument('histout')

    args = parser.parse_args()
    hashfile = args.hashname
    seqfile = args.seqfile
    histout = args.histout

    outfp = open(histout, 'w')

    print 'hashtable from', hashfile
    ht = khmer.load_counting_hash(hashfile)

    hist = {}

    for n, record in enumerate(screed.open(seqfile)):
        if n > 0 and n % 100000 == 0:
            print '...', n

        seq = record.sequence.replace('N', 'A')
        med, _, _ = ht.get_median_count(seq)

        hist[med] = hist.get(med, 0) + 1

    maxk = max(hist.keys())

    for i in range(maxk + 1):
        outfp.write('%d %d\n' % (i, hist.get(i, 0)))
    outfp.close()
Example #22
def main():
    parser = argparse.ArgumentParser(description="Reads coverage increase")
    
    parser.add_argument('hashname')
    parser.add_argument('output')
    parser.add_argument('input_filename')

    args = parser.parse_args()
    hashfile = args.hashname
    histout = args.output
    filename = args.input_filename
    print filename
    outfp = open(histout, 'w')

    print 'hashtable from', hashfile
    ht = khmer.load_counting_hash(hashfile)
    count = 0
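    # every 100000 reads, report what fraction so far have a nonzero median
    # k-mer count, i.e. are already represented in the counting table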
    for n, record in enumerate(screed.open(filename)):
        if n > 0 and n % 100000 == 0:  # every 100000 reads
            print '...', n
            outfp.write('%d %d %f\n' % (n, count, float(count)/n))

        seq = record.sequence.replace('N', 'A')
        med, _, _ = ht.get_median_count(seq)
        if med > 0:
            count = count + 1

    outfp.close()
Example #23
def main():

    hashfile = sys.argv[1]
    filename = sys.argv[2]
    figure = sys.argv[3]

    ht = khmer.load_counting_hash(hashfile)

    outabund = open(os.path.basename(filename) + '.counts', 'w')

    counts = []
    d = {}
    for sequence in open(filename):
        sequence = sequence.strip()

        count = ht.get(sequence)
        counts.append(count)
        d[count] = d.get(count, 0) + 1

        if count > 1000:
            print >>outabund, sequence, count

    outfp = open(figure + '.countshist', 'w')
    sofar = 0
    sofar_cumu = 0
    for k in sorted(d.keys()):
        sofar += d[k]
        sofar_cumu += k * d[k]
        print >>outfp, k, d[k], sofar, sofar_cumu

    hist(counts, normed=True, cumulative=True, bins=100, range=(1, 1000))
    savefig(figure)
Example #24
def main():
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    infiles = [
        args.input_counting_table_filename, args.input_sequence_filename
    ]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print('hashtable from', args.input_counting_table_filename)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size)
    print('HT sizes:', hashsizes)
    print('outputting to', args.output_histogram_filename)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename,
                  file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' % args.output_histogram_filename)

    print('preparing hist...')
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print(
            "ERROR: abundance distribution is uniformly zero; "
            "nothing to report.",
            file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)
    hash_fp = open(args.output_histogram_filename, 'w')

    sofar = 0
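    # each output row: abundance, count, cumulative count, cumulative fraction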
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print(_, i, sofar, round(frac, 3), file=hash_fp)

        if sofar == total:
            break
Example #25
def main():
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")

    parser.add_argument('hashname')
    parser.add_argument('seqfile')
    #parser.add_argument('histout')

    args = parser.parse_args()
    hashfile = args.hashname
    seqfile = args.seqfile
    #histout = args.histout

    fp = open(seqfile.split('.fa')[0] + '.cov.fa', 'w')

    print 'hashtable from', hashfile
    ht = khmer.load_counting_hash(hashfile)

    hist = {}

    for n, record in enumerate(screed.open(seqfile)):
        if n > 0 and n % 100000 == 0:
            print '...', n

        seq = record.sequence.replace('N', 'A')
        med, _, _ = ht.get_median_count(seq)

        print >> fp, '>%s_[cov=%f]' % (record.name, med)
        print >> fp, '%s' % record.sequence
Example #26
def test_abund_dist_gz_bigcount():
    infile = utils.get_temp_filename('test.fa')
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    outfile = utils.get_temp_filename('test_ct.gz')
    script = scriptpath('load-into-counting.py')
    htfile = utils.get_temp_filename('test_ct')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = open(htfile, 'rb').read()
    f_out = gzip.open(outfile, 'wb')  # compress the created bigcount table
    f_out.write(data)
    f_out.close()
    # load the compressed bigcount table
    counting_hash = khmer.load_counting_hash(outfile)
    hashsizes = counting_hash.hashsizes()
    kmer_size = counting_hash.ksize()
    tracking = khmer._Hashbits(kmer_size, hashsizes)
    abundances = counting_hash.abundance_distribution(infile, tracking)
    # calculate abundance distribution for compressed bigcount table
    flag = False
    # check whether any abundance is > 255;
    # if so, the gzipped bigcount table was loaded correctly
    for _, i in enumerate(abundances):
        print _, i
        if _ > 255 and i > 0:
            flag = True
            break
    assert flag
Example #27
def main():
    parser = build_common_args()
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')

    args = parse_args(parser)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    input_name_list = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
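    # digital normalization: keep a read only while its median k-mer count
    # is still below the desired coverage, consuming its k-mers when kept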
    for input_filename in input_name_list:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace('N', 'A')

            med, _, _ = ht.get_median_count(seq)

            if med < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of', \
            total, 'or', int(100. - discarded / float(total) * 100.), '%'

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(os.path.basename(args.savehash))
Example #28
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='coverage',
                        default=DEFAULT_COVERAGE,
                        type=int,
                        help="Diginorm coverage.")
    parser.add_argument('--max-error-region',
                        '-M',
                        dest='max_error_region',
                        default=DEFAULT_MAX_ERROR_REGION,
                        type=int,
                        help="Max length of error region allowed")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
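    # Each read is aligned to the graph in the counting table; when an
    # alignment is found, the read is replaced by the gap-free graph
    # sequence, a simple form of graph-based error correction.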
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record['name']
        seq = record['sequence']

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.corr'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #29
def main():
    parser = build_counting_multifile_args()
    parser.add_argument(
        "--cutoff", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int, help="Diginorm coverage."
    )
    parser.add_argument(
        "--max-error-region",
        "-M",
        dest="max_error_region",
        default=DEFAULT_MAX_ERROR_REGION,
        type=int,
        help="Max length of error region allowed",
    )
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record["name"]
        seq = record["sequence"]

        seq = seq.replace("N", "A")

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace("-", "")
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".corr"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Example #30
def main():
    parser = build_counting_multifile_args()
    parser.add_argument(
        "--cutoff", "-C", dest="cutoff", default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance."
    )

    parser.add_argument("-V", "--variable-coverage", action="store_true", dest="variable_coverage", default=False)
    parser.add_argument(
        "--normalize-to",
        "-Z",
        type=int,
        dest="normalize_to",
        help="base variable-coverage cutoff on this median k-mer abundance",
        default=DEFAULT_NORMALIZE_LIMIT,
    )

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = ht.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".abundfilt"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Example #31
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('readsfile')
    parser.add_argument('samposfile')
    parser.add_argument('khfile')
    parser.add_argument('-V', '--variable', default=False, action='store_true')

    args = parser.parse_args()

    print >> sys.stderr, 'loading posdict'
    ignore_set = set()
    posdict = dict(read_pos_file(args.samposfile, ignore_set))

    print >> sys.stderr, 'loading kh'
    kh = khmer.load_counting_hash(args.khfile)
    K = kh.ksize()

    count = 0
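    # for each disagreement, print the read plus three marker rows:
    #   X = spectral error positions, Z = positions from the SAM pos file,
    #   * = k-mers whose count falls below CUTOFF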
    for record in screed.open(args.readsfile):
        if record.name in posdict:
            posns2 = posdict[record.name]

            seq = record.sequence.replace('N', 'A')
            posns1 = kh.find_spectral_error_positions(seq, CUTOFF)
            posns1 = add_n_posns(posns1, record.sequence)

            if posns1 != posns2:
                count += 1

                print record.name, posns1, posns2

                sys.stdout.write(record.sequence)
                sys.stdout.write('\n')
                for i in range(len(seq)):
                    if i in posns1:
                        sys.stdout.write('X')
                    else:
                        sys.stdout.write(' ')
                sys.stdout.write('\n')

                for i in range(len(seq)):
                    if i in posns2:
                        sys.stdout.write('Z')
                    else:
                        sys.stdout.write(' ')
                sys.stdout.write('\n')

                for i in range(len(seq) - K + 1):
                    if kh.get(seq[i:i + K]) < CUTOFF:
                        sys.stdout.write('*')
                    else:
                        sys.stdout.write(' ')
                sys.stdout.write('\n')
                print ''

        if count > 1000:
            break
Example #32
def main():
    info('abundance-dist.py', ['counting'])
    args = get_parser().parse_args()
    infiles = [args.input_counting_table_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_file_status(infile)

    check_space(infiles)

    print('hashtable from', args.input_counting_table_filename)
    counting_hash = khmer.load_counting_hash(
        args.input_counting_table_filename)

    kmer_size = counting_hash.ksize()
    hashsizes = counting_hash.hashsizes()
    tracking = khmer._new_hashbits(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print('K:', kmer_size)
    print('HT sizes:', hashsizes)
    print('outputting to', args.output_histogram_filename)

    if os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            print('ERROR: %s exists; not squashing.' %
                  args.output_histogram_filename,
                  file=sys.stderr)
            sys.exit(1)

        print('** squashing existing file %s' % args.output_histogram_filename)

    print('preparing hist...')
    abundances = counting_hash.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print("\tPlease verify that the input files are valid.",
              file=sys.stderr)
        sys.exit(1)
    hash_fp = open(args.output_histogram_filename, 'w')

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print(_, i, sofar, round(frac, 3), file=hash_fp)

        if sofar == total:
            break
Example #33
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('index')
    parser.add_argument('reads')
    args = parser.parse_args()

    print >>sys.stderr, "Loading graph & labels"
    cg = khmer.load_counting_hash(args.index + '.graph')
    lh = khmer._LabelHash(cg)
    lh.load_labels_and_tags(args.index + '.labels')
    fp = open(args.index + '.list', 'rb')
    names = load(fp)
    fp.close()

    print >>sys.stderr, 'loaded %d references' % (len(names),)
    aligner = khmer.ReadAligner(cg, 1, 1.0)

    counts = {}
    for k in names:
        counts[k] = 0

    # run through all the queries, align, and use alignments to look up
    # the label.
    for n, record in enumerate(screed.open(args.reads)):
        if n % 1000 == 0:
            print >>sys.stderr, '...', n

        # build alignments against cg
        seq = record.sequence.replace('N', 'A')
        _, ga, ra, truncated = aligner.align(seq)

        if len(ga) < 0.8 * len(seq):
            continue

        # now grab the tags associated with the alignment
        ga = ga.replace('-', '')
        tags = lh.sweep_tag_neighborhood(ga)

        if not tags:
            continue

        intersect = set(lh.get_tag_labels(tags[0]))
        for tag in tags[1:]:
            intersect.intersection_update(lh.get_tag_labels(tag))

        if not intersect:                  # ignore confused reads
            continue

        # retrieve the labels associated with the tags
        matches = list(set([ names[i] for i in intersect ]))

        hit = random.choice(matches)
        counts[hit] += 1

    for k, v in counts.iteritems():
        if v:
            print k, v
Example #34
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('readsfile')
    parser.add_argument('samposfile')
    parser.add_argument('khfile')
    parser.add_argument('-V', '--variable', default=False, action='store_true')
    
    args = parser.parse_args()

    print >>sys.stderr, 'loading posdict'
    ignore_set = set()
    posdict = dict(read_pos_file(args.samposfile, ignore_set))

    print >>sys.stderr, 'loading kh'
    kh = khmer.load_counting_hash(args.khfile)
    K = kh.ksize()

    count = 0
    for record in screed.open(args.readsfile):
        if record.name in posdict:
            posns2 = posdict[record.name]
            
            seq = record.sequence.replace('N', 'A')
            posns1 = kh.find_spectral_error_positions(seq, CUTOFF)
            posns1 = add_n_posns(posns1, record.sequence)

            if posns1 != posns2:
                count += 1
                
                print record.name, posns1, posns2

                sys.stdout.write(record.sequence)
                sys.stdout.write('\n')
                for i in range(len(seq)):
                    if i in posns1:
                        sys.stdout.write('X')
                    else:
                        sys.stdout.write(' ')
                sys.stdout.write('\n')
                    
                for i in range(len(seq)):
                    if i in posns2:
                        sys.stdout.write('Z')
                    else:
                        sys.stdout.write(' ')
                sys.stdout.write('\n')
                    
                for i in range(len(seq) - K + 1):
                    if kh.get(seq[i:i+K]) < CUTOFF:
                        sys.stdout.write('*')
                    else:
                        sys.stdout.write(' ')
                sys.stdout.write('\n')
                print ''
                
        if count > 1000:
            break
Example #35
def main():
    args = get_parser().parse_args()

    # reads counting table
    ct_reads = khmer.load_counting_hash(args.ct_reads)

    # transcripts counting table
    ct_exon = khmer.load_counting_hash(args.ct_exon)

    # transcripts themselves
    transcripts = args.transcripts

    K = ct_reads.ksize()
    assert ct_exon.ksize() == K

    # build a read aligner against, well, the reads:
    aligner = khmer.ReadAligner(ct_reads, 1, 1.0)

    # pick up a list of sequences to pay attention to
    searchlist = set([ x.strip() for x in open('seq-profiles.list') ])
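    # kmers() (used below) is assumed to be a helper defined elsewhere that
    # yields successive K-length substrings of a sequence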

    # run through the transcripts.
    for record in screed.open(transcripts):
        if record.name.split(' ')[0] not in searchlist:
            continue
        print 'found!', record.name.split(' ')[0]
        
        counts = []                     # not norm by exon count
        counts2 = []                    # norm by exon count
        
        seq = record.sequence.replace('N', 'A')

        for kmer in kmers(seq, K):
            exon_count = ct_exon.get(kmer)
            if exon_count:
                count = ct_reads.get(kmer)
                
                counts.append(count)
                counts2.append(count / float(exon_count))

        filename = record.name.split(' ')[0] + '.kprofile'
        fp = open(filename, 'w')
        for n, (c1, c2) in enumerate(zip(counts, counts2)):
            print >>fp, n, c1, c2
Example #36
def main():
    parser = argparse.ArgumentParser(description="Get reads coverage matrix")

    parser.add_argument('hashname1')
    parser.add_argument('hashname2')
    parser.add_argument('hashname3')
    parser.add_argument('file1')
    parser.add_argument('file2')
    parser.add_argument('file3')
    parser.add_argument('output')

    args = parser.parse_args()
    hashname1 = args.hashname1
    hashname2 = args.hashname2
    hashname3 = args.hashname3
    output = args.output
    file1 = args.file1
    file2 = args.file2
    file3 = args.file3
    outfp = open(output, 'w')

    print 'hashtable from', hashname1
    ht1 = khmer.load_counting_hash(hashname1)
    print 'hashtable from', hashname2
    ht2 = khmer.load_counting_hash(hashname2)
    print 'hashtable from', hashname3
    ht3 = khmer.load_counting_hash(hashname3)
    matrix1 = {}
    matrix2 = {}
    matrix3 = {}
    #    set_x = set()
    #    set_y = set()
    for file_n in [file1, file2, file3]:
        print 'reading reads file ', file_n
        for n, record in enumerate(screed.open(file_n)):
            if n > 0 and n % 100000 == 0:  # every 100000 reads
                print '...', n, file_n
            seq = record.sequence.replace('N', 'A')
            med1, _, _ = ht1.get_median_count(seq)
            med2, _, _ = ht2.get_median_count(seq)
            med3, _, _ = ht3.get_median_count(seq)
            to_print = str(med1) + ' ' + str(med2) + ' ' + str(med3) + '\n'
            outfp.write(to_print)
    outfp.close()
Example #37
def main():
    parser = argparse.ArgumentParser(description="Get reads coverage matrix")
    
    parser.add_argument('hashname1')
    parser.add_argument('hashname2')
    parser.add_argument('hashname3')
    parser.add_argument('file1')
    parser.add_argument('file2')
    parser.add_argument('file3')
    parser.add_argument('output')

    args = parser.parse_args()
    hashname1 = args.hashname1
    hashname2 = args.hashname2
    hashname3 = args.hashname3
    output = args.output
    file1 = args.file1
    file2 = args.file2
    file3 = args.file3
    outfp = open(output, 'w')

    print 'hashtable from', hashname1
    ht1 = khmer.load_counting_hash(hashname1)
    print 'hashtable from', hashname2
    ht2 = khmer.load_counting_hash(hashname2)
    print 'hashtable from', hashname3
    ht3 = khmer.load_counting_hash(hashname3)
    matrix1 = {}
    matrix2 = {}
    matrix3 = {}
#    set_x = set()
#    set_y = set()
    for file_n in [file1, file2, file3]:
        print 'reading reads file', file_n
        for n, record in enumerate(screed.open(file_n)):
            if n > 0 and n % 100000 == 0:  # every 100000 reads
                print '...', n, file_n
            seq = record.sequence.replace('N', 'A')
            med1, _, _ = ht1.get_median_count(seq)
            med2, _, _ = ht2.get_median_count(seq)
            med3, _, _ = ht3.get_median_count(seq)
            to_print = str(med1) + ' ' + str(med2) + ' ' + str(med3) + '\n'
            outfp.write(to_print)
    outfp.close()
Example #38
def main():
    parser = argparse.ArgumentParser(description="Output k-mer abundance distribution.")
    
    parser.add_argument('hashname')
    parser.add_argument('datafile')
    parser.add_argument('histout')

    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')

    args = parser.parse_args()
    hashfile = args.hashname
    datafile = args.datafile
    histout = args.histout

    print 'hashtable from', hashfile
    ht = khmer.load_counting_hash(hashfile)

    K = ht.ksize()
    sizes = ht.hashsizes()
    tracking = khmer._new_hashbits(K, sizes)

    print 'K:', K
    print 'HT sizes:', sizes
    print 'outputting to', histout

    if os.path.exists(histout):
        if not args.squash_output:
            print >>sys.stderr, 'ERROR: %s exists; not squashing.' % histout
            sys.exit(-1)
        
        print '** squashing existing file %s' % histout

    print 'preparing hist...'
    z = ht.abundance_distribution(datafile, tracking)
    total = sum(z)
        
    fp = open(histout, 'w')

    sofar = 0
    for n, i in enumerate(z):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >>fp, n, i, sofar, round(frac, 3)

        if sofar == total:
            break
Example #39
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    parser.add_argument('--trusted', type=int, default=5)
    parser.add_argument('--variants-out', type=str, default='variants.txt',
                        dest='variants_out')
    args = parser.parse_args()

    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.ReadAligner(ct, args.trusted, 1.0)
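    # align_long() and AlignmentIndex are assumed to be helpers defined
    # elsewhere in the original script or a companion module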

    fp = open(args.variants_out, 'w')   # open once so records don't clobber it
    i = 0
    for record in screed.open(args.ref):
        i += 1
        if i > 50:
            break
        seq = record.sequence
        seq = seq.replace('N', 'A')

        try:
            score, alignment = align_long(ct, aligner, seq)
        except Exception:
            traceback.print_exc()
            continue

        g = alignment.g
        r = alignment.r

        m, n = alignment.compare()
        print record.name, m, n, n - m, "%.3f%%" % (float(m) / n * 100)
        for start in range(0, len(alignment), 60):
            print start
            print alignment[start:start+60]

        gidx = AlignmentIndex(alignment)

        for gi, a, b in alignment.variants():
            kmer = ''
            pos = gi
            while len(kmer) < ct.ksize() and pos < len(alignment.g):
                ch = alignment.g[pos]
                pos += 1
                if ch in '=-':
                    continue
                kmer += ch

            if alignment.covs[gi]:
                print >>fp, gi, a, b, gidx.get_ri(gi), kmer, alignment.covs[gi]

        if 0:
            print len(seq), alignment.refseqlen()
            gidx._sanityCheck(seq)
Example #40
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage',
                        '-C',
                        dest='coverage',
                        default=DEFAULT_COVERAGE,
                        type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)

    print('loading hashtable')
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    xxxfp = None

    print("K:", K)

    # the filtering function.
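    # Keep a read with probability ~C/median (randomized diginorm) and drop
    # reads whose coverage spread (stddev/mean) exceeds 100%.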
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)
        pct = dev / avg * 100

        xxxfp.write('%s %s %s %s %s\n' % (med, avg, dev, pct, name))

        if random.randint(1, med) > args.coverage or pct > 100:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        xxxfp = open(os.path.basename(infile) + '.medpctfilt.stats', 'w')
        outfile = os.path.basename(infile) + '.medpctfilt'
        outfp = open(outfile, 'w')

        for n, record in enumerate(screed.open(infile)):
            if n % 100000 == 0:
                print('...', n)

            name, seq = process_fn(record)
            if name and seq:
                print('>%s\n%s' % (name, seq), file=outfp)

        print('output in', outfile)
Example #41
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--min-coverage', type=int, default=None)
    parser.add_argument('-M', '--max-coverage', type=int, default=None)
    parser.add_argument('input_counting_table')
    parser.add_argument('input_readfile')
    parser.add_argument('output_readfile')
    args = parser.parse_args()

    print >>sys.stderr, 'min_coverage: %s' % args.min_coverage
    print >>sys.stderr, 'max_coverage: %s' % args.max_coverage

    if not (args.min_coverage or args.max_coverage):
        print >>sys.stderr, "neither min nor max coverage specified!? exiting!"
        sys.exit(1)

    if args.min_coverage and args.max_coverage and \
       args.max_coverage < args.min_coverage:
        print >>sys.stderr, "min_coverage > max_coverage!? exiting!"
        sys.exit(1)

    htable = khmer.load_counting_hash(args.input_counting_table)
    output_file = args.output_readfile
    output_fp = open(output_file, 'w')

    n_kept = 0
    n = 0
    for n, record in enumerate(screed.open(args.input_readfile)):
        if n % 100000 == 0:
            print >>sys.stderr, '...', n, n_kept

        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        try:
            med, _, _ = htable.get_median_count(seq)
        except ValueError:
            continue

        keep = True
        if args.min_coverage and med < args.min_coverage:
            keep = False

        if args.max_coverage and med > args.max_coverage:
            keep = False

        if keep:
            n_kept += 1

            output_fp.write(output_single(record))

    print >>sys.stderr, 'consumed %d reads; kept %d' % (n, n_kept)
Example #42
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int,
                        help="Diginorm coverage.")
    parser.add_argument('--max-error-region', '-M', dest='max_error_region',
                        default=DEFAULT_MAX_ERROR_REGION, type=int,
                        help="Max length of error region allowed")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record['name']
        seq = record['sequence']

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.corr'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #43
def main():
    parser = argparse.ArgumentParser(description="Get reads coverage matrix")
    
    parser.add_argument('hashname1')
    parser.add_argument('hashname2')
    parser.add_argument('file1')
    parser.add_argument('file2')
    parser.add_argument('output')

    args = parser.parse_args()
    hashname1 = args.hashname1
    hashname2 = args.hashname2
    output = args.output
    file1 = args.file1
    file2 = args.file2
    outfp = open(output, 'w')

    print 'hashtable from', hashname1
    ht1 = khmer.load_counting_hash(hashname1)
    ht2 = khmer.load_counting_hash(hashname2)

    for n, record in enumerate(screed.open(file1)):
        if n > 0 and n % 100000 == 0:  # every 100000 reads
            print '...', n, file1
        seq = record.sequence.replace('N', 'A')
        med1, _, _ = ht1.get_median_count(seq)
        med2, _, _ = ht2.get_median_count(seq)
        to_print = record.name + ' ' + str(med1) + ' ' + str(med2) + '\n'
        outfp.write(to_print)

    for n, record in enumerate(screed.open(file2)):
        if n > 0 and n % 100000 == 0:  # every 100000 reads
            print '...', n, file2
        seq = record.sequence.replace('N', 'A')
        med1, _, _ = ht1.get_median_count(seq)
        med2, _, _ = ht2.get_median_count(seq)
        to_print = record.name + ' ' + str(med1) + ' ' + str(med2) + '\n'
        outfp.write(to_print)

    outfp.close()
Example #44
def main():
    parser = argparse.ArgumentParser(description="Get reads coverage matrix")

    parser.add_argument('hashname1')
    parser.add_argument('hashname2')
    parser.add_argument('file1')
    parser.add_argument('file2')
    parser.add_argument('output')

    args = parser.parse_args()
    hashname1 = args.hashname1
    hashname2 = args.hashname2
    output = args.output
    file1 = args.file1
    file2 = args.file2
    outfp = open(output, 'w')

    print 'hashtable from', hashname1
    ht1 = khmer.load_counting_hash(hashname1)
    ht2 = khmer.load_counting_hash(hashname2)

    for n, record in enumerate(screed.open(file1)):
        if n > 0 and n % 100000 == 0:  # every 100000 reads
            print '...', n, file1
        seq = record.sequence.replace('N', 'A')
        med1, _, _ = ht1.get_median_count(seq)
        med2, _, _ = ht2.get_median_count(seq)
        to_print = record.name + ' ' + str(med1) + ' ' + str(med2) + '\n'
        outfp.write(to_print)

    for n, record in enumerate(screed.open(file2)):
        if n > 0 and n % 100000 == 0:  # every 100000 reads
            print '...', n, file2
        seq = record.sequence.replace('N', 'A')
        med1, _, _ = ht1.get_median_count(seq)
        med2, _, _ = ht2.get_median_count(seq)
        to_print = record.name + ' ' + str(med1) + ' ' + str(med2) + '\n'
        outfp.write(to_print)

    outfp.close()
Example #45
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    check_input_files(args.input_table, args.force)
    infiles = args.input_filename
    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading counting table:', args.input_table,
          file=sys.stderr)
    htable = khmer.load_counting_hash(args.input_table)
    ksize = htable.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
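
The heart of process_fn is trim_on_abundance(), which returns the position of the first k-mer below the cutoff; reads left shorter than k after trimming are dropped. A stripped-down sketch of the same decision, minus the variable-coverage branch (the function name here is mine, not khmer's):

def filter_read(htable, name, seq, cutoff, ksize):
    # count with 'N' replaced by 'A', but slice the original read,
    # exactly as process_fn above does
    _, trim_at = htable.trim_on_abundance(seq.replace('N', 'A'), cutoff)
    if trim_at >= ksize:
        return name, seq[:trim_at]
    return None, None       # too short after trimming: discard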
Example #46
def main():
    files = sys.argv[2:]

    print('loading ht')
    ht = khmer.load_counting_hash(sys.argv[1])

    for i, infile in enumerate(files):
        print('outputting', infile + '.freq')
        ht.output_fasta_kmer_pos_freq(infile, infile + ".freq")
Example #47
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for filename in infiles:
        check_file_status(filename)

    check_space(infiles)

    print 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print "K:", ksize

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim if coverage is high enough
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('sequences')
    parser.add_argument('-C', '--cutoff', default=3, type=int)
    parser.add_argument('--coverage', default=20, type=int)
    parser.add_argument('-V', '--variable', default=False, action='store_true')
    parser.add_argument('-o',
                        '--outfile',
                        type=argparse.FileType('w'),
                        default=sys.stdout)

    args = parser.parse_args()

    kh = khmer.load_counting_hash(args.table)
    n_skipped_variable = 0
    n_total = 0

    print >> sys.stderr, "K:", kh.ksize()
    print >> sys.stderr, "CUTOFF:", args.cutoff
    if args.variable:
        print >> sys.stderr, "variable coverage flag set;"
        print >> sys.stderr, "NORMALIZE_LIMIT:", args.coverage
    else:
        print >> sys.stderr, "assuming even coverage - no -V"

    for n, record in enumerate(screed.open(args.sequences)):
        if n % 100000 == 0:
            print >> sys.stderr, '...', n
        seq = record.sequence.replace('N', 'A')

        n_total += 1

        varskip = False
        if args.variable:
            med, _, _ = kh.get_median_count(seq)
            if med < args.coverage:
                varskip = True
                n_skipped_variable += 1

        if varskip:
            print >> args.outfile, record.name, 'V'
        else:
            #posns = find_spectral_error_positions(kh, seq, args.cutoff)
            posns = kh.find_spectral_error_positions(seq, args.cutoff)
            posns = add_n_posns(posns, record.sequence)
            print >> args.outfile, record.name, ",".join(map(str, posns))

    if args.variable:
        sys.stderr.write('Skipped %d reads of %d total due to -V\n' %
                         (n_skipped_variable, n_total))
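
A self-contained sketch of the spectral call this script wraps, under the same legacy API (the depth, cutoff, and sequences are made up): find_spectral_error_positions() reports positions where k-mer counts fall below the cutoff, which in a high-coverage read usually marks a sequencing error.

import khmer

sizes = khmer.get_n_primes_above_x(4, 1000000)
kh = khmer.CountingHash(17, sizes)

ref = 'ATGGCAAGCTACGATCCGATTACGGCATCCAGTGAGCTAACGT'
for _ in range(20):                # simulate 20x coverage
    kh.consume(ref)

read = ref[:20] + 'G' + ref[21:]   # one substitution at position 20
posns = kh.find_spectral_error_positions(read, 3)
print(posns)                       # low-count region around position 20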
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('ref')
    parser.add_argument('--trusted', type=int, default=5)
    parser.add_argument('--variants-out',
                        type=str,
                        default='variants.txt',
                        dest='variants_out')
    args = parser.parse_args()

    ct = khmer.load_counting_hash(args.table)
    aligner = khmer.ReadAligner(ct, args.trusted, 1.0)

    # open the variants file once, up front: opening it with 'w' inside
    # the per-record loop truncated it for every reference sequence
    fp = open(args.variants_out, 'w')

    for record in screed.open(args.ref):
        seq = record.sequence
        seq = seq.replace('N', 'A')

        score, alignment = align_long(ct, aligner, seq)

        m, n = alignment.compare()
        print record.name, m, n, n - m, "%.3f%%" % (float(m) / n * 100)
        for start in range(0, len(alignment), 60):
            print start
            print alignment[start:start + 60]

        gidx = AlignmentIndex(alignment)

        for gi, a, b in alignment.variants():
            kmer = ''
            pos = gi
            while len(kmer) < ct.ksize() and pos < len(alignment.g):
                ch = alignment.g[pos]
                pos += 1
                if ch in '=-':
                    continue
                kmer += ch

            if alignment.covs[gi]:
                print >> fp, gi, a, b, gidx.get_ri(
                    gi), kmer, alignment.covs[gi]

        if 0:
            print len(seq), alignment.refseqlen()
            gidx._sanityCheck(seq)
def main():
    hashfile = sys.argv[1]
    filename = sys.argv[2]
    outfile = os.path.basename(filename)

    print 'loading kh file', hashfile
    ht = khmer.load_counting_hash(hashfile)

    x = ht.fasta_count_kmers_by_position(filename, 100, 1)
    write_dist(x, open(outfile + '.pos.abund=1', 'w'))
    print 'wrote', outfile + '.pos.abund=1'

    y = ht.fasta_count_kmers_by_position(filename, 100, 255)
    write_dist(y, open(outfile + '.pos.abund=255', 'w'))
    print 'wrote', outfile + '.pos.abund=255'
def main():
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")

    parser.add_argument('hashname')
    parser.add_argument('seqfile')
    parser.add_argument('histout')

    args = parser.parse_args()
    hashfile = args.hashname
    seqfile = args.seqfile
    histout = args.histout

    outfp = open(histout, 'w')

    print('hashtable from', hashfile)
    ht = khmer.load_counting_hash(hashfile)

    hist = {}

    for i in range(65536):
        hist[i] = 0

    for n, record in enumerate(screed.open(seqfile)):
        if n > 0 and n % 100000 == 0:
            print('...', n)

        seq = record.sequence.replace('N', 'A')

        try:
            med, _, _ = ht.get_median_count(seq)
        except ValueError:
            continue

        hist[med] = hist[med] + 1

    histlist = list(hist.items())
    histlist.sort()

    maxk = max(hist.keys())
    sumk = sum(hist.values())

    sofar = 0
    for n, m in histlist:
        sofar += m
        fraction = float(sofar) / sumk
        outfp.write('%d %d %d %.3f\n' % (n, m, sofar, fraction))
    outfp.close()
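
The write-out loop is just a cumulative sum over the sorted histogram. A worked miniature with made-up counts shows the four output columns (median coverage, reads at that coverage, running total, running fraction):

hist = {1: 5, 2: 3, 5: 2}      # median coverage -> number of reads
total = sum(hist.values())     # 10

sofar = 0
for cov, count in sorted(hist.items()):
    sofar += count
    print('%d %d %d %.3f' % (cov, count, sofar, float(sofar) / total))
# 1 5 5 0.500
# 2 3 8 0.800
# 5 2 10 1.000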
Example #52
    def do_test(ctfile):
        inpath = utils.get_test_data('random-20-a.fa')
        savepath = utils.get_temp_filename(ctfile)

        sizes = khmer.get_n_primes_above_x(1, 2**31)

        orig = khmer.CountingHash(12, sizes)
        orig.consume_fasta(inpath)
        orig.save(savepath)

        loaded = khmer.load_counting_hash(savepath)

        orig_count = orig.n_occupied()
        loaded_count = loaded.n_occupied()
        assert orig_count == 3966, orig_count
        assert loaded_count == orig_count, loaded_count
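
The same save/load round trip outside the test harness, assuming the legacy API shown above; the path and sequence are hypothetical:

import khmer

sizes = khmer.get_n_primes_above_x(1, 2**20)
ht = khmer.CountingHash(12, sizes)
ht.consume('ATGGCAAGCTACGATCCGATTACG')
ht.save('example.ht')                  # hypothetical path

loaded = khmer.load_counting_hash('example.ht')
assert loaded.ksize() == 12
assert loaded.n_occupied() == ht.n_occupied()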
Example #53
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('-o', '--outputpath', dest='outputpath', default='.')
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames
    outpath = args.outputpath

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.join(outpath, os.path.basename(infile) + '.abundfilt')
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
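
None of these snippets show their imports. The set they collectively assume looks roughly like the following -- module paths as in the khmer 1.x releases these scripts target, stated as an assumption rather than read from the source:

import os
import sys
import argparse

import khmer
import screed
# ThreadedSequenceProcessor and verbose_loader are khmer's threaded
# filtering helpers; in 1.x they lived in khmer.thread_utils
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader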