def run_no_curve(K, HT_SIZE, N_HT, filename, filename2, file_result):
    file_result_object = open(file_result, 'w')

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht.get(kmer)):
                n_unique += 1
            ht.count(kmer)
    print filename, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht.n_occupied()
    printout = filename + ":" + '\n'
    printout = printout + '# of unique kmers:' + str(n_unique) + '\n'
    printout = printout + '# of occupied bin:' + str(ht.n_occupied()) + '\n'

    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    for n, record in enumerate(fasta_iter(open(filename2))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht2.get(kmer)):
                n_unique += 1
                if (ht.get(kmer)):
                    n_overlap += 1
            ht2.count(kmer)

    print filename2, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht2.n_occupied()

    print n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2

    printout = printout + filename2 + ":" + '\n'
    printout = printout + '# of unique kmers:' + str(n_unique) + '\n'
    printout = printout + '# of occupied bin:' + str(ht2.n_occupied()) + '\n'
    printout = printout + '# of overlap unique kmers:' + str(n_overlap) + '\n'

    file_result_object.write(printout)
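The nested loop above (slice out every K-length window, test it, then count it) is the k-mer extraction pattern repeated throughout these examples. A minimal, dependency-free sketch of just that step, with an illustrative helper name (iter_kmers is not part of khmer or screed):

def iter_kmers(sequence, k):
    # Yield every k-length substring of sequence, left to right.
    for i in range(len(sequence) - k + 1):
        yield sequence[i:i + k]

# e.g. list(iter_kmers('ACGTAC', 3)) == ['ACG', 'CGT', 'GTA', 'TAC']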
Example #2
def test_bloom_python_1():
    # test python code to count unique kmers using bloom filter
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht2 = khmer.Hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht2.get(kmer)):
                n_unique += 1
            ht2.count(kmer)

    assert n_unique == 3960
    assert ht2.n_occupied() == 3885, ht2.n_occupied()

    # this number equals n_unique
    assert ht2.n_unique_kmers() == 3960, ht2.n_unique_kmers()
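As a cross-check of the Bloom-filter-based loop above, the same unique-k-mer count can be computed exactly with a plain Python set; a short sketch assuming the same screed-style records (the helper name exact_unique_kmers is illustrative):

def exact_unique_kmers(filename, k):
    # Exact number of distinct k-mers, with no probabilistic data structure involved.
    seen = set()
    for record in fasta_iter(open(filename)):
        sequence = record['sequence']
        for i in range(len(sequence) - k + 1):
            seen.add(sequence[i:i + k])
    return len(seen)

# For this test file the result should agree with the asserted 3960 (unless the Bloom
# filter above happened to return false positives); in general the set is the ground
# truth that the Bloom-filter loop only approximates.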
Example #3
def test_hll_add_python():
    # test python code to count unique kmers using HyperLogLog.
    # use the lower level add() method, which accepts anything,
    # and compare to an exact count using collections.Counter

    filename = utils.get_test_data('random-20-a.fa')
    hllcpp = khmer.HLLCounter(ERR_RATE, K)
    counter = set()

    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            rc = "".join(TRANSLATE[c] for c in kmer[::-1])

            hllcpp.add(kmer)

            if rc in counter:
                kmer = rc
            counter.update([kmer])

    n_unique = len(counter)

    assert n_unique == N_UNIQUE
    assert abs(1 - float(hllcpp.estimate_cardinality()) / N_UNIQUE) < ERR_RATE
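TRANSLATE, ERR_RATE, K and N_UNIQUE are module-level constants that this snippet does not show. A minimal stand-in for the reverse-complement part, with an assumed base-complement table (the original TRANSLATE may also cover lowercase or ambiguity codes):

# Assumed complement table for the TRANSLATE lookup used above.
TRANSLATE = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}

def revcomp(kmer):
    # Same expression as in the test: reverse the k-mer, then complement each base.
    return "".join(TRANSLATE[c] for c in kmer[::-1])

# e.g. revcomp('ACGTT') == 'AACGT'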
Example #4
def test_bloom_python_1():
    # test python code to count unique kmers using bloom filter
    filename = utils.get_test_data("random-20-a.fa")

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht2 = khmer.Hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record["sequence"]
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n : n + K]
            if not ht2.get(kmer):
                n_unique += 1
            ht2.count(kmer)

    assert n_unique == 3960
    assert ht2.n_occupied() == 3885, ht2.n_occupied()

    # this number equals n_unique
    assert ht2.n_unique_kmers() == 3960, ht2.n_unique_kmers()
Example #5
def test_bloom_python_1():
    # test python code to count unique kmers using bloom filter
    filename = utils.get_test_data('random-20-a.fa')

    ksize = 20  # size of kmer
    htable_size = 100000  # size of hashtableable
    num_htableables = 3  # number of hashtableables

    htableable = khmer.Hashbits(ksize, htable_size, num_htableables)

    n_unique = 0
    for _, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - ksize):
            kmer = sequence[n:n + ksize]
            if not htableable.get(kmer):
                n_unique += 1
            htableable.count(kmer)

    assert n_unique == 3960
    assert htableable.n_occupied() == 3885, htableable.n_occupied()

    # this number equals n_unique
    assert htableable.n_unique_kmers() == 3960, htableable.n_unique_kmers()
Example #6
def verbose_fasta_iter(filename):
    from screed.fasta import fasta_iter
    it = fasta_iter(open(filename))
    for n, record in enumerate(it):
        if n % 10000 == 0:
            print >>sys.stderr, '... filtering', n
        yield record
Example #7
def test_hll_add_python():
    # test python code to count unique kmers using HyperLogLog.
    # use the lower level add() method, which accepts anything,
    # and compare to an exact count using collections.Counter

    filename = utils.get_test_data('random-20-a.fa')
    hllcpp = khmer.HLLCounter(ERR_RATE, K)
    counter = set()

    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            rc = "".join(TRANSLATE[c] for c in kmer[::-1])

            hllcpp.add(kmer)

            if rc in counter:
                kmer = rc
            counter.update([kmer])

    n_unique = len(counter)

    assert n_unique == N_UNIQUE
    assert abs(1 - float(hllcpp.estimate_cardinality()) / N_UNIQUE) < ERR_RATE
Example #8
def main():
    outfp = open(sys.argv[2], 'w')

    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    ht.consume_fasta(sys.argv[1])

    hist = [0] * 200
    histcount = [0] * 200
    for n, record in enumerate(fasta_iter(open(sys.argv[1]))):
        if n % 10000 == 0:
            print '...', n

        seq = record['sequence']
        for pos in range(0, len(seq) - K + 1):
            kmer = seq[pos:pos + K]
            count = ht.kmer_degree(kmer)

            hist[pos] += count
            histcount[pos] += 1

    for i in range(len(hist)):
        total = hist[i]
        count = histcount[i]
        if not count:
            continue

        print >> outfp, i, total, count, total / float(count)
Example #9
def assemble_sequences(f, k, length_cutoff=LENGTH_CUTOFF):
    try:
        seqfile = f
        #dirname = os.path.dirname(os.path.abspath(f))
        dirname = tempfile.mkdtemp()

        assemble_dir = os.path.join(dirname, 'assemble')
        p = subprocess.Popen('velveth %s %d -shortPaired %s.pe -short %s.se' % (assemble_dir, k, seqfile, seqfile), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        p = subprocess.Popen('velvetg %s -read_trkg yes -exp_cov auto -cov_cutoff 0' % (assemble_dir,),
                             shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        x = []
        total = 0
        for r in fasta_iter(open(os.path.join(assemble_dir, 'contigs.fa'))):
            seqlen = len(r['sequence'])
            if seqlen >= length_cutoff:
                x.append(r)
                total += seqlen

        return total, x
    finally:
        pass
        shutil.rmtree(dirname)
Example #10
def assemble_sequences(f, k, length_cutoff=LENGTH_CUTOFF):
    try:
        seqfile = f
        #dirname = os.path.dirname(os.path.abspath(f))
        dirname = tempfile.mkdtemp()

        assemble_dir = os.path.join(dirname, 'assemble')
        p = subprocess.Popen('bash %s %s %d %s' % (SGA_PIPE, seqfile, k, assemble_dir), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print 'bash %s %s %d %s' % (SGA_PIPE, seqfile, k, assemble_dir)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        x = []
        total = 0
        print os.listdir(assemble_dir)
        for r in fasta_iter(open(os.path.join(assemble_dir, '%s.sga.%d-contigs.fa' %(os.path.basename(f), k)))):
            seqlen = len(r['sequence'])
            if seqlen >= length_cutoff:
                x.append(r)
                total += seqlen

        return total, x
    finally:
        pass
        shutil.rmtree(dirname)
Example #11
def test_bloom_python_1():
    # test python code to count unique kmers using bloom filter
    filename = utils.get_test_data('random-20-a.fa')

    ksize = 20  # size of kmer
    htable_size = 100000  # size of hashtableable
    num_htableables = 3  # number of hashtableables

    htableable = khmer.Hashbits(ksize, htable_size, num_htableables)

    n_unique = 0
    for _, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - ksize):
            kmer = sequence[n:n + ksize]
            if not htableable.get(kmer):
                n_unique += 1
            htableable.count(kmer)

    assert n_unique == 3960
    assert htableable.n_occupied() == 3885, htableable.n_occupied()

    # this number equals n_unique
    assert htableable.n_unique_kmers() == 3960, htableable.n_unique_kmers()
Example #12
def run_no_curve(K, HT_SIZE, N_HT, filename, filename2, file_result):
    file_result_object = open(file_result, 'w')

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht.get(kmer)):
                n_unique += 1
            ht.count(kmer)
    print filename, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht.n_occupied()
    printout = filename + ":" + '\n'
    printout = printout + '# of unique kmers:' + str(n_unique) + '\n'
    printout = printout + '# of occupied bin:' + str(ht.n_occupied()) + '\n'

    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    for n, record in enumerate(fasta_iter(open(filename2))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht2.get(kmer)):
                n_unique += 1
                if (ht.get(kmer)):
                    n_overlap += 1
            ht2.count(kmer)

    print filename2, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht2.n_occupied()

    print n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2

    printout = printout + filename2 + ":" + '\n'
    printout = printout + '# of unique kmers:' + str(n_unique) + '\n'
    printout = printout + '# of occupied bin:' + str(ht2.n_occupied()) + '\n'
    printout = printout + '# of overlap unique kmers:' + str(n_overlap) + '\n'

    file_result_object.write(printout)
Example #13
def count_sum_contigs(cutoff, filename):
    total = 0
    for record in fasta_iter(open(filename)):
        seqlen = len(record['sequence'])
        if seqlen >= cutoff:
            total += seqlen

    return total
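A brief usage sketch for count_sum_contigs (the path and cutoff below are illustrative): it returns the total number of bases contained in contigs at or above the length cutoff.

# Total bases in contigs of at least 1000 bp:
total_bp = count_sum_contigs(1000, 'assemble/contigs.fa')
print(total_bp)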
Example #14
def count_sum_contigs(cutoff, filename):
    total = 0
    for record in fasta_iter(open(filename)):
        seqlen = len(record['sequence'])
        if seqlen >= cutoff:
            total += seqlen

    return total
Example #15
def main():
    '''
    Usage: python <thisfile> <infile> length numseq2keep tag <outfile>
    '''
    if len(sys.argv) != 6:
        mes = ('Usage: python {} <infile> length numseq2keep tag <outfile>\n'
               '*** tag can be used to screen OUT seq names\n')
        print >> sys.stderr, mes.format(os.path.basename(sys.argv[0]))
        sys.exit(1)

    infile = sys.argv[1]
    length = int(sys.argv[2])
    num = int(sys.argv[3])
    tag = sys.argv[4]
    outfile = sys.argv[5]

    try:
        if infile == '-':
            fp = sys.stdin
        else:
            fp = open(infile)

        if outfile == '-':
            fw = sys.stdout
        else:
            fw = open(outfile, 'wb')

        for n, record in enumerate(fasta.fasta_iter(fp)):
            if n == num:
                break
            name = record['name']
            seq = record['sequence']

            if len(seq) < length:
                continue

            if tag in name:
                continue

            new_seq = seq[:length]

            fw.write('>{}\n{}\n'.format(name, new_seq)) #fasta output

        try:
            n
        except NameError:
            print >> sys.stderr, '*** No seqs are in seqfile'
            sys.exit(1)

        if n < num:
            mes = '*** Not enough seqs in {} ({} < {}), only {} subsampled'
            print >> sys.stderr, mes.format(os.path.basename(infile),
                                              n, num, n)

    except IOError as err:
        print >> sys.stderr, err
        fw.close()
        sys.exit(1)
Example #16
def test_hll_consume_string():
    # test c++ code to count unique kmers using HyperLogLog,
    # using screed to feed each read to the counter.

    filename = utils.get_test_data('random-20-a.fa')
    hllcpp = khmer.HLLCounter(ERR_RATE, K)
    for n, record in enumerate(fasta_iter(open(filename))):
        hllcpp.consume_string(record['sequence'])

    assert abs(1 - float(hllcpp.estimate_cardinality()) / N_UNIQUE) < ERR_RATE
Example #17
def load_fa_seq_names(filename):
    try:
        fasta_iter
    except NameError:
        raise nose.SkipTest

    fp = open(filename)
    records = list(fasta_iter(fp))
    names = [ r['name'] for r in records ]
    return names
Example #18
def test_hll_consume_string():
    # test c++ code to count unique kmers using HyperLogLog,
    # using screed to feed each read to the counter.

    filename = utils.get_test_data('random-20-a.fa')
    hllcpp = khmer.HLLCounter(ERR_RATE, K)
    for n, record in enumerate(fasta_iter(open(filename))):
        hllcpp.consume_string(record['sequence'])

    assert abs(1 - float(hllcpp.estimate_cardinality()) / N_UNIQUE) < ERR_RATE
Example #19
def main():
    d = dict([(r['name'], r['sequence']) for r in fasta_iter(open(sys.argv[1]))])

    ks = d.keys()
    random.shuffle(ks)

    for k in ks:
        s = d[k]

        print '>%s\n%s' % (k, s)
Example #20
def main():
    filename = sys.argv[1]
    K = int(sys.argv[2])  # size of kmer
    HT_SIZE = int(sys.argv[3])  # size of hashtable
    N_HT = int(sys.argv[4])  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht.get(kmer)):
                n_unique += 1
            ht.count(kmer)
    print(filename, 'has been consumed.')
    print('# of unique kmers:', n_unique)
    print('# of occupied bin:', ht.n_occupied())

    filename2 = sys.argv[5]
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    for n, record in enumerate(fasta_iter(open(filename2))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht2.get(kmer)):
                n_unique += 1
                if (ht.get(kmer)):
                    n_overlap += 1
            ht2.count(kmer)

    print(filename2, 'has been consumed.')
    print('# of unique kmers:', n_unique)
    print('# of occupied bin:', ht2.n_occupied())

    print(n_overlap, 'unique kmers appears in both ', filename, ' and ',
          filename2)
Example #21
def main():
    filename = sys.argv[1]
    K = int(sys.argv[2])  # size of kmer
    HT_SIZE = int(sys.argv[3])  # size of hashtable
    N_HT = int(sys.argv[4])  # number of hashtables

    ht = khmer.Hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record["sequence"]
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n : n + K]
            if not ht.get(kmer):
                n_unique += 1
            ht.count(kmer)
    print(filename, "has been consumed.")
    print("# of unique kmers:", n_unique)
    print("# of occupied bin:", ht.n_occupied())

    filename2 = sys.argv[5]
    ht2 = khmer.Hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    for n, record in enumerate(fasta_iter(open(filename2))):
        sequence = record["sequence"]
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n : n + K]
            if not ht2.get(kmer):
                n_unique += 1
                if ht.get(kmer):
                    n_overlap += 1
            ht2.count(kmer)

    print(filename2, "has been consumed.")
    print("# of unique kmers:", n_unique)
    print("# of occupied bin:", ht2.n_occupied())

    print(n_overlap, "unique kmers appears in both ", filename, " and ", filename2)
Example #22
def main():
    filename = sys.argv[1]
    K = int(sys.argv[2])  # size of kmer
    HT_SIZE = int(sys.argv[3])  # size of hashtable
    N_HT = int(sys.argv[4])  # number of hashtables

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht.get(kmer)):
                n_unique += 1
            ht.count(kmer)
    print filename, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht.n_occupied()

    filename2 = sys.argv[5]
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    for n, record in enumerate(fasta_iter(open(filename2))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht2.get(kmer)):
                n_unique += 1
                if (ht.get(kmer)):
                    n_overlap += 1
            ht2.count(kmer)

    print filename2, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht2.n_occupied()

    print n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2
Example #23
def cutMSA(fp, start, end):
    """
    Cut a region in a Multiple Sequence Alignment based on the start and end
    positions on the template

    Parameters:
    -----------
    fp : file object
        file object of aligned sequence file (.afa)
    start : int
        start position of target region
    end : int
        end position of target region

    Returns:
    --------
    tuple
        a tuple with two items. First item is the target region in the template
        and second item is a dictionary with sequence name as key
        and target region of that sequence as value

    """
    fw = open('%s.%sto%s.cut' % (sys.argv[1], start, end), 'w')
    refName, template, profile = getRef(fp, 1)

    length = len(profile)
    for i in range(length):
        if profile[i] == 0:
            continue
        j = sum(profile[:i + 1])
        if j == int(start):
            start1 = i
        if j == int(end):
            end1 = i
            break

    print >> fw, '>%s\n%s' % (refName, template[start1:(end1 + 1)])

    rows = {}  #ref seq not included
    fp.seek(0, 0)
    for record in fasta.fasta_iter(fp):
        name = record['name']
        if 'ReFeReNcE' in name:
            continue
        seq = record['sequence']
        assert len(seq) == length, 'not afa format'

        subSeq = seq[start1:(end1 + 1)]
        rows[name] = subSeq
        print >> fw, '>%s\n%s' % (name, subSeq)

    return template[start1:(end1 + 1)], rows
Example #24
def process_file(filename, HT_SIZE_array):

    N_HT = 4
    K = 12

    list_average_miscount = []
    list_average_miscount_perc = []
    list_fp_miscount0 = []

    print filename
    for HT_SIZE in HT_SIZE_array:
        print HT_SIZE
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
        ht.consume_fasta(filename)

        ktable = khmer.new_ktable(K)
        for n, record in enumerate(fasta_iter(open(filename))):
            sequence = record['sequence']
            # ktable.consume(sequence)

            seq_len = len(sequence)
            for n in range(0, seq_len + 1 - K):
                kmer = sequence[n:n + K]
                ktable.count(kmer)

        list_miscount = []
        list_miscount_perc = []
        total_kmer = 0  # total number of unique k-mers
        miscount0 = 0

        for i in range(0, ktable.n_entries()):
            n = ktable.get(i)
            if n:
                total_kmer = total_kmer + 1
                kmer2 = ktable.reverse_hash(i)
                # difference between the (possibly inflated) counting-hash count
                # and the exact ktable count for the same k-mer
                miscount = ht.get(kmer2) - ktable.get(kmer2)
                # use float division so the fractional error is not truncated to 0
                miscount_perc = float(miscount) / ktable.get(kmer2)
                list_miscount.append(miscount)
                list_miscount_perc.append(miscount_perc)
                if miscount > 0:
                    miscount0 = miscount0 + 1

        average_miscount = float(sum(list_miscount)) / len(list_miscount)
        list_average_miscount.append(average_miscount)
        average_miscount_perc = float(sum(list_miscount_perc)) / len(list_miscount_perc)
        list_average_miscount_perc.append(average_miscount_perc)

        fp_miscount0 = float(miscount0) / total_kmer
        list_fp_miscount0.append(fp_miscount0)

    to_return = [list_average_miscount, list_fp_miscount0, total_kmer,
                 list_average_miscount_perc]
    return to_return
Example #25
def cutMSA(fp, start, end):
    """
    Cut a region in a Multiple Sequence Alignment based on the start and end
    positions on the template

    Parameters:
    -----------
    fp : file object
        file object of aligned sequence file (.afa)
    start : int
        start position of target region
    end : int
        end position of target region

    Returns:
    --------
    tuple
        a tuple with two items. First item is the target region in the template
        and second item is a dictionary with sequence name as key
        and target region of that sequence as value

    """
    fw = open("%s.%sto%s.cut" % (sys.argv[1], start, end), "w")
    refName, template, profile = getRef(fp, 1)

    length = len(profile)
    for i in range(length):
        if profile[i] == 0:
            continue
        j = sum(profile[: i + 1])
        if j == int(start):
            start1 = i
        if j == int(end):
            end1 = i
            break

    print >> fw, ">%s\n%s" % (refName, template[start1 : (end1 + 1)])

    rows = {}  # ref seq not included
    fp.seek(0, 0)
    for record in fasta.fasta_iter(fp):
        name = record["name"]
        if "ReFeReNcE" in name:
            continue
        seq = record["sequence"]
        assert len(seq) == length, "not afa format"

        subSeq = seq[start1 : (end1 + 1)]
        rows[name] = subSeq
        print >> fw, ">%s\n%s" % (name, subSeq)

    return template[start1 : (end1 + 1)], rows
Example #26
def test_filter_if_present():
    ht = khmer.LabelHash(32, 1e6, 2)

    maskfile = utils.get_test_data('filter-test-A.fa')
    inputfile = utils.get_test_data('filter-test-B.fa')
    outfile = utils.get_temp_filename('filter')

    ht.consume_fasta(maskfile)
    ht.filter_if_present(inputfile, outfile)

    records = list(fasta_iter(open(outfile)))
    assert len(records) == 1
    assert records[0]['name'] == '3'
Example #27
def main(contig_filename, read_filenames_list, output_filename):

    ht = khmer.new_counting_hash(K, HASHTABLE_SIZE, N_HT)

    '''consumes contig into hashtable'''
    for n, record in enumerate(fasta_iter(open(contig_filename))):
        sequence = record['sequence']
        contig_kmers = slidingWindow(sequence, K)
        for x in contig_kmers:
            if x.find('N') > 0:
                continue
            else:
                ht.consume(x)

    '''counts reads into hashtable abundance'''
    for each_file in read_filenames_list:
        read_file = open(each_file, 'r')
        for n1, record1 in enumerate(fasta_iter(read_file)):
            sequence = record1['sequence']
            read_kmers = slidingWindow(sequence, K)
            for kmer in read_kmers:
                if ht.get(kmer) > 0:
                    ht.count(kmer)
        read_file.close()

    '''retrieve abundances'''
    for n2, record2 in enumerate(fasta_iter(open(contig_filename))):
        contig_seq = record2['sequence']
        count_list = []
        contig_kmers = slidingWindow(contig_seq, K)
        for contig_kmer in contig_kmers:
            count_kmer = int(ht.get(contig_kmer)) - 1
            count_list.append(count_kmer)

    fp = open(output_filename, 'w')
    for item in count_list:
        print >>fp, '%s' % item

    print 'Hashtable occupancy =', ht.n_occupied() / float(HASHTABLE_SIZE)
Example #28
def test_filter_if_present():
    ht = khmer.new_hashbits(32, 1e6, 2)

    maskfile = os.path.join(thisdir, "test-data", "filter-test-A.fa")
    inputfile = os.path.join(thisdir, "test-data", "filter-test-B.fa")
    outfile = os.path.join(thisdir, "test-data", "filter-test-C.fa")

    ht.consume_fasta(maskfile)
    ht.filter_if_present(inputfile, outfile)

    records = list(fasta_iter(open(outfile)))
    assert len(records) == 1
    assert records[0]["name"] == "3"
Example #29
def test_filter_if_present():
    ht = khmer.Hashbits(32, 1e6, 2)

    maskfile = utils.get_test_data("filter-test-A.fa")
    inputfile = utils.get_test_data("filter-test-B.fa")
    outfile = utils.get_temp_filename("filter")

    ht.consume_fasta(maskfile)
    ht.filter_if_present(inputfile, outfile)

    records = list(fasta_iter(open(outfile)))
    assert len(records) == 1
    assert records[0]["name"] == "3"
Example #30
def main(contig_filename, read_filenames_list, output_filename):

    ht = khmer.new_counting_hash(K, HASHTABLE_SIZE, N_HT)

    '''consumes contig into hashtable'''
    for n, record in enumerate(fasta_iter(open(contig_filename))):
        sequence = record['sequence']
        contig_kmers = slidingWindow(sequence, K)
        for x in contig_kmers:
            if x.find('N') > 0:
                continue
            else:
                ht.consume(x)

    '''counts reads into hashtable abundance'''
    for each_file in read_filenames_list:
        read_file = open(each_file, 'r')
        for n1, record1 in enumerate(fasta_iter(read_file)):
            sequence = record1['sequence']
            read_kmers = slidingWindow(sequence, K)
            for kmer in read_kmers:
                if ht.get(kmer) > 0:
                    ht.count(kmer)
        read_file.close()

    '''retrieve abundances'''
    for n2, record2 in enumerate(fasta_iter(open(contig_filename))):
        contig_seq = record2['sequence']
        count_list = []
        contig_kmers = slidingWindow(contig_seq, K)
        for contig_kmer in contig_kmers:
            count_kmer = int(ht.get(contig_kmer)) - 1
            count_list.append(count_kmer)

    fp = open(output_filename, 'w')
    for item in count_list:
        print >>fp, '%s' % item

    print 'Hashtable occupancy =', ht.n_occupied() / float(HASHTABLE_SIZE)
Example #31
def test_filter_if_present():
    ht = khmer.Hashbits(32, 1e6, 2)

    maskfile = utils.get_test_data('filter-test-A.fa')
    inputfile = utils.get_test_data('filter-test-B.fa')
    outfile = utils.get_temp_filename('filter')

    ht.consume_fasta(maskfile)
    ht.filter_if_present(inputfile, outfile)

    records = list(fasta_iter(open(outfile)))
    assert len(records) == 1
    assert records[0]['name'] == '3'
Example #32
def load_sequences(filename):
    d = {}
    records = list(fasta_iter(open(filename), parse_description=False))

    for r in records:
        name = r['name']
        partition = name.rsplit('\t', 1)[1]
        partition = int(partition)

        x = d.get(partition, [])
        x.append(r)
        d[partition] = x

    return len(records), d
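load_sequences assumes the partitioned-FASTA naming used elsewhere in these examples (see output_partitions in Example #51), where each record name carries a tab-separated partition ID as its last field. A small comment-only illustration of the grouping it produces (names and IDs are made up):

# Input records named like 'read1\t5', 'read2\t5', 'read3\t9' would yield:
#   n_records, by_partition = load_sequences('reads.part.fa')
#   n_records == 3
#   by_partition[5] == [<record read1>, <record read2>]
#   by_partition[9] == [<record read3>]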
Example #33
def load_sequences(filename):
    d = {}
    records = list(fasta_iter(open(filename), parse_description=False))

    for r in records:
        name = r['name']
        partition = name.rsplit('\t', 1)[1]
        partition = int(partition)

        x = d.get(partition, [])
        x.append(r)
        d[partition] = x

    return len(records), d
Example #34
def getRef(fp, n_ref):
    """
    Get template sequence from .afa file and a gap profile of the aligned 
    sequences (1 is a real base and 0 is a gap)

    Parameters:
    -----------
    fp : file object
        file object of aligned sequence file (.afa)
    n_ref : int
        number of template sequences to collect

    Returns:
    --------
    str
        name of the first template sequence
    str
        aligned sequence
    list
        a gap profile of aligned sequence

    """

    refs = {}
    reads = {}
    cnt = 0
    for record in fasta.fasta_iter(fp):
        name = record['name']
        seq = record['sequence']
        if 'ReFeReNcE' in name:
            refs[name] = seq
            cnt += 1
        if cnt >= n_ref:
            break

    if cnt < n_ref:
        print 'not enough ReFeReNcE seqs'
        sys.exit(1)

    template = refs.values()[0].upper()  #use the first refSeq as template
    profile = []
    length = len(template)
    for i in range(length):
        if template[i] == 'N' or not template[i].isalpha():
            profile.append(0)
        else:
            profile.append(1)

    return name, template, profile  #return the ref seq and its mask profile
Example #35
def getRef(fp, n_ref):
    """
    Get template sequence from .afa file and a gap profile of the aligned 
    sequences (1 is a real base and 0 is a gap)

    Parameters:
    -----------
    fp : file object
        file object of aligned sequence file (.afa)
    n_ref : int
        number of template sequences to collect

    Returns:
    --------
    str
        name of the first template sequence
    str
        aligned sequence
    list
        a gap profile of aligned sequence

    """

    refs = {}
    reads = {}
    cnt = 0
    for record in fasta.fasta_iter(fp):
        name = record["name"]
        seq = record["sequence"]
        if "ReFeReNcE" in name:
            refs[name] = seq
            cnt += 1
        if cnt >= n_ref:
            break

    if cnt < n_ref:
        print "not enough ReFeReNcE seqs"
        sys.exit(1)

    template = refs.values()[0].upper()  # use the first refSeq as template
    profile = []
    length = len(template)
    for i in range(length):
        if template[i] == "N" or not template[i].isalpha():
            profile.append(0)
        else:
            profile.append(1)

    return name, template, profile  # return the ref seq and its mask profile
Example #36
def test_n_occupied_1():
    filename = os.path.join(thisdir, "test-data/random-20-a.fa")

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 1  # number of hashtables

    ### test modified c++ n_occupied code
    ht1 = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, record in enumerate(fasta_iter(open(filename))):
        ht1.consume(record["sequence"])

    # this number calculated independently
    assert ht1.n_occupied() == 3877
Example #37
def test_n_occupied_1():
    filename = utils.get_test_data('random-20-a.fa')

    ksize = 20  # size of kmer
    htable_size = 100000  # size of hashtableable
    num_htableables = 1  # number of hashtableables

    # test modified c++ n_occupied code
    htableable = khmer.Hashbits(ksize, htable_size, num_htableables)

    for _, record in enumerate(fasta_iter(open(filename))):
        htableable.consume(record['sequence'])

    # this number calculated independently
    assert htableable.n_occupied() == 3884, htableable.n_occupied()
Example #38
def test_n_occupied_1():
    filename = utils.get_test_data('random-20-a.fa')

    ksize = 20  # size of kmer
    htable_size = 100000  # size of hashtableable
    num_htableables = 1  # number of hashtableables

    # test modified c++ n_occupied code
    htableable = khmer.Hashbits(ksize, htable_size, num_htableables)

    for _, record in enumerate(fasta_iter(open(filename))):
        htableable.consume(record['sequence'])

    # this number calculated independently
    assert htableable.n_occupied() == 3884, htableable.n_occupied()
Example #39
def test_n_occupied_1():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 1  # number of hashtables

    # test modified c++ n_occupied code
    ht1 = khmer.LabelHash(K, HT_SIZE, N_HT)

    for n, record in enumerate(fasta_iter(open(filename))):
        ht1.consume(record['sequence'])

    # this number calculated independently
    assert ht1.n_occupied() == 3877
Example #40
def test_n_occupied_1():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 1  # number of hashtables

    # test modified c++ n_occupied code
    ht1 = khmer.Hashbits(K, HT_SIZE, N_HT)

    for n, record in enumerate(fasta_iter(open(filename))):
        ht1.consume(record['sequence'])

    # this number calculated independently
    assert ht1.n_occupied() == 3877
Example #41
def assemble_sequences(records, k, length_cutoff=1000):
    dirname = tempfile.mkdtemp()
    os.chdir(dirname)

    try:
        seqfile = os.path.join(dirname, 'seqs.fa')
        fp = open(seqfile, 'w')
        for r in records:
            fp.write('>%s\n%s\n' % (r['name'].split()[0], r['sequence']))
        fp.close()

        # pass a single command string when shell=True; a list here would only run 'python'
        p = subprocess.Popen(
            'python /root/khmer/scripts/strip-and-split-for-assembly.py '
            'seqs.fa seqs.fa',
            shell=True)
        p.communicate()
        assert p.returncode == 0

        assemble_dir = os.path.join(dirname, 'assemble')
        p = subprocess.Popen('velveth %s %d -shortPaired %s.pe -short %s.se' %
                             (assemble_dir, k, seqfile, seqfile),
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        p = subprocess.Popen(
            'velvetg %s -read_trkg yes -exp_cov auto -cov_cutoff 0' %
            (assemble_dir, ),
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        x = []
        total = 0
        for r in fasta_iter(open(os.path.join(assemble_dir, 'contigs.fa'))):
            seqlen = len(r['sequence'])
            if seqlen >= length_cutoff:
                x.append(r)
                total += seqlen

        return total, x
    finally:
        shutil.rmtree(dirname)
Example #42
def test_bloom_c_1():
    ### test c++ code to count unique kmers using bloom filter

    filename = os.path.join(thisdir, "test-data/random-20-a.fa")

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht3 = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, record in enumerate(fasta_iter(open(filename))):
        ht3.consume(record["sequence"])

    assert ht3.n_occupied() == 3882
    assert ht3.n_unique_kmers() == 3960
Example #43
def test_bloom_c_1():
    # test c++ code to count unique kmers using bloom filter

    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht3 = khmer.Hashbits(K, HT_SIZE, N_HT)

    for n, record in enumerate(fasta_iter(open(filename))):
        ht3.consume(record['sequence'])

    assert ht3.n_occupied() == 3882
    assert ht3.n_unique_kmers() == 3960
Example #44
def test_bloom_c_1():
    # test c++ code to count unique kmers using bloom filter

    filename = utils.get_test_data('random-20-a.fa')

    ksize = 20  # size of kmer
    htable_size = 100000  # size of hashtableable
    num_htableables = 3  # number of hashtableables

    htableable = khmer.Hashbits(ksize, htable_size, num_htableables)

    for _, record in enumerate(fasta_iter(open(filename))):
        htableable.consume(record['sequence'])

    assert htableable.n_occupied() == 3885
    assert htableable.n_unique_kmers() == 3960
Example #45
def test_bloom_c_1():
    # test c++ code to count unique kmers using bloom filter

    filename = utils.get_test_data('random-20-a.fa')

    ksize = 20  # size of kmer
    htable_size = 100000  # size of hashtableable
    num_htableables = 3  # number of hashtableables

    htableable = khmer.Hashbits(ksize, htable_size, num_htableables)

    for _, record in enumerate(fasta_iter(open(filename))):
        htableable.consume(record['sequence'])

    assert htableable.n_occupied() == 3885
    assert htableable.n_unique_kmers() == 3960
Example #46
def test_bloom_c_1():
    # test c++ code to count unique kmers using bloom filter

    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht3 = khmer.LabelHash(K, HT_SIZE, N_HT)

    for n, record in enumerate(fasta_iter(open(filename))):
        ht3.consume(record['sequence'])

    assert ht3.n_occupied() == 3882
    assert ht3.n_unique_kmers() == 3960
Example #47
def main():
    '''
    Usage: python <thisfile> <infile> <outfile>
    '''
    if len(sys.argv) != 3:
        mes = ('Usage: python {} <infile> <outfile>')
        print >> sys.stderr, mes.format(os.path.basename(sys.argv[0]))
        sys.exit(1)

    infile = sys.argv[1]
    outfile = sys.argv[2]

    try:
        if infile == '-':
            fp = sys.stdin
        else:
            fp = open(infile)

        if outfile == '-':
            fw = sys.stdout
        else:
            fw = open(outfile, 'wb')

        lowcomp_fw = open('low_complexity.fa', 'wb')

        for n, record in enumerate(fasta.fasta_iter(fp)):
            name = record['name']
            seq = record['sequence']
            uniq_kmer_count = count_uniq_kmer(seq)
            if uniq_kmer_count * 1.0/(len(seq) - K + 1) < CUTOFF:
                lowcomp_fw.write('>{}\n{}\n'.format(name, seq)) #fasta output
                continue

            fw.write('>{}\n{}\n'.format(name, seq)) #fasta output

        try:
            n
        except NameError:
            print >> sys.stderr, '*** No seqs are in seqfile'

    except IOError as err:
        if outfile == '-':
            pass
        else:
            print >> sys.stderr, '*** {}'.format(err)
            sys.exit(1)
Example #48
def main():
    '''
    Usage: python <thisfile> <infile> <outfile>
    '''
    if len(sys.argv) != 3:
        mes = ('Usage: python {} <infile> <outfile>')
        print >> sys.stderr, mes.format(os.path.basename(sys.argv[0]))
        sys.exit(1)

    infile = sys.argv[1]
    outfile = sys.argv[2]

    try:
        if infile == '-':
            fp = sys.stdin
        else:
            fp = open(infile)

        if outfile == '-':
            fw = sys.stdout
        else:
            fw = open(outfile, 'wb')

        lowcomp_fw = open('low_complexity.fa', 'wb')

        for n, record in enumerate(fasta.fasta_iter(fp)):
            name = record['name']
            seq = record['sequence']
            uniq_kmer_count = count_uniq_kmer(seq)
            if uniq_kmer_count * 1.0 / (len(seq) - K + 1) < CUTOFF:
                lowcomp_fw.write('>{}\n{}\n'.format(name, seq))  #fasta output
                continue

            fw.write('>{}\n{}\n'.format(name, seq))  #fasta output

        try:
            n
        except NameError:
            print >> sys.stderr, '*** No seqs are in seqfile'

    except IOError as err:
        if outfile == '-':
            pass
        else:
            print >> sys.stderr, '*** {}'.format(err)
            sys.exit(1)
Example #49
def assemble_sequences(records, k, length_cutoff=1000):
    dirname = tempfile.mkdtemp()
    os.chdir(dirname)

    try:
        seqfile = os.path.join(dirname, 'seqs.fa')
        fp = open(seqfile, 'w')
        for r in records:
            fp.write('>%s\n%s\n' % (r['name'].split()[0], r['sequence']))
        fp.close()

        # pass a single command string when shell=True; a list here would only run 'python'
        p = subprocess.Popen(
            'python /root/khmer/scripts/strip-and-split-for-assembly.py '
            'seqs.fa seqs.fa', shell=True)
        p.communicate()
        assert p.returncode == 0

        assemble_dir = os.path.join(dirname, 'assemble')
        p = subprocess.Popen(
            'velveth %s %d -shortPaired %s.pe -short %s.se' % (
                assemble_dir, k, seqfile, seqfile),
            shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        p = subprocess.Popen(
            'velvetg %s -read_trkg yes -exp_cov auto -cov_cutoff 0' % (
                assemble_dir,),
            shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        x = []
        total = 0
        for r in fasta_iter(open(os.path.join(assemble_dir, 'contigs.fa'))):
            seqlen = len(r['sequence'])
            if seqlen >= length_cutoff:
                x.append(r)
                total += seqlen

        return total, x
    finally:
        shutil.rmtree(dirname)
Example #50
def main():
    filename = sys.argv[1]
    K = int(sys.argv[2])  # size of kmer
    HT_SIZE = int(sys.argv[3])  # size of hashtable
    N_HT = int(sys.argv[4])  # number of hashtables

    ht = khmer.Nodegraph(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht.get(kmer)):
                n_unique += 1
            ht.count(kmer)

    print(n_unique)
    print(ht.n_occupied())
    print(ht.n_unique_kmers())
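This example constructs the table as khmer.Nodegraph, while earlier ones use khmer.new_hashbits or khmer.Hashbits for the same loop; the snippets appear to span several khmer releases, so only the constructor matching the installed version will exist:

# Equivalent presence-only (Bloom filter) table constructors seen across these examples:
#   ht = khmer.new_hashbits(K, HT_SIZE, N_HT)   # oldest examples (factory function)
#   ht = khmer.Hashbits(K, HT_SIZE, N_HT)       # later examples (class constructor)
#   ht = khmer.Nodegraph(K, HT_SIZE, N_HT)      # this example (khmer 2.x naming)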
Example #51
def main():
    ht = khmer.new_hashbits(K, 1, 1)

    ht.consume_partitioned_fasta(sys.argv[1])
    before = ht.count_partitions()

    last_name = None
    last_record = None
    for n, record in enumerate(
            fasta_iter(open(sys.argv[1]), parse_description=False)):
        if n % 10000 == 0:
            print '...', n

        name = record['name'].split()[0]
        name = name.split('/', 1)[0]

        if name == last_name:
            if 1:
                pid_1 = ht.get_partition_id(last_record['sequence'][:K])
                pid_2 = ht.get_partition_id(record['sequence'][:K])

                ht.join_partitions(pid_1, pid_2)
            else:  # TEST
                pid_1 = get_partition(last_record)
                pid_2 = get_partition(record)
                assert pid_1 == pid_2, (last_record, record, pid_1, pid_2)

        last_name = name
        last_record = record

    ht.output_partitions(sys.argv[1], sys.argv[1] + '.paired')
    print 'before:', before
    after = ht.count_partitions()
    print 'after:', after

    n_combined = before[0] - after[0]
    print 'combined:', n_combined
Example #52
K = 32
HASHTABLE_SIZE = int(1e9)
N_HT = 4

infile = sys.argv[1]
outfile = sys.argv[2]
outfp = open(outfile, 'w')

print 'making hashtable'
ht = khmer.new_counting_hash(K, HASHTABLE_SIZE, N_HT)

print 'eating', infile
ht.consume_fasta(infile)

print 'counting'
for n, record in enumerate(fasta_iter(open(infile))):
    if n % 10000 == 0:
        print >> sys.stderr, '...', n

    seq = record['sequence']
    if len(seq) < K:
        continue

    x = []
    for pos in range(0, len(seq) - K + 1):
        x.append(ht.get(seq[pos:pos + K]))

    print >> outfp, '>%s\n%s' % (record['name'], record['sequence'])
    print >> outfp, " ".join(map(str, x))

    median, average, stddev = ht.get_median_count(seq)
Example #53
#! /usr/bin/env python
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2013. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt. Contact: [email protected]
#
import sys
from screed.fasta import fasta_iter
import re

outfp = open(sys.argv[2], 'w')

for n, record in enumerate(fasta_iter(open(sys.argv[1]))):
    if n % 100000 == 0:
        print >> sys.stderr, '...', n

    if 'N' in record['sequence']:
        splitseq = re.split('N+', record.sequence)
        for i in range(len(splitseq)):
            print >> outfp, '>%s.%d\n%s' % (record.name, i + 1, splitseq[i])

    else:
        print >> outfp, '>%s\n%s' % (record['name'], record['sequence'])
Example #54
def main():
    parser = argparse.ArgumentParser(
        description='Use bloom filter to count intersection k-mers')

    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_HASHSIZE)

    parser.add_argument('-q',
                        '--quiet',
                        dest='quiet',
                        default=False,
                        action='store_true')
    parser.add_argument('--ksize',
                        '-k',
                        type=int,
                        dest='ksize',
                        default=env_ksize,
                        help='k-mer size to use')
    parser.add_argument('--n_hashes',
                        '-N',
                        type=int,
                        dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize',
                        '-x',
                        type=float,
                        dest='hashsize',
                        default=env_hashsize,
                        help='hashsize to use')
    parser.add_argument('first_filename')
    parser.add_argument('second_filename')
    parser.add_argument('report_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.hashsize == DEFAULT_HASHSIZE:
            print >> sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.hashsize
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x hashsize / 8)' % (
            args.n_hashes * args.hashsize / 8.)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.hashsize
    N_HT = args.n_hashes
    filename = args.first_filename
    filename2 = args.second_filename
    file_result = args.report_filename

    file_result_object = open(file_result, 'w')

    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht.get(kmer)):
                n_unique += 1
            ht.count(kmer)
    print filename, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht.n_occupied()
    printout = filename + ":" + '\n'
    printout = printout + '# of unique kmers:' + str(n_unique) + '\n'
    printout = printout + '# of occupied bin:' + str(ht.n_occupied()) + '\n'

    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    for n, record in enumerate(fasta_iter(open(filename2))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht2.get(kmer)):
                n_unique += 1
                if (ht.get(kmer)):
                    n_overlap += 1
            ht2.count(kmer)

    print filename2, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht2.n_occupied()

    print n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2

    printout = printout + filename2 + ":" + '\n'
    printout = printout + '# of unique kmers:' + str(n_unique) + '\n'
    printout = printout + '# of occupied bin:' + str(ht2.n_occupied()) + '\n'
    printout = printout + '# of overlap unique kmers:' + str(n_overlap) + '\n'

    file_result_object.write(printout)
Example #55
filename = sys.argv[1]

outname1 = sys.argv[2]
outname2 = sys.argv[3]
num = int(sys.argv[4])  # percentage to pick randomly

f1 = open(outname1, 'w')
f2 = open(outname2, 'w')

list_seq = []
list_name = []

for n, record in enumerate(fasta_iter(open(filename))):
    sequence = record['sequence']
    name = record['name']
    list_seq.append(sequence)
    list_name.append(name)
    if len(list_seq) == 100:
        # pick `num` reads at random out of each block of 100
        r = random.sample(xrange(100), num)
        # first half of the picks goes to outname1 ...
        for i in r[:num / 2]:
            f1.write('>' + list_name[i] + '\n')
            f1.write(list_seq[i] + '\n')
        # ... and the rest goes to outname2 (the listing is truncated here in the
        # source; the remaining lines are inferred by symmetry with the loop above)
        for j in r[num / 2:]:
            f2.write('>' + list_name[j] + '\n')
            f2.write(list_seq[j] + '\n')
        list_seq = []
        list_name = []
Example #56
def read_partition_file(fp):
    for n, record in enumerate(fasta_iter(fp, parse_description=False)):
        name = record['name']
        name, partition_id = name.rsplit('\t', 1)
        yield n, name, int(partition_id), record['sequence']
Example #57
def load_fa_seq_names(filename):
    fp = open(filename)
    records = list(fasta_iter(fp))
    names = [r['name'] for r in records]
    return names
Example #58
#! /usr/bin/env python
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2013. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt. Contact: [email protected]
#
import sys
from screed.fasta import fasta_iter

n = 0
for filename in sys.argv[1:]:
    sys.stderr.write('... %s %d\n' % (filename, n))
    idx = filename.find('group')
    assert idx != -1, filename

    group_num = int(filename[idx + 5:].split('.')[0])

    for record in fasta_iter(open(filename + '/contigs.fa')):
        print n, group_num, len(record['sequence'])
        n += 1