def run_no_curve(K,HT_SIZE,N_HT,filename,filename2,file_result): file_result_object = open(file_result,'w') ht = khmer.new_hashbits(K, HT_SIZE, N_HT) n_unique = 0 for n, record in enumerate(fasta_iter(open(filename))): sequence = record['sequence'] seq_len = len(sequence) for n in range(0,seq_len+1-K): kmer = sequence[n:n+K] if (not ht.get(kmer)): n_unique+=1 ht.count(kmer) print filename,'has been consumed.' print '# of unique kmers:',n_unique print '# of occupied bin:',ht.n_occupied() printout = filename+":"+'\n' printout = printout+'# of unique kmers:'+str(n_unique)+'\n' printout = printout + '# of occupied bin:'+str(ht.n_occupied())+'\n' ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT) n_unique = 0 n_overlap = 0 for n, record in enumerate(fasta_iter(open(filename2))): sequence = record['sequence'] seq_len = len(sequence) for n in range(0,seq_len+1-K): kmer = sequence[n:n+K] if (not ht2.get(kmer)): n_unique+=1 if (ht.get(kmer)): n_overlap+=1 ht2.count(kmer) print filename2,'has been consumed.' print '# of unique kmers:',n_unique print '# of occupied bin:',ht2.n_occupied() print n_overlap,'unique kmers appears in both ',filename,' and ',filename2 printout = printout+filename2+":"+'\n' printout = printout+'# of unique kmers:'+str(n_unique)+'\n' printout = printout + '# of occupied bin:'+str(ht2.n_occupied())+'\n' printout = printout + '# of overlap unique kmers:' + str(n_overlap) + '\n' file_result_object.write(printout)
def test_bloom_python_1():
    """Count unique 20-mers in pure Python over a Hashbits bloom filter."""
    filename = utils.get_test_data('random-20-a.fa')
    ksize = 20        # size of kmer
    table_size = 100000   # size of hashtable
    n_tables = 3      # number of hashtables
    bloom = khmer.Hashbits(ksize, table_size, n_tables)
    unique = 0
    for record in fasta_iter(open(filename)):
        seq = record['sequence']
        for start in range(len(seq) - ksize + 1):
            kmer = seq[start:start + ksize]
            if not bloom.get(kmer):
                unique += 1
            bloom.count(kmer)
    assert unique == 3960
    assert bloom.n_occupied() == 3885, bloom.n_occupied()
    # this number equals n_unique
    assert bloom.n_unique_kmers() == 3960, bloom.n_unique_kmers()
def test_hll_add_python():
    """Count unique kmers with HyperLogLog via the low-level add() method.

    add() accepts anything, so feed it raw k-mers and compare the estimate
    against an exact count kept in a set of canonicalized k-mers.
    """
    filename = utils.get_test_data('random-20-a.fa')
    hllcpp = khmer.HLLCounter(ERR_RATE, K)
    exact = set()
    for record in fasta_iter(open(filename)):
        seq = record['sequence']
        for start in range(len(seq) - K + 1):
            kmer = seq[start:start + K]
            rc = "".join(TRANSLATE[c] for c in kmer[::-1])
            hllcpp.add(kmer)
            # canonicalize: if the reverse complement was already seen,
            # count this occurrence under it
            canonical = rc if rc in exact else kmer
            exact.add(canonical)
    assert len(exact) == N_UNIQUE
    assert abs(1 - float(hllcpp.estimate_cardinality()) / N_UNIQUE) < ERR_RATE
def test_bloom_python_1():
    """Pure-Python unique k-mer count against a Hashbits bloom filter."""
    filename = utils.get_test_data("random-20-a.fa")
    ksize = 20            # size of kmer
    table_size = 100000   # size of hashtable
    num_tables = 3        # number of hashtables
    table = khmer.Hashbits(ksize, table_size, num_tables)
    seen_new = 0
    for record in fasta_iter(open(filename)):
        seq = record["sequence"]
        n_kmers = len(seq) - ksize + 1
        for pos in range(n_kmers):
            kmer = seq[pos:pos + ksize]
            if not table.get(kmer):
                seen_new += 1
            table.count(kmer)
    assert seen_new == 3960
    # this number equals n_unique
    assert table.n_occupied() == 3885, table.n_occupied()
    assert table.n_unique_kmers() == 3960, table.n_unique_kmers()
def test_bloom_python_1():
    """Python-side bloom-filter count of unique 20-mers."""
    filename = utils.get_test_data('random-20-a.fa')
    k = 20              # size of kmer
    table_size = 100000  # size of hashtable
    n_tables = 3        # number of hashtables
    bloom = khmer.Hashbits(k, table_size, n_tables)
    n_unique = 0
    for record in fasta_iter(open(filename)):
        seq = record['sequence']
        for pos in range(len(seq) + 1 - k):
            word = seq[pos:pos + k]
            if not bloom.get(word):
                n_unique += 1
            bloom.count(word)
    assert n_unique == 3960
    # occupied-bin count equals n_unique here
    assert bloom.n_occupied() == 3885, bloom.n_occupied()
    assert bloom.n_unique_kmers() == 3960, bloom.n_unique_kmers()
def verbose_fasta_iter(filename):
    """Yield FASTA records from `filename`, logging progress to stderr
    every 10000 records."""
    from screed.fasta import fasta_iter
    it = fasta_iter(open(filename))
    for n, record in enumerate(it):
        if n % 10000 == 0:
            # progress marker only; records are passed through unchanged
            print >>sys.stderr, '... filtering', n
        yield record
def main():
    """Write a positional k-mer-degree histogram for the reads in
    sys.argv[1] to sys.argv[2].

    For each read position, accumulates the graph degree of the k-mer
    starting there, then emits: position, summed degree, #observations,
    and the mean degree per position.  Uses module-level K,
    HASHTABLE_SIZE, N_HT.
    """
    outfp = open(sys.argv[2], 'w')
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    ht.consume_fasta(sys.argv[1])
    # histograms are indexed by read position; reads longer than 200-K+1
    # positions would overflow these fixed-size lists
    hist = [0] * 200
    histcount = [0] * 200
    for n, record in enumerate(fasta_iter(open(sys.argv[1]))):
        if n % 10000 == 0:
            print '...', n
        seq = record['sequence']
        for pos in range(0, len(seq) - K + 1):
            kmer = seq[pos:pos + K]
            count = ht.kmer_degree(kmer)
            hist[pos] += count
            histcount[pos] += 1
    for i in range(len(hist)):
        total = hist[i]
        count = histcount[i]
        if not count:
            # no read reached this position; skip to avoid division by zero
            continue
        print >> outfp, i, total, count, total / float(count)
def assemble_sequences(f, k, length_cutoff=LENGTH_CUTOFF):
    """Assemble the reads in seqfile `f` with velvet at k-mer size `k`.

    Expects paired reads in <f>.pe and singletons in <f>.se.  Returns
    (total, contigs): the summed length of, and the list of, assembled
    contig records at least `length_cutoff` bp long.
    """
    try:
        seqfile = f
        #dirname = os.path.dirname(os.path.abspath(f))
        dirname = tempfile.mkdtemp()
        assemble_dir = os.path.join(dirname, 'assemble')
        # velveth builds the hashing roadmap from the paired/single read files
        p = subprocess.Popen('velveth %s %d -shortPaired %s.pe -short %s.se' % (assemble_dir, k, seqfile, seqfile), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)
        # velvetg performs the actual assembly
        p = subprocess.Popen('velvetg %s -read_trkg yes -exp_cov auto -cov_cutoff 0' % (assemble_dir,), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)
        x = []
        total = 0
        # keep only contigs meeting the length cutoff
        for r in fasta_iter(open(os.path.join(assemble_dir, 'contigs.fa'))):
            seqlen = len(r['sequence'])
            if seqlen >= length_cutoff:
                x.append(r)
                total += seqlen
        return total, x
    finally:
        pass
        # NOTE(review): the `pass` above is redundant; also, if
        # tempfile.mkdtemp() itself raised, `dirname` is unbound here
        # and rmtree would raise NameError -- confirm intended structure.
        shutil.rmtree(dirname)
def assemble_sequences(f, k, length_cutoff=LENGTH_CUTOFF):
    """Assemble the reads in seqfile `f` with the SGA pipeline script
    (module-level SGA_PIPE) at k-mer size `k`.

    Returns (total, contigs): summed length of, and list of, contigs
    at least `length_cutoff` bp long.
    """
    try:
        seqfile = f
        #dirname = os.path.dirname(os.path.abspath(f))
        dirname = tempfile.mkdtemp()
        assemble_dir = os.path.join(dirname, 'assemble')
        # run the SGA wrapper: bash SGA_PIPE <reads> <k> <outdir>
        p = subprocess.Popen('bash %s %s %d %s' % (SGA_PIPE, seqfile, k, assemble_dir), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print 'bash %s %s %d %s' % (SGA_PIPE, seqfile, k, assemble_dir)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)
        x = []
        total = 0
        print os.listdir(assemble_dir)
        # the pipeline names its contig file <basename>.sga.<k>-contigs.fa
        for r in fasta_iter(open(os.path.join(assemble_dir, '%s.sga.%d-contigs.fa' %(os.path.basename(f), k)))):
            seqlen = len(r['sequence'])
            if seqlen >= length_cutoff:
                x.append(r)
                total += seqlen
        return total, x
    finally:
        pass
        # NOTE(review): `pass` is redundant; `dirname` may be unbound here
        # if mkdtemp() failed -- confirm intended cleanup structure.
        shutil.rmtree(dirname)
def run_no_curve(K, HT_SIZE, N_HT, filename, filename2, file_result): file_result_object = open(file_result, 'w') ht = khmer.new_hashbits(K, HT_SIZE, N_HT) n_unique = 0 for n, record in enumerate(fasta_iter(open(filename))): sequence = record['sequence'] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): kmer = sequence[n:n + K] if (not ht.get(kmer)): n_unique += 1 ht.count(kmer) print filename, 'has been consumed.' print '# of unique kmers:', n_unique print '# of occupied bin:', ht.n_occupied() printout = filename + ":" + '\n' printout = printout + '# of unique kmers:' + str(n_unique) + '\n' printout = printout + '# of occupied bin:' + str(ht.n_occupied()) + '\n' ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT) n_unique = 0 n_overlap = 0 for n, record in enumerate(fasta_iter(open(filename2))): sequence = record['sequence'] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): kmer = sequence[n:n + K] if (not ht2.get(kmer)): n_unique += 1 if (ht.get(kmer)): n_overlap += 1 ht2.count(kmer) print filename2, 'has been consumed.' print '# of unique kmers:', n_unique print '# of occupied bin:', ht2.n_occupied() print n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2 printout = printout + filename2 + ":" + '\n' printout = printout + '# of unique kmers:' + str(n_unique) + '\n' printout = printout + '# of occupied bin:' + str(ht2.n_occupied()) + '\n' printout = printout + '# of overlap unique kmers:' + str(n_overlap) + '\n' file_result_object.write(printout)
def count_sum_contigs(cutoff, filename):
    """Return the summed length of all sequences in `filename` that are
    at least `cutoff` bp long."""
    return sum(
        len(record['sequence'])
        for record in fasta_iter(open(filename))
        if len(record['sequence']) >= cutoff
    )
def main():
    '''
    Usage: python <thisfile> <infile> length numseq2keep tag <outfile>
    '''
    # Subsample up to `num` sequences from `infile`, keeping only those at
    # least `length` bp long and whose name does NOT contain `tag`, and
    # write each kept sequence truncated to `length` bp to `outfile`.
    # '-' means stdin/stdout for infile/outfile respectively.
    if len(sys.argv) != 6:
        mes = ('Usage: python {} <infile> length numseq2keep tag <outfile>\n'
               '*** tag can be used screen OUT seq names\n')
        print >> sys.stderr, mes.format(os.path.basename(sys.argv[0]))
        sys.exit(1)
    infile = sys.argv[1]
    length = int(sys.argv[2])
    num = int(sys.argv[3])
    tag = sys.argv[4]
    outfile = sys.argv[5]
    try:
        if infile == '-':
            fp = sys.stdin
        else:
            fp = open(infile)
        if outfile == '-':
            fw = sys.stdout
        else:
            fw = open(outfile, 'wb')
        for n, record in enumerate(fasta.fasta_iter(fp)):
            if n == num:
                # collected enough sequences
                break
            name = record['name']
            seq = record['sequence']
            if len(seq) < length:
                continue
            if tag in name:
                # tagged names are screened OUT
                continue
            new_seq = seq[:length]
            fw.write('>{}\n{}\n'.format(name, new_seq))  #fasta output
        # `n` is only bound if the loop ran at least once
        try:
            n
        except NameError:
            # NOTE(review): if the file was empty, the `if n < num` below
            # would still raise NameError -- presumably unreachable in
            # practice; confirm.
            print >> sys.stderr, '*** No seqs are in seqfile'
        if n < num:
            mes = '*** Not enough seqs in {} ({} < {}), only {} subsampled'
            print >> sys.stderr, mes.format(os.path.basename(infile),
                                            n, num, n)
    except IOError as err:
        print >> sys.stderr, err
        fw.close()
        sys.exit(1)
def test_hll_consume_string():
    """C++ HyperLogLog cardinality estimate, feeding one read at a time
    via screed."""
    filename = utils.get_test_data('random-20-a.fa')
    hll = khmer.HLLCounter(ERR_RATE, K)
    for record in fasta_iter(open(filename)):
        hll.consume_string(record['sequence'])
    estimate = float(hll.estimate_cardinality())
    assert abs(1 - estimate / N_UNIQUE) < ERR_RATE
def load_fa_seq_names(filename):
    """Return the names of all FASTA records in `filename`.

    Skips the calling test (nose.SkipTest) when fasta_iter was never
    imported successfully.
    """
    try:
        fasta_iter
    except NameError:
        raise nose.SkipTest
    handle = open(filename)
    return [record['name'] for record in fasta_iter(handle)]
def main():
    """Print the records of the FASTA file sys.argv[1] in random order."""
    # name -> sequence map; duplicate names collapse to the last occurrence
    d = dict([(r['name'], r['sequence']) for r in fasta_iter(open(sys.argv[1]))])
    ks = d.keys()
    random.shuffle(ks)
    for k in ks:
        s = d[k]
        print '>%s\n%s' % (k, s)
def main():
    """Count unique k-mers in two FASTA files (sys.argv[1] and
    sys.argv[5]) with bloom filters and report how many unique k-mers
    the two files share."""
    filename = sys.argv[1]
    ksize = int(sys.argv[2])        # size of kmer
    table_size = int(sys.argv[3])   # size of hashtable
    n_tables = int(sys.argv[4])     # number of hashtables

    def each_kmer(path):
        # Generate every k-mer of every sequence in the FASTA file.
        for record in fasta_iter(open(path)):
            seq = record['sequence']
            for start in range(len(seq) + 1 - ksize):
                yield seq[start:start + ksize]

    ht = khmer.new_hashbits(ksize, table_size, n_tables)
    n_unique = 0
    for kmer in each_kmer(filename):
        if not ht.get(kmer):
            n_unique += 1
        ht.count(kmer)
    print(filename, 'has been consumed.')
    print('# of unique kmers:', n_unique)
    print('# of occupied bin:', ht.n_occupied())

    filename2 = sys.argv[5]
    ht2 = khmer.new_hashbits(ksize, table_size, n_tables)
    n_unique = 0
    n_overlap = 0
    for kmer in each_kmer(filename2):
        if not ht2.get(kmer):
            n_unique += 1
            if ht.get(kmer):
                n_overlap += 1
        ht2.count(kmer)
    print(filename2, 'has been consumed.')
    print('# of unique kmers:', n_unique)
    print('# of occupied bin:', ht2.n_occupied())
    print(n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2)
def main():
    """Count unique k-mers in two FASTA files with Hashbits bloom filters
    and report the overlap between the two k-mer sets."""
    filename = sys.argv[1]
    ksize = int(sys.argv[2])       # size of kmer
    table_size = int(sys.argv[3])  # size of hashtable
    num_tables = int(sys.argv[4])  # number of hashtables

    first = khmer.Hashbits(ksize, table_size, num_tables)
    unique_count = 0
    for record in fasta_iter(open(filename)):
        seq = record["sequence"]
        for pos in range(len(seq) + 1 - ksize):
            kmer = seq[pos:pos + ksize]
            if not first.get(kmer):
                unique_count += 1
            first.count(kmer)
    print(filename, "has been consumed.")
    print("# of unique kmers:", unique_count)
    print("# of occupied bin:", first.n_occupied())

    filename2 = sys.argv[5]
    second = khmer.Hashbits(ksize, table_size, num_tables)
    unique_count = 0
    overlap_count = 0
    for record in fasta_iter(open(filename2)):
        seq = record["sequence"]
        for pos in range(len(seq) + 1 - ksize):
            kmer = seq[pos:pos + ksize]
            if second.get(kmer):
                second.count(kmer)
                continue
            unique_count += 1
            if first.get(kmer):
                overlap_count += 1
            second.count(kmer)
    print(filename2, "has been consumed.")
    print("# of unique kmers:", unique_count)
    print("# of occupied bin:", second.n_occupied())
    print(overlap_count, "unique kmers appears in both ", filename, " and ", filename2)
def main():
    """Count unique k-mers in two FASTA files (sys.argv[1], sys.argv[5])
    with bloom filters and report the overlap between their k-mer sets.

    Command line: <infile1> <K> <HT_SIZE> <N_HT> <infile2>
    """
    filename = sys.argv[1]
    K = int(sys.argv[2])  # size of kmer
    HT_SIZE = int(sys.argv[3])  # size of hashtable
    N_HT = int(sys.argv[4])  # number of hashtables
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        # slide a window of width K over the sequence
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht.get(kmer)):
                n_unique += 1
            ht.count(kmer)
    print filename, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht.n_occupied()
    filename2 = sys.argv[5]
    ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
    n_unique = 0
    n_overlap = 0
    for n, record in enumerate(fasta_iter(open(filename2))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0, seq_len + 1 - K):
            kmer = sequence[n:n + K]
            if (not ht2.get(kmer)):
                n_unique += 1
                # new to file2: check whether file1 already had it
                if (ht.get(kmer)):
                    n_overlap += 1
            ht2.count(kmer)
    print filename2, 'has been consumed.'
    print '# of unique kmers:', n_unique
    print '# of occupied bin:', ht2.n_occupied()
    print n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2
def cutMSA(fp, start, end):
    """
    Cut a region in Multiple Sequence Alignment based on the start and
    end positions on template

    Parameters:
    -----------
    fp : file object
        file object of aligned sequence file (.afa)
    start : int
        start position of target region
    end : int
        end position of end region

    Returns:
    --------
    tuple
        a tuple with tow items. First item is target region in tempalte
        and second item is a dictionary with sequence name as key and
        target region of that sequence as value
    """
    fw = open('%s.%sto%s.cut' % (sys.argv[1], start, end), 'w')
    refName, template, profile = getRef(fp, 1)
    length = len(profile)
    # map template coordinates (counting only non-gap columns) to
    # alignment-column indices start1/end1
    for i in range(length):
        if profile[i] == 0:
            continue
        j = sum(profile[:i + 1])  # number of real bases up to column i
        if j == int(start):
            start1 = i
        if j == int(end):
            end1 = i
            break
    print >> fw, '>%s\n%s' % (refName, template[start1:(end1 + 1)])
    rows = {}  #ref seq not included
    fp.seek(0, 0)  # re-read the alignment from the beginning
    for record in fasta.fasta_iter(fp):
        name = record['name']
        if 'ReFeReNcE' in name:
            # skip reference sequences; only reads are collected
            continue
        seq = record['sequence']
        assert len(seq) == length, 'not afa format'
        subSeq = seq[start1:(end1 + 1)]
        rows[name] = subSeq
        print >> fw, '>%s\n%s' % (name, subSeq)
    return template[start1:(end1 + 1)], rows
def process_file(filename,HT_SIZE_array): N_HT = 4 K = 12 list_average_miscount = [] list_average_miscount_perc = [] list_fp_miscount0 = [] print filename for HT_SIZE in HT_SIZE_array: print HT_SIZE ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) ht.consume_fasta(filename) ktable = khmer.new_ktable(K) for n, record in enumerate(fasta_iter(open(filename))): sequence = record['sequence'] # ktable.consume(sequence) seq_len = len(sequence) for n in range(0,seq_len+1-K): kmer = sequence[n:n+K] ktable.count(kmer) list_miscount = [] list_miscount_perc = [] total_kmer = 0 # total number of unique k-mers miscount0 = 0 for i in range(0, ktable.n_entries()): n = ktable.get(i) if n: total_kmer = total_kmer + 1 kmer2 = ktable.reverse_hash(i) miscount = ht.get(kmer2) - ktable.get(kmer2)###### miscount_perc = miscount/ktable.get(kmer2) list_miscount.append(miscount) list_miscount_perc.append(miscount_perc) if miscount > 0: miscount0 = miscount0 + 1 average_miscount = float(sum(list_miscount))/len(list_miscount) list_average_miscount.append(average_miscount) average_miscount_perc = float(sum(list_miscount_perc))/len(list_miscount_perc) list_average_miscount_perc.append(average_miscount_perc) fp_miscount0 = float(miscount0)/total_kmer list_fp_miscount0.append(fp_miscount0) to_return = [list_average_miscount,list_fp_miscount0,total_kmer,list_average_miscount_perc] return to_return
def cutMSA(fp, start, end):
    """
    Cut a region in Multiple Sequence Alignment based on the start and
    end positions on template

    Parameters:
    -----------
    fp : file object
        file object of aligned sequence file (.afa)
    start : int
        start position of target region
    end : int
        end position of end region

    Returns:
    --------
    tuple
        a tuple with tow items. First item is target region in tempalte
        and second item is a dictionary with sequence name as key and
        target region of that sequence as value
    """
    fw = open("%s.%sto%s.cut" % (sys.argv[1], start, end), "w")
    refName, template, profile = getRef(fp, 1)
    length = len(profile)
    # translate template coordinates (non-gap bases only) into alignment
    # column indices start1/end1
    for i in range(length):
        if profile[i] == 0:
            continue
        j = sum(profile[: i + 1])  # real-base count up to column i
        if j == int(start):
            start1 = i
        if j == int(end):
            end1 = i
            break
    print >> fw, ">%s\n%s" % (refName, template[start1 : (end1 + 1)])
    rows = {}  # ref seq not included
    fp.seek(0, 0)  # rewind and re-iterate the alignment
    for record in fasta.fasta_iter(fp):
        name = record["name"]
        if "ReFeReNcE" in name:
            # reference sequences are excluded from the returned rows
            continue
        seq = record["sequence"]
        assert len(seq) == length, "not afa format"
        subSeq = seq[start1 : (end1 + 1)]
        rows[name] = subSeq
        print >> fw, ">%s\n%s" % (name, subSeq)
    return template[start1 : (end1 + 1)], rows
def test_filter_if_present():
    """Reads sharing k-mers with the mask must be removed; exactly one
    record ('3') survives filtering."""
    table = khmer.LabelHash(32, 1e6, 2)
    maskfile = utils.get_test_data('filter-test-A.fa')
    inputfile = utils.get_test_data('filter-test-B.fa')
    outfile = utils.get_temp_filename('filter')
    table.consume_fasta(maskfile)
    table.filter_if_present(inputfile, outfile)
    survivors = [r for r in fasta_iter(open(outfile))]
    assert len(survivors) == 1
    assert survivors[0]['name'] == '3'
def main(contig_filename, read_filenames_list, output_filename): ht = khmer.new_counting_hash(K, HASHTABLE_SIZE, N_HT) '''consumes contig into hashtable''' for n, record in enumerate(fasta_iter(open(contig_filename))): sequence = record['sequence'] contig_kmers = slidingWindow(sequence, K) for x in contig_kmers: if x.find('N') > 0: continue else: ht.consume(x) '''counts reads into hashtable abundance''' for each_file in read_filenames_list: read_file = open(each_file, 'r') for n1, record1 in enumerate(fasta_iter(read_file)): sequence = record1['sequence'] read_kmers = slidingWindow(sequence, K) for kmer in read_kmers: if ht.get(kmer) > 0: ht.count(kmer) read_file.close() '''retrieve abundances''' for n2, record2 in enumerate(fasta_iter(open(contig_filename))): contig_seq = record2['sequence'] count_list = [] contig_kmers = slidingWindow(contig_seq, K) for contig_kmer in contig_kmers: count_kmer = int(ht.get(contig_kmer)) - 1 count_list.append(count_kmer) fp = open(output_filename, 'w') for item in count_list: print >>fp, '%s' % item print 'Hashtable occupancy =', ht.n_occupied() / float(HASHTABLE_SIZE)
def test_filter_if_present():
    """filter_if_present drops reads sharing k-mers with the mask file;
    only record '3' should remain."""
    table = khmer.new_hashbits(32, 1e6, 2)
    maskfile = os.path.join(thisdir, "test-data", "filter-test-A.fa")
    inputfile = os.path.join(thisdir, "test-data", "filter-test-B.fa")
    outfile = os.path.join(thisdir, "test-data", "filter-test-C.fa")
    table.consume_fasta(maskfile)
    table.filter_if_present(inputfile, outfile)
    kept = [r for r in fasta_iter(open(outfile))]
    assert len(kept) == 1
    assert kept[0]["name"] == "3"
def test_filter_if_present():
    """Filtering against the mask leaves exactly one record, named '3'."""
    bloom = khmer.Hashbits(32, 1e6, 2)
    maskfile = utils.get_test_data("filter-test-A.fa")
    inputfile = utils.get_test_data("filter-test-B.fa")
    outfile = utils.get_temp_filename("filter")
    bloom.consume_fasta(maskfile)
    bloom.filter_if_present(inputfile, outfile)
    remaining = list(fasta_iter(open(outfile)))
    assert len(remaining) == 1
    assert remaining[0]["name"] == "3"
def test_filter_if_present():
    # Consume the mask, filter the input against it, and verify that a
    # single record named '3' passes through.
    mask_path = utils.get_test_data('filter-test-A.fa')
    input_path = utils.get_test_data('filter-test-B.fa')
    out_path = utils.get_temp_filename('filter')
    hashbits = khmer.Hashbits(32, 1e6, 2)
    hashbits.consume_fasta(mask_path)
    hashbits.filter_if_present(input_path, out_path)
    output_records = list(fasta_iter(open(out_path)))
    assert len(output_records) == 1
    assert output_records[0]['name'] == '3'
def load_sequences(filename):
    """Group FASTA records by the partition id that is tab-appended to
    each record name.

    Returns (total record count, dict mapping partition id -> records).
    """
    by_partition = {}
    records = list(fasta_iter(open(filename), parse_description=False))
    for record in records:
        # partition id is the last tab-separated field of the name
        pid = int(record['name'].rsplit('\t', 1)[1])
        by_partition.setdefault(pid, []).append(record)
    return len(records), by_partition
def getRef(fp, n_ref):
    """
    Get template sequence from .afa file and a gap profile of the aligned
    sequences (1 is a real base and 0 is a gap)

    Parameters:
    -----------
    fp : file object
        file object of aligned sequence file (.afa)
    n_ref : int
        number of template sequence to collect

    Returns:
    --------
    str
        name of the first template sequence
    str
        aligned sequence
    list
        a gap profile of aligned sequence
    """
    refs = {}
    reads = {}  # NOTE(review): never populated or used
    cnt = 0
    for record in fasta.fasta_iter(fp):
        name = record['name']
        seq = record['sequence']
        if 'ReFeReNcE' in name:
            refs[name] = seq
            cnt += 1
            if cnt >= n_ref:
                break
    if cnt < n_ref:
        print 'not enough ReFeReNcE seqs'
        sys.exit(1)
    template = refs.values()[0].upper()  #use the first refSeq as template
    # build a 0/1 gap profile over the template columns
    profile = []
    length = len(template)
    for i in range(length):
        if template[i] == 'N' or not template[i].isalpha():
            profile.append(0)
        else:
            profile.append(1)
    # NOTE(review): `name` is the loop variable, i.e. the name of the LAST
    # reference iterated -- with n_ref > 1 that may not be the same
    # reference whose sequence was used as template (refs.values()[0]);
    # confirm intended.
    return name, template, profile  #return the ref seq and its mask profile
def getRef(fp, n_ref):
    """
    Get template sequence from .afa file and a gap profile of the aligned
    sequences (1 is a real base and 0 is a gap)

    Parameters:
    -----------
    fp : file object
        file object of aligned sequence file (.afa)
    n_ref : int
        number of template sequence to collect

    Returns:
    --------
    str
        name of the first template sequence
    str
        aligned sequence
    list
        a gap profile of aligned sequence
    """
    refs = {}
    reads = {}  # NOTE(review): never populated or used
    cnt = 0
    for record in fasta.fasta_iter(fp):
        name = record["name"]
        seq = record["sequence"]
        if "ReFeReNcE" in name:
            refs[name] = seq
            cnt += 1
            if cnt >= n_ref:
                break
    if cnt < n_ref:
        print "not enough ReFeReNcE seqs"
        sys.exit(1)
    template = refs.values()[0].upper()  # use the first refSeq as template
    # 0/1 gap profile: 1 for a real base, 0 for gaps/N
    profile = []
    length = len(template)
    for i in range(length):
        if template[i] == "N" or not template[i].isalpha():
            profile.append(0)
        else:
            profile.append(1)
    # NOTE(review): `name` is the last iterated reference's name, which may
    # differ from the reference used as template when n_ref > 1 -- confirm.
    return name, template, profile  # return the ref seq and its mask profile
def test_n_occupied_1():
    """Exercise the modified C++ n_occupied code on a known input."""
    filename = os.path.join(thisdir, "test-data/random-20-a.fa")
    ksize = 20        # size of kmer
    table_size = 100000   # size of hashtable
    n_tables = 1      # number of hashtables
    table = khmer.new_hashbits(ksize, table_size, n_tables)
    for record in fasta_iter(open(filename)):
        table.consume(record["sequence"])
    # this number calculated independently
    assert table.n_occupied() == 3877
def test_n_occupied_1():
    """Check the modified C++ n_occupied count against an independently
    computed value."""
    filename = utils.get_test_data('random-20-a.fa')
    k = 20               # size of kmer
    table_size = 100000  # size of hashtable
    n_tables = 1         # number of hashtables
    bits = khmer.Hashbits(k, table_size, n_tables)
    for record in fasta_iter(open(filename)):
        bits.consume(record['sequence'])
    # this number calculated independently
    assert bits.n_occupied() == 3884, bits.n_occupied()
def test_n_occupied_1():
    # test modified c++ n_occupied code
    filename = utils.get_test_data('random-20-a.fa')
    ksize, table_size, n_tables = 20, 100000, 1
    label_hash = khmer.LabelHash(ksize, table_size, n_tables)
    for record in fasta_iter(open(filename)):
        label_hash.consume(record['sequence'])
    # this number calculated independently
    assert label_hash.n_occupied() == 3877
def test_n_occupied_1():
    """Consume a known FASTA file and check the occupied-bin count from
    the modified C++ n_occupied code."""
    data_path = utils.get_test_data('random-20-a.fa')
    kmer_size = 20       # size of kmer
    table_size = 100000  # size of hashtable
    table_count = 1      # number of hashtables
    hashbits = khmer.Hashbits(kmer_size, table_size, table_count)
    for rec in fasta_iter(open(data_path)):
        hashbits.consume(rec['sequence'])
    # expected value computed independently
    assert hashbits.n_occupied() == 3877
def assemble_sequences(records, k, length_cutoff=1000):
    """Assemble `records` with velvet at k-mer size `k` in a temp dir.

    Writes the records to seqs.fa, splits them into paired/single files
    with the khmer strip-and-split helper, runs velveth/velvetg, and
    returns (total, contigs): summed length of, and list of, contigs at
    least `length_cutoff` bp long.  The temp dir is always removed.

    Fix: the strip-and-split step passed a list to Popen with
    shell=True, which hands only the first element ('python') to the
    shell and drops the script and its arguments entirely; it also fused
    the two filenames into the single argument 'seqs.fa seqs.fa'.  Run it
    as a plain argv list without a shell.
    """
    dirname = tempfile.mkdtemp()
    os.chdir(dirname)
    try:
        seqfile = os.path.join(dirname, 'seqs.fa')
        fp = open(seqfile, 'w')
        for r in records:
            fp.write('>%s\n%s\n' % (r['name'].split()[0], r['sequence']))
        fp.close()

        # split seqs.fa into seqs.fa.pe (paired) and seqs.fa.se (single)
        p = subprocess.Popen([
            'python', '/root/khmer/scripts/strip-and-split-for-assembly.py',
            'seqs.fa', 'seqs.fa'
        ])
        p.communicate()
        assert p.returncode == 0

        assemble_dir = os.path.join(dirname, 'assemble')
        # velveth builds the roadmap from the paired/single read files
        p = subprocess.Popen('velveth %s %d -shortPaired %s.pe -short %s.se'
                             % (assemble_dir, k, seqfile, seqfile),
                             shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        # velvetg performs the assembly proper
        p = subprocess.Popen(
            'velvetg %s -read_trkg yes -exp_cov auto -cov_cutoff 0'
            % (assemble_dir, ),
            shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        x = []
        total = 0
        # keep only contigs meeting the length cutoff
        for r in fasta_iter(open(os.path.join(assemble_dir, 'contigs.fa'))):
            seqlen = len(r['sequence'])
            if seqlen >= length_cutoff:
                x.append(r)
                total += seqlen
        return total, x
    finally:
        shutil.rmtree(dirname)
def test_bloom_c_1():
    ### test c++ code to count unique kmers using bloom filter
    filename = os.path.join(thisdir, "test-data/random-20-a.fa")
    ksize, table_size, n_tables = 20, 100000, 3
    bloom = khmer.new_hashbits(ksize, table_size, n_tables)
    for record in fasta_iter(open(filename)):
        bloom.consume(record["sequence"])
    assert bloom.n_occupied() == 3882
    assert bloom.n_unique_kmers() == 3960
def test_bloom_c_1():
    """C++-side unique k-mer count with a Hashbits bloom filter."""
    data_path = utils.get_test_data('random-20-a.fa')
    kmer_size = 20       # size of kmer
    table_size = 100000  # size of hashtable
    num_tables = 3       # number of hashtables
    table = khmer.Hashbits(kmer_size, table_size, num_tables)
    for record in fasta_iter(open(data_path)):
        table.consume(record['sequence'])
    assert table.n_occupied() == 3882
    assert table.n_unique_kmers() == 3960
def test_bloom_c_1():
    # C++ bloom-filter unique k-mer counting on a known data file.
    filename = utils.get_test_data('random-20-a.fa')
    k = 20               # size of kmer
    table_size = 100000  # size of hashtable
    n_tables = 3         # number of hashtables
    bits = khmer.Hashbits(k, table_size, n_tables)
    for record in fasta_iter(open(filename)):
        bits.consume(record['sequence'])
    assert bits.n_occupied() == 3885
    assert bits.n_unique_kmers() == 3960
def test_bloom_c_1():
    """C++ unique k-mer counting through a LabelHash bloom filter."""
    filename = utils.get_test_data('random-20-a.fa')
    ksize = 20           # size of kmer
    table_size = 100000  # size of hashtable
    num_tables = 3       # number of hashtables
    label_hash = khmer.LabelHash(ksize, table_size, num_tables)
    for record in fasta_iter(open(filename)):
        label_hash.consume(record['sequence'])
    assert label_hash.n_occupied() == 3882
    assert label_hash.n_unique_kmers() == 3960
def main():
    '''
    Usage: python <thisfile> <infile> <outfile>
    '''
    # Split a FASTA file by sequence complexity: sequences whose fraction
    # of unique k-mers falls below module-level CUTOFF go to
    # low_complexity.fa, the rest go to <outfile>.  '-' means stdin/stdout.
    if len(sys.argv) != 3:
        mes = ('Usage: python {} <infile> <outfile>')
        print >> sys.stderr, mes.format(os.path.basename(sys.argv[0]))
        sys.exit(1)
    infile = sys.argv[1]
    outfile = sys.argv[2]
    try:
        if infile == '-':
            fp = sys.stdin
        else:
            fp = open(infile)
        if outfile == '-':
            fw = sys.stdout
        else:
            fw = open(outfile, 'wb')
        lowcomp_fw = open('low_complexity.fa', 'wb')
        for n, record in enumerate(fasta.fasta_iter(fp)):
            name = record['name']
            seq = record['sequence']
            uniq_kmer_count = count_uniq_kmer(seq)
            # unique k-mers / total k-mer windows; low ratio = repetitive
            if uniq_kmer_count * 1.0/(len(seq) - K + 1) < CUTOFF:
                lowcomp_fw.write('>{}\n{}\n'.format(name, seq))  #fasta output
                continue
            fw.write('>{}\n{}\n'.format(name, seq))  #fasta output
        # `n` unbound means the iterator yielded nothing
        try:
            n
        except NameError:
            print >> sys.stderr, '*** No seqs are in seqfile'
    except IOError as err:
        if outfile == '-':
            # broken pipe on stdout is expected; ignore
            pass
        else:
            print >> sys.stderr, '*** {}'.format(err)
            sys.exit(1)
def main():
    '''
    Usage: python <thisfile> <infile> <outfile>
    '''
    # Route low-complexity sequences (unique-k-mer ratio below CUTOFF) to
    # low_complexity.fa and the remainder to <outfile>; '-' selects
    # stdin/stdout.  Uses module-level K, CUTOFF and count_uniq_kmer().
    if len(sys.argv) != 3:
        mes = ('Usage: python {} <infile> <outfile>')
        print >> sys.stderr, mes.format(os.path.basename(sys.argv[0]))
        sys.exit(1)
    infile = sys.argv[1]
    outfile = sys.argv[2]
    try:
        if infile == '-':
            fp = sys.stdin
        else:
            fp = open(infile)
        if outfile == '-':
            fw = sys.stdout
        else:
            fw = open(outfile, 'wb')
        lowcomp_fw = open('low_complexity.fa', 'wb')
        for n, record in enumerate(fasta.fasta_iter(fp)):
            name = record['name']
            seq = record['sequence']
            uniq_kmer_count = count_uniq_kmer(seq)
            # ratio of distinct k-mers to k-mer windows in the sequence
            if uniq_kmer_count * 1.0 / (len(seq) - K + 1) < CUTOFF:
                lowcomp_fw.write('>{}\n{}\n'.format(name, seq))  #fasta output
                continue
            fw.write('>{}\n{}\n'.format(name, seq))  #fasta output
        # detect the no-records case: `n` never got bound
        try:
            n
        except NameError:
            print >> sys.stderr, '*** No seqs are in seqfile'
    except IOError as err:
        if outfile == '-':
            # writing to a closed stdout pipe is tolerated silently
            pass
        else:
            print >> sys.stderr, '*** {}'.format(err)
            sys.exit(1)
def assemble_sequences(records, k, length_cutoff=1000):
    """Assemble `records` with velvet at k-mer size `k` in a temp dir.

    Returns (total, contigs): the summed length of, and the list of,
    contigs at least `length_cutoff` bp long.  The temp dir is removed
    on exit.

    Fix: the strip-and-split step passed an argv list together with
    shell=True, which makes the shell execute only 'python' and discard
    the script path and arguments; it also fused the two filenames into
    one argument 'seqs.fa seqs.fa'.  Execute it as a plain argv list
    without a shell.
    """
    dirname = tempfile.mkdtemp()
    os.chdir(dirname)
    try:
        seqfile = os.path.join(dirname, 'seqs.fa')
        fp = open(seqfile, 'w')
        for r in records:
            fp.write('>%s\n%s\n' % (r['name'].split()[0], r['sequence']))
        fp.close()

        # split seqs.fa into paired (.pe) and single (.se) read files
        p = subprocess.Popen(
            ['python',
             '/root/khmer/scripts/strip-and-split-for-assembly.py',
             'seqs.fa', 'seqs.fa'])
        p.communicate()
        assert p.returncode == 0

        assemble_dir = os.path.join(dirname, 'assemble')
        # velveth: build the hashing roadmap
        p = subprocess.Popen(
            'velveth %s %d -shortPaired %s.pe -short %s.se' % (
                assemble_dir, k, seqfile, seqfile),
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        # velvetg: run the assembly
        p = subprocess.Popen(
            'velvetg %s -read_trkg yes -exp_cov auto -cov_cutoff 0' % (
                assemble_dir,),
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        assert p.returncode == 0, (stdout, stderr)

        x = []
        total = 0
        # keep only contigs meeting the length cutoff
        for r in fasta_iter(open(os.path.join(assemble_dir, 'contigs.fa'))):
            seqlen = len(r['sequence'])
            if seqlen >= length_cutoff:
                x.append(r)
                total += seqlen
        return total, x
    finally:
        shutil.rmtree(dirname)
def main():
    """Count unique k-mers in a FASTA file with a Nodegraph bloom filter
    and print the unique count, occupied bins, and n_unique_kmers()."""
    filename = sys.argv[1]
    ksize = int(sys.argv[2])       # size of kmer
    table_size = int(sys.argv[3])  # size of hashtable
    n_tables = int(sys.argv[4])    # number of hashtables
    graph = khmer.Nodegraph(ksize, table_size, n_tables)
    n_unique = 0
    for record in fasta_iter(open(filename)):
        seq = record['sequence']
        for start in range(len(seq) + 1 - ksize):
            kmer = seq[start:start + ksize]
            if not graph.get(kmer):
                n_unique += 1
            graph.count(kmer)
    print(n_unique)
    print(graph.n_occupied())
    print(graph.n_unique_kmers())
def main():
    """Join the partitions of read pairs in the partitioned FASTA file
    sys.argv[1] and write the result to <file>.paired.

    Reads sharing a name prefix (up to the first '/') are treated as a
    pair, and their partitions are merged.
    """
    ht = khmer.new_hashbits(K, 1, 1)
    ht.consume_partitioned_fasta(sys.argv[1])
    before = ht.count_partitions()
    last_name = None
    last_record = None
    for n, record in enumerate(
            fasta_iter(open(sys.argv[1]), parse_description=False)):
        if n % 10000 == 0:
            print '...', n
        # normalize the read name: drop description, then the /1 or /2 suffix
        name = record['name'].split()[0]
        name = name.split('/', 1)[0]
        if name == last_name:
            # consecutive records with the same base name form a pair
            if 1:
                pid_1 = ht.get_partition_id(last_record['sequence'][:K])
                pid_2 = ht.get_partition_id(record['sequence'][:K])
                ht.join_partitions(pid_1, pid_2)
            else:  # TEST
                pid_1 = get_partition(last_record)
                pid_2 = get_partition(record)
                assert pid_1 == pid_2, (last_record, record, pid_1, pid_2)
        last_name = name
        last_record = record
    ht.output_partitions(sys.argv[1], sys.argv[1] + '.paired')
    print 'before:', before
    after = ht.count_partitions()
    print 'after:', after
    # how many partitions were merged away by pairing
    n_combined = before[0] - after[0]
    print 'combined:', n_combined
# Annotate each read in <infile> with the per-position k-mer abundance
# counts from a counting hash built over the same file; write the read
# plus its count vector to <outfile>.
K = 32
HASHTABLE_SIZE = int(1e9)
N_HT = 4
infile = sys.argv[1]
outfile = sys.argv[2]
outfp = open(outfile, 'w')
print 'making hashtable'
ht = khmer.new_counting_hash(K, HASHTABLE_SIZE, N_HT)
print 'eating', infile
ht.consume_fasta(infile)
print 'counting'
for n, record in enumerate(fasta_iter(open(infile))):
    if n % 10000 == 0:
        print >> sys.stderr, '...', n
    seq = record['sequence']
    if len(seq) < K:
        # too short to contain a single k-mer
        continue
    x = []
    for pos in range(0, len(seq) - K + 1):
        x.append(ht.get(seq[pos:pos + K]))
    print >> outfp, '>%s\n%s' % (record['name'], record['sequence'])
    print >> outfp, " ".join(map(str, x))
# NOTE(review): computed for the last read only and never used -- confirm
# whether this leftover line should be removed.
median, average, stddev = ht.get_median_count(seq)
#! /usr/bin/env python # # This file is part of khmer, http://github.com/ged-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2013. It is licensed under # the three-clause BSD license; see doc/LICENSE.txt. Contact: [email protected] # import sys from screed.fasta import fasta_iter import re outfp = open(sys.argv[2], 'w') for n, record in enumerate(fasta_iter(open(sys.argv[1]))): if n % 100000 == 0: print >> sys.stderr, '...', n if 'N' in record['sequence']: splitseq = re.split('N+', record.sequence) for i in range(len(splitseq)): print >> outfp, '>%s.%d\n%s' % (record.name, i + 1, splitseq[i]) else: print >> outfp, '>%s\n%s' % (record['name'], record['sequence'])
def main(): parser = argparse.ArgumentParser( description='Use bloom filter to count intersection k-mers') env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K) env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT) env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_HASHSIZE) parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') parser.add_argument('--ksize', '-k', type=int, dest='ksize', default=env_ksize, help='k-mer size to use') parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes', default=env_n_hashes, help='number of hash tables to use') parser.add_argument('--hashsize', '-x', type=float, dest='hashsize', default=env_hashsize, help='hashsize to use') parser.add_argument('first_filename') parser.add_argument('second_filename') parser.add_argument('report_filename') args = parser.parse_args() if not args.quiet: if args.hashsize == DEFAULT_HASHSIZE: print >> sys.stderr, "** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!" print >> sys.stderr, '\nPARAMETERS:' print >> sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize print >> sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.hashsize print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x hashsize / 8)' % ( args.n_hashes * args.hashsize / 8.) print >> sys.stderr, '-' * 8 K = args.ksize HT_SIZE = args.hashsize N_HT = args.n_hashes filename = args.first_filename filename2 = args.second_filename file_result = args.report_filename file_result_object = open(file_result, 'w') ht = khmer.new_hashbits(K, HT_SIZE, N_HT) n_unique = 0 for n, record in enumerate(fasta_iter(open(filename))): sequence = record['sequence'] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): kmer = sequence[n:n + K] if (not ht.get(kmer)): n_unique += 1 ht.count(kmer) print filename, 'has been consumed.' 
print '# of unique kmers:', n_unique print '# of occupied bin:', ht.n_occupied() printout = filename + ":" + '\n' printout = printout + '# of unique kmers:' + str(n_unique) + '\n' printout = printout + '# of occupied bin:' + str(ht.n_occupied()) + '\n' ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT) n_unique = 0 n_overlap = 0 for n, record in enumerate(fasta_iter(open(filename2))): sequence = record['sequence'] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): kmer = sequence[n:n + K] if (not ht2.get(kmer)): n_unique += 1 if (ht.get(kmer)): n_overlap += 1 ht2.count(kmer) print filename2, 'has been consumed.' print '# of unique kmers:', n_unique print '# of occupied bin:', ht2.n_occupied() print n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2 printout = printout + filename2 + ":" + '\n' printout = printout + '# of unique kmers:' + str(n_unique) + '\n' printout = printout + '# of occupied bin:' + str(ht2.n_occupied()) + '\n' printout = printout + '# of overlap unique kmers:' + str(n_overlap) + '\n' file_result_object.write(printout)
filename = sys.argv[1] outname1 = sys.argv[2] outname2 = sys.argv[3] num = int(sys.argv[4]) # percentage to pick randomly f1 = open(outname1,'w') f2 = open(outname2,'w') list_seq=[] list_name=[] for n, record in enumerate(fasta_iter(open(filename))): sequence = record['sequence'] name = record['name'] list_seq.append(sequence) list_name.append(name) if len(list_seq)==100: #print num r=random.sample(xrange(100),num) #print r for i in r[:num/2]: f1.write('>' +list_name[i]+'\n') f1.write(list_seq[i]+'\n') # print num/2 for j in r[num/2:]: #print j
def read_partition_file(fp):
    """Iterate a partitioned FASTA stream.

    Yields (index, name, partition_id, sequence) for each record, where
    the partition id is split off the tab-suffixed record name.
    """
    for index, record in enumerate(fasta_iter(fp, parse_description=False)):
        full_name = record['name']
        short_name, pid = full_name.rsplit('\t', 1)
        yield index, short_name, int(pid), record['sequence']
def load_fa_seq_names(filename):
    """Return the list of record names in a FASTA file.

    :param filename: path of the FASTA file to read.
    :return: list of name strings, in file order.

    Fix: the original leaked the file handle (opened, never closed);
    a ``with`` block now guarantees closure.
    """
    with open(filename) as fp:
        return [record['name'] for record in fasta_iter(fp)]
#! /usr/bin/env python # # This file is part of khmer, http://github.com/ged-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2013. It is licensed under # the three-clause BSD license; see doc/LICENSE.txt. Contact: [email protected] # import sys from screed.fasta import fasta_iter n = 0 for filename in sys.argv[1:]: sys.stderr.write('... %s %d\n' % (filename, n)) idx = filename.find('group') assert idx != -1, filename group_num = int(filename[idx + 5:].split('.')[0]) for record in fasta_iter(open(filename + '/contigs.fa')): print n, group_num, len(record['sequence']) n += 1