def write_chip_seq_data(chip_seq_data, filename):
    """Write one ``index,base,value`` line per genome position to *filename*.

    The genome is obtained from ``get_ecoli_genome()`` and paired position by
    position with ``chip_seq_data`` through ``zip``, so output stops at the
    shorter of the two sequences.  The file is opened in ``'w'`` mode, so any
    existing content is replaced.
    """
    genome = get_ecoli_genome()
    with open(filename, 'w') as out_file:
        indexed_pairs = enumerate(zip(genome, chip_seq_data))
        # verbose_gen wraps the iteration (second argument 10**5);
        # presumably it reports progress -- confirm against its definition.
        for position, pair in verbose_gen(indexed_pairs, 10**5):
            base, value = pair
            out_file.write("%s,%s,%s\n" % (position, base, value))
def chip_ps_const_frag_length_ref(ps,mean_frag_length,cells=10000,verbose=False):
    """Apply ``alo`` to a window of ``ps`` around every position and scale by ``cells``.

    For position ``i`` the window is
    ``ps[i - mean_frag_length/2 : i + mean_frag_length]`` clipped to
    ``[0, len(ps)]``.  Returns a numpy array of length ``len(ps)``.

    When ``verbose`` is true the position loop is wrapped in
    ``verbose_gen(..., modulus=1000)``.
    """
    genome_length = len(ps)
    signal = np.zeros(genome_length)
    positions = xrange(genome_length)
    if verbose:
        positions = verbose_gen(positions, modulus=1000)
    for pos in positions:
        # NOTE(review): the window reaches mean_frag_length/2 to the left but a
        # full mean_frag_length to the right -- confirm the asymmetry is intended.
        window_start = max(pos - mean_frag_length/2, 0)
        window_stop = min(pos + mean_frag_length, genome_length)
        signal[pos] = alo(ps[window_start:window_stop])
    return cells*signal
def compute_coefficients(ks):
    """Fold the values in ``ks`` into a coefficient array of length ``len(ks) + 1``.

    The array starts as ``[0, ..., 0, 1]`` and, for each ``k`` in ``ks``, is
    replaced by ``np.roll(arr, -1) - k*arr``.  Each step therefore looks like
    multiplying a polynomial by ``(x - k)`` with the highest-degree
    coefficient ending up first -- confirm against callers.

    ``ks`` is iterated twice, so it must be re-iterable.
    """
    coeffs = np.array([0 for _ in ks] + [1])
    for k in verbose_gen(ks, modulus=10000):
        shifted = np.roll(coeffs, -1)
        coeffs = shifted - k*coeffs
    return coeffs
def test_inverse_cdf_sampler():
    """Visual check of ``inverse_cdf_sampler`` on a uniform distribution.

    Builds a uniform distribution over ``5*10**6`` outcomes, draws the same
    number of samples from the sampler, and shows a 1000-bin histogram of
    them.  Nothing is asserted or returned; the result is inspected by eye.
    """
    K = int(5*10**6)
    trials = K
    uniform_ps = [1.0/K] * K
    sampler = inverse_cdf_sampler(uniform_ps)
    samples = []
    for _ in verbose_gen(xrange(trials), modulus=100000):
        samples.append(sampler())
    plt.hist(samples, bins=1000)
    plt.show()
def test_frags_from_splits(G,mean_frag_length,trials):
    """Compare ``frags_from_splits`` with ``frags_from_splits_ref`` on random inputs.

    Each trial draws 100 random positions in ``range(G)`` as ``config`` and a
    set of splits from ``make_splits(G, 1.0/mean_frag_length)``, then compares
    the two implementations' results as sets.

    Returns the first ``(config, splits)`` pair on which they disagree, or
    ``None`` (implicitly) if all ``trials`` agree.
    """
    lamb = 1.0/mean_frag_length
    for _ in verbose_gen(range(trials)):
        config = [random.randrange(G) for _pos in range(100)]
        splits = make_splits(G, lamb)
        observed = set(frags_from_splits(config, splits))
        expected = set(frags_from_splits_ref(config, splits))
        if observed != expected:
            # Hand back the counterexample for inspection.
            return config, splits
def dummify_genome(genome, w):
    """One-hot encode every length-``w`` window of a circular genome.

    Returns a ``(len(genome), 4*w)`` numpy array.  Row ``i`` describes the
    window starting at position ``i`` (wrapping around the end); within the
    row, columns ``4*j .. 4*j+3`` mark which of A, C, G, T occupies offset
    ``j``.  Any other character raises ``KeyError``.  The matrix shape is
    printed as a side effect.
    """
    column_of = {"A": 0, "C": 1, "G": 2, "T": 3}
    G = len(genome)
    mat = np.zeros((G, 4*w))
    print(mat.shape)
    for row in verbose_gen(xrange(G), modulus=1000):
        for offset in range(w):
            base = genome[(row + offset) % G]
            mat[row, 4*offset + column_of[base]] = 1
    return mat
def chip_ps_const_frag_length_ref2(ps,mean_frag_length,cells=10000,verbose=False):
    """Simulate per-cell coverage counts with a fixed-size window.

    For each of ``cells`` iterations a centre is drawn uniformly from
    ``range(len(ps) - mean_frag_length)``; the window ``centre +/- cutoff``
    (clipped to the array) is incremented by one with probability
    ``alo(ps[window])``.  Returns the accumulated count array.

    When ``verbose`` is true the cell loop is wrapped in
    ``verbose_gen(..., modulus=10000)``.
    """
    genome_length = len(ps)
    coverage = np.zeros(genome_length)
    cell_ids = xrange(cells)
    if verbose:
        cell_ids = verbose_gen(cell_ids, modulus=10000)
    for _ in cell_ids:
        centre = random.randrange(genome_length - mean_frag_length)
        # NOTE(review): `cutoff` is not defined in this function; it must be
        # supplied at module level or this raises NameError -- confirm.
        lo = max(centre - cutoff, 0)
        hi = min(centre + cutoff, genome_length)
        window = ps[lo:hi]
        if random.random() < alo(window):
            coverage[lo:hi] += 1
    return coverage
def max_in_window_ref(scores,k):
    """Return max in window of radius k over circular array of scores.

    Reference (explicit double loop) implementation: entry ``i`` of the
    result is the maximum of ``scores[(i + j) % G]`` for ``j`` in
    ``-k .. k``, where ``G = len(scores)``.  Returns a float numpy array of
    length ``G``.  ``k`` is expected to be non-negative.
    """
    G = len(scores)
    max_scores = np.empty(G)
    for i in verbose_gen(xrange(G),10000):
        # Seed the running maximum with the first element of the window rather
        # than None: ordering a number against None only works through
        # Python 2's arbitrary mixed-type ordering and is a TypeError on
        # Python 3.
        best = scores[(i - k) % G]
        for j in xrange(-k + 1, k + 1):
            candidate = scores[(i + j) % G]
            if candidate > best:
                best = candidate
        max_scores[i] = best
    return max_scores
def chip_seq_log_likelihood_ref(ps,mapped_reads,N):
    """Reference log likelihood of a chip-seq dataset under hypothesis ``ps``.

    ``mapped_reads`` and ``ps`` are paired position by position; each pair
    ``(m, p)`` contributes a binomial log probability of ``m`` successes in
    ``N`` trials (``N`` being the total number of cells) with success
    probability ``p``.  The contributions are summed.

    Note that ``ps[i]`` is a hypothesis about the probability that a fragment
    covers base ``i``, not a hypothesis that base ``i`` is occupied.
    """
    def log_dbinom(N,k,p):
        # log of C(N, k) * p**k * (1-p)**(N-k)
        return log_choose(N,k) + k*log(p) + (N-k)*log(1-p)

    total = 0
    for reads_here, p_here in verbose_gen(zip(mapped_reads, ps), modulus=1000):
        total += log_dbinom(N, reads_here, p_here)
    return total
def exp_reconstruction(reads,lamb,G):
    """Reconstruct fragment density map by assuming exponential extension of each read.

    ``reads`` yields ``(strand, start, stop)`` triples, each asserted to span
    exactly 75 positions.  Every position inside a read gets weight 1.  The
    read is then extended by ``10 * int(1/lamb)`` positions -- past ``stop``
    for ``"+"`` reads, before ``start`` otherwise -- adding
    ``(1 - lamb) ** distance`` at each position, where the distance is
    measured from that end of the read.  Extension positions wrap modulo
    ``G``; positions inside the read are indexed directly.

    Returns a list of length ``G``.
    """
    frag_map = [0]*G
    mfl = int(1/lamb)
    reach = 10*mfl
    survival = 1 - lamb
    for (strand, start, stop) in verbose_gen(reads, modulus=10000):
        assert(stop - start == 75)
        for covered in range(start, stop):
            frag_map[covered] += 1
        if strand == "+":
            endpoint = stop
            extension = xrange(stop, stop + reach)
        else:
            endpoint = start
            extension = xrange(start - reach, start)
        for pos in extension:
            frag_map[pos % G] += survival**abs(pos - endpoint)
    return frag_map
def predict_chip_ps5(ps,mean_frag_length,cells=100):
    """Predict a per-position chip signal from the distribution ``ps``.

    For every position ``i`` two sums are formed, one over positions ``j``
    to the left of ``i`` and one over positions from ``i`` rightwards.  Each
    term is ``ps[j]`` times the product of ``1 - ps[k]`` over the positions
    between ``j`` and ``i``, times ``(1 - 1/mean_frag_length) ** |j - i|``.
    Terms with ``|j - i| >= cutoff`` are ignored, where ``cutoff`` is
    ``min(5 * mean_frag_length, len(ps))``.

    Returns a list with ``cells * (1 - (1 - left(i)) * (1 - right(i)))`` for
    each ``i``.
    """
    G = len(ps)
    lamb = 1.0/mean_frag_length
    # ignore contributions outside 5 times expected fragment length
    cutoff = min(5*mean_frag_length,G)
    survival = 1 - lamb
    # Integer over-estimate of the window radius.  It only bounds how far we
    # scan; the exact `abs(j-i) < cutoff` test stays in the filters below, so
    # the selected terms (and their order) are unchanged even for a
    # non-integer cutoff.  This avoids walking the whole genome per position.
    reach = int(cutoff) + 1

    def left(i):
        return sum(ps[j]*product(1-ps[k] for k in range(j+1,i+1))*survival**abs(j-i)
                   for j in range(max(0, i - reach), i)
                   if abs(j-i) < cutoff)

    def right(i):
        return sum(ps[j]*product(1-ps[k] for k in range(i,j))*survival**abs(j-i)
                   for j in range(i, min(G, i + reach))
                   if abs(j-i) < cutoff)

    return [cells*(1-(1-left(i))*(1-right(i))) for i in verbose_gen(range(G))]
def esp_spec(ps,k,powsums=None):
    """Compute the ``k``-th entry of a recurrence driven by power sums.

    With ``e[0] = 1`` the recurrence is::

        e[m] = (1/m) * sum_{i=1..m} (-1)**(i-1) * e[m-i] * powsums[i]

    which has the form of Newton's identities for elementary symmetric
    polynomials.  ``powsums[i]`` is taken from the argument when supplied;
    otherwise it is computed as ``powsum(ps, i)`` for ``i`` in ``0..k``, with
    progress messages printed before and after.

    Returns ``1`` for ``k == 0`` and a float otherwise.
    """
    if k == 0:
        return 1
    if powsums is None:
        print("computing powersums...")
        powsums = [powsum(ps,i) for i in verbose_gen(range(k+1))]
        print("finished with powersums")
    esp_array = [None]*(k+1)
    esp_array[0] = 1
    for order in range(1, k+1):
        acc = 0
        for i in range(1, order+1):
            acc += (-1)**(i-1)*esp_array[order-i]*powsums[i]
        esp_array[order] = acc/float(order)
    return esp_array[k]
def ising(hs, J, iterations=50000, boundary="periodic", spins=None, burn_in=0):
    """Sample a 1-D chain of +/-1 spins by sequential sweeps and return up-spin frequencies.

    Parameters:
        hs: per-site fields; ``len(hs)`` sets the number of sites.
        J: coupling applied to the sum of the two neighbouring spins.
        iterations: number of full sweeps over the chain.
        boundary: accepted but not used; neighbours always wrap around
            (indices are taken modulo the chain length).
        spins: optional initial configuration.  When omitted, a random
            +/-1 numpy array is drawn.  A supplied array is updated in place.
        burn_in: number of initial sweeps excluded from the average.

    Returns:
        Array giving, per site, the fraction of the retained sweeps after
        which the spin equalled +1.

    Every 1000th sweep the current ``sum(spins)`` is printed.
    ``iterations`` must exceed ``burn_in``; otherwise the final division is
    by zero or a negative count.
    """
    N = len(hs)
    if spins is None:
        spins = np.array([random.choice([-1, 1]) for i in range(N)])
    occupancies = np.zeros(N)
    for t in verbose_gen(xrange(iterations), modulus=1000):
        for i in range(N):
            neighbours = spins[(i - 1) % N] + spins[(i + 1) % N]
            current_energy = spins[i] * (hs[i] + J * neighbours)
            # Flipping the spin negates its local energy term.
            prop_energy = -current_energy
            p_prop = exp(-prop_energy) / (exp(-current_energy) + exp(-prop_energy))
            if random.random() < p_prop:
                spins[i] *= -1
        if t % 1000 == 0:
            print(sum(spins))
        # Use `>=` so that exactly `iterations - burn_in` sweeps are
        # accumulated, matching the normalisation below.  With `>` the sweep
        # at t == burn_in was dropped and every frequency was biased low.
        if t >= burn_in:
            occupancies += spins == 1
    return occupancies / (iterations - burn_in)
def cftp_ising(hs, J, replicas):
    """Average ``replicas`` independent ``cftp(hs, J)`` samples site by site.

    Each sample value ``c`` is mapped to ``(c + 1) / 2.0`` before averaging
    (so -1 becomes 0.0 and +1 becomes 1.0).  Returns one mean per column of
    the transposed sample list.
    """
    samples = []
    for _ in verbose_gen(xrange(replicas)):
        samples.append(cftp(hs, J))
    site_means = []
    for col in transpose(samples):
        site_means.append(mean([(c + 1) / 2.0 for c in col]))
    return site_means
def gibbs_sample_many(ks,q,t,n):
    """Sample system (ks,q) by gibbs sampling at time t, for n trials.

    Every trial runs ``gibbs_sample_iterate`` from the initial state
    ``[G]*q`` (``G = len(ks)``), converts the result with ``ss_from_xs``,
    and the trials are then averaged column-wise.
    """
    G = len(ks)
    trials = []
    for _ in verbose_gen(xrange(n)):
        final_xs = gibbs_sample_iterate(ks, [G]*q, t)
        trials.append(ss_from_xs(final_xs, G))
    return [mean(column) for column in transpose(trials)]
def sequential_sample_many(ks,q,n):
    """Average ``n`` draws of ``sequential_sample_ref(ks, q)`` column-wise."""
    draws = []
    for _ in verbose_gen(xrange(n)):
        draws.append(sequential_sample_ref(ks, q))
    return [mean(column) for column in transpose(draws)]
def write_chip_seq_data(chip_seq_data,filename):
    """Dump chip-seq values alongside the genome as comma-separated text.

    Each output line is ``index,base,value``, where ``base`` comes from
    ``get_ecoli_genome()`` and ``value`` from ``chip_seq_data`` at the same
    index.  The two are combined with ``zip``, so writing stops at the
    shorter one.  ``filename`` is opened in ``'w'`` mode, so existing content
    is replaced.
    """
    genome = get_ecoli_genome()
    with open(filename,'w') as handle:
        rows = verbose_gen(enumerate(zip(genome,chip_seq_data)),10**5)
        for idx, base_and_val in rows:
            nucleotide, signal = base_and_val
            handle.write("%s,%s,%s\n"%(idx,nucleotide,signal))
def max_in_window(scores,k):
    """Return max in window of radius k over circular array of scores.

    Built from element-wise maxima of ``scores`` rolled by every shift in
    ``-k .. k``, starting from a copy of ``scores`` itself.
    """
    running_max = np.copy(scores)
    for shift in verbose_gen(xrange(-k, k+1)):
        rolled = np.roll(scores, shift)
        running_max = np.maximum(running_max, rolled)
    return running_max
def make_chip_dataset(num_cells):
    """Concatenate ``num_cells`` results of ``chip(genome, rfd_xs(ps), MEAN_FRAGMENT_LENGTH)``.

    ``genome``, ``ps`` and ``MEAN_FRAGMENT_LENGTH`` are read from the
    enclosing (module) scope; ``rfd_xs(ps)`` is re-evaluated for every cell.
    """
    per_cell = []
    for _ in verbose_gen(xrange(num_cells)):
        per_cell.append(chip(genome, rfd_xs(ps), MEAN_FRAGMENT_LENGTH))
    return concat(per_cell)
def initialize_array():
    """Call ``Z(i, Q)`` for every ``i`` in ``range(N)`` and discard the results.

    ``N``, ``Q`` and ``Z`` come from the enclosing scope.  Presumably ``Z``
    caches its results, so this pre-fills that cache -- confirm against the
    definition of ``Z``.
    """
    indices = verbose_gen(xrange(N), modulus=1000)
    for idx in indices:
        Z(idx, Q)
def chip_ps_ref(ps,mean_frag_length,cells=10000):
    """Do a chip seq experiment given the distribution ps.

    For each of ``cells`` iterations, draws ``rfd_xs(ps)`` and passes it to
    ``chip_ps`` together with ``mean_frag_length``; the per-cell results are
    handed to ``concat`` as a lazy generator.
    """
    G = len(ps)  # NOTE(review): computed but never used below
    per_cell_reads = (chip_ps(rfd_xs(ps), mean_frag_length)
                      for _ in verbose_gen(xrange(cells)))
    return concat(per_cell_reads)
def chip_ps_np(ps,mean_frag_length,cells=10000,verbose=False):
    """Do a chip seq experiment given the distribution ps.

    For each of ``cells`` iterations, draws ``rfd_xs_np(ps)`` and passes it
    to ``chip`` with ``len(ps)`` and ``mean_frag_length``; the per-cell
    results are handed to ``concat`` as a lazy generator.  When ``verbose``
    is true the cell loop is wrapped in ``verbose_gen(..., modulus=1000)``.
    """
    w = 10  # only referenced by the disabled adjustment on the next line
    G = len(ps)# + w - 1 #XXX HACK
    cell_ids = xrange(cells)
    if verbose:
        cell_ids = verbose_gen(cell_ids, modulus=1000)
    per_cell_reads = (chip(G, rfd_xs_np(ps), mean_frag_length) for _ in cell_ids)
    return concat(per_cell_reads)
def chip_ps_spec(ps,mean_frag_length,cells=10000):
    """Concatenate ``cells`` runs of ``chip_ps_spec_single_cell(ps, mean_frag_length)``.

    The per-cell results are handed to ``concat`` as a lazy generator.
    """
    single_cell_runs = (chip_ps_spec_single_cell(ps, mean_frag_length)
                        for _ in verbose_gen(xrange(cells)))
    return concat(single_cell_runs)
def kmers(L):
    """Return ``product`` over ``L`` copies of ``bases``, wrapped in ``verbose_gen``.

    ``bases`` is read from the enclosing scope.  Presumably ``product`` is
    ``itertools.product``, making this every length-``L`` tuple over
    ``bases`` -- confirm against the file's imports.
    """
    per_position = [bases] * L
    all_kmers = product(*per_position)
    return verbose_gen(all_kmers, modulus=10000)
def predict_chip_ps2(ps,mean_frag_length,cells=10000):
    """Predict a per-position chip signal using ``tent`` weights and ``alo2``.

    For each position ``x`` every ``ps[i]`` is weighted by
    ``tent(x, i, mean_frag_length)``; the weighted list is reduced with
    ``alo2`` and scaled by ``cells``.  Returns a list of length ``len(ps)``.
    """
    G = len(ps)
    # empirically determined; worrisome
    # NOTE(review): eff_lamb is computed but not used below.
    eff_lamb = 1.0/(mean_frag_length-0.5)
    prediction = []
    for x in verbose_gen(range(G)):
        weighted = [p*tent(x, i, mean_frag_length) for i, p in enumerate(ps)]
        prediction.append(cells*alo2(weighted))
    return prediction
def show_chip_shadow(G,endpoints,mean_frag_length,cells=10000,trials=10):
    """Plot ``trials`` simulated read maps for fixed ``endpoints`` in blue.

    Each trial concatenates ``cells`` calls of
    ``chip(G, endpoints, mean_frag_length)``, converts the result with
    ``map_reads(..., G)`` and adds it to the current matplotlib figure.
    Nothing is returned and ``plt.show()`` is not called here.
    """
    lamb = 1.0/mean_frag_length  # NOTE(review): not used below
    for _ in verbose_gen(range(trials)):
        reads = concat([chip(G, endpoints, mean_frag_length)
                        for _cell in range(cells)])
        plt.plot(map_reads(reads, G), color='b')