def main(G=5000000,iterations=50000,init_matrix=None,init_mu=None,verbose=True): """Test case for FD-inference""" print "generating genome" genome = random_site(G) print "generating eps" eps = score_genome_np(TRUE_ENERGY_MATRIX,genome) min_mu,max_mu = -40,0 mu = bisect_interval(lambda mu:np.sum(fd_solve_np(eps,mu))-q,min_mu,max_mu,verbose=True,tolerance=1e-1) print "computing ps" true_ps = fd_solve_np(eps,mu) print "true q:",np.sum(true_ps) print "generating chip dataset" mapped_reads = np.array(map_reads_np(chip_ps_np(true_ps,MEAN_FRAGMENT_LENGTH,NUM_CELLS_ORIGINAL),G)) print "finished chip dataset" if init_matrix is None: init_matrix = random_energy_matrix(w) if init_mu is None: init_mu = -20#random.random()*40 - 20 init_scores = score_genome_np(init_matrix,genome) init_state = ((init_matrix,init_mu),init_scores) logf = lambda state:complete_log_likelihood(state,mapped_reads) print "true mu:",mu print "true log_likelihood:",logf(((TRUE_ENERGY_MATRIX,mu),eps)) rprop = lambda state:complete_rprop(state,genome) print "hitting mh loop" matrix_chain = mh(logf,proposal=rprop,x0=init_state,dprop=log_dprop,capture_state=capture_state,verbose=verbose,use_log=True,iterations=iterations,modulus=100) return matrix_chain,genome,mapped_reads
def compare(good_mat,good_mu,true_mat,true_mu,genome):
    """Visually compare a recovered energy model (good_mat, good_mu) against
    the ground truth (true_mat, true_mu) over the given genome: occupancy
    traces, an occupancy scatter plot, and heatmaps of both matrices."""
    true_eps = score_genome_np(true_mat, genome)
    good_eps = score_genome_np(good_mat, genome)
    true_ps = fd_solve_np(true_eps, true_mu)
    good_ps = fd_solve_np(good_eps, good_mu)

    def zero_max(mat):
        """Shift each row of mat so its maximum element is zero (the rows
        become columns in the transposed heatmap below); display only."""
        return [[cell - max(row) for cell in row] for row in mat]

    figure = plt.figure()
    # (1) occupancy profiles along the genome, true and recovered overlaid
    trace_ax = figure.add_subplot(221)
    trace_ax.plot(true_ps)
    trace_ax.plot(good_ps)
    # (2) recovered vs true occupancy, with the y = x diagonal for reference
    scatter_ax = figure.add_subplot(222)
    scatter_ax.scatter(true_ps, good_ps)
    scatter_ax.plot([0, 1], [0, 1])
    # (3) ground-truth matrix heatmap
    true_ax = figure.add_subplot(223)
    figure.colorbar(true_ax.imshow(transpose(true_mat), interpolation='none'))
    # (4) recovered matrix heatmap, shifted per row for comparability
    good_ax = figure.add_subplot(224)
    figure.colorbar(good_ax.imshow(transpose(zero_max(good_mat)), interpolation='none'))
    plt.show()
def infer_synthetic_energy_model(num_reads=100000): """the whole show: infer the energy model from true reads""" G = len(genome) w = 10 true_matrix = [[-2, 0, 0, 0] for _ in range(w)] true_mu = -20 true_eps = score_genome_np(true_matrix, genome) true_ps = fd_solve_np(true_eps, true_mu) MFL = 250 #mean frag length = 250bp lamb = 1/250.0 true_reads = reads_from_ps(true_ps, MFL, min_seq_len=75, num_reads=num_reads) true_rdm = density_from_reads(true_reads, G) init_matrix = random_energy_matrix(w) init_mu = -20 init_scores = score_genome_np(init_matrix, genome) init_state = ((init_matrix, init_mu), init_scores) logf = lambda state:timestamp(complete_log_likelihood(state, true_rdm, lamb, num_reads=num_reads)) rprop = lambda state:complete_rprop(state, genome) verbose = True iterations = 50000 print "true_ll:", logf(((true_matrix, true_mu), true_eps)) matrix_chain = mh(logf, proposal=rprop, x0=init_state, dprop=log_dprop, capture_state=capture_state, verbose=verbose, use_log=True, iterations=iterations, modulus=100) return matrix_chain
def complete_log_likelihood(state, true_rdm, lamb, num_reads=100000): """Compute log likelihood of true_rdm given energy model (state). (1) Simulate reads from energy model. (2) compare simulated reads to true reads with read_log_likelihood.""" print "num_reads:", num_reads, "%e" % num_reads (matrix, mu), all_eps = state ps = fd_solve_np(all_eps, mu) print "copy number:", np.sum(ps) G = len(ps) MFL = 1/lamb print "generating reads" proposed_reads = reads_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads) print "mapping reads" proposed_rdm = density_from_reads(proposed_reads, G) #proposed_rdm = density_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads) return rdm_log_likelihood(true_rdm, proposed_rdm)
def gradient_descent_experiment(true_rdm=None, num_reads=100000): #genome = get_ecoli_genome(at_lab=False) G = len(genome) w = 10 mfl = 250 lamb = 1.0/mfl simulating_data = False if true_rdm is None: simulating_data = True true_matrix = [[-2, 0, 0, 0] for i in range(w)] true_mu = -20 true_eps = score_genome_np(true_matrix, genome) true_ps = fd_solve_np(true_eps, true_mu) true_reads = reads_from_ps(true_ps, mfl, min_seq_len=75, num_reads=num_reads) true_rdm = density_from_reads(true_reads, G) true_state = ((true_matrix, true_mu), true_eps) true_ll = logf(true_state) if simulating_data else None matrix = random_energy_matrix(w) mu = -20 eps = score_genome_np(matrix, genome) init_state = ((matrix, mu), eps) logf = lambda state:timestamp(complete_log_likelihood(state, true_rdm, lamb, num_reads=num_reads)) dw = 0.1 dmu = 0.1 old_ll = 0 print "true_ll:", true_ll cur_ll = logf(init_state) eta = 10**-7 # learning rate iterations = 0 while cur_ll > old_ll or iterations == 0: old_ll = cur_ll dmat = [[0]*4 for i in range(w)] for i in range(w): for j in range(4): print "i, j:", i, j new_mat = [row[:] for row in matrix] new_mat[i][j] += dw fwd_eps, rev_eps = eps new_eps = update_scores_np(fwd_eps, rev_eps, i, j, dw, w, genome) new_state = ((new_mat, mu), new_eps) new_ll = logf(new_state) print "cur ll, new_ll:", cur_ll, new_ll, "(improvement)" if new_ll > cur_ll else "(worsening)" delta_w = (new_ll - cur_ll)/dw * eta print "delta_w:", delta_w dmat[i][j] = delta_w new_mu = mu + dmu new_state = ((matrix, new_mu), eps) new_ll = logf(new_state) print "mu:" print "cur ll, new_ll:", cur_ll, new_ll, "(improvement)" if new_ll > cur_ll else "(worsening)" delta_mu = (new_ll - cur_ll)/dmu * eta print "delta_mu:", delta_mu old_matrix = [row[:] for row in matrix] for i in range(w): for j in range(4): matrix[i][j] += dmat[i][j] old_eps = np.array(eps) eps = score_genome_np(matrix, genome) old_mu = mu mu += delta_mu cur_state = ((matrix, mu), eps) cur_ll = logf(cur_state) print "\nresults of 
iteration %s:" % iterations pprint(matrix) print mu print "likelihood:", old_ll, "->", cur_ll iterations += 1 return ((old_matrix, old_mu), old_eps)
    # NOTE(review): orphaned fragment — this appears to be the tail of a second
    # copy of complete_log_likelihood (cf. the full definition elsewhere in
    # this file); its `def` line is not visible in this chunk, so the code is
    # left untouched.
    print "copy number:", np.sum(ps)
    G = len(ps)
    MFL = 1/lamb
    print "generating reads"
    proposed_reads = reads_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads)
    print "mapping reads"
    proposed_rdm = density_from_reads(proposed_reads, G)
    #proposed_rdm = density_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads)
    return rdm_log_likelihood(true_rdm, proposed_rdm)

def rdm_from_state((matrix, mu), num_reads=100000, eps=None, mfl=250):
    """Simulate a read-density map from an energy model (matrix, mu).

    num_reads -- number of reads to simulate
    eps       -- precomputed genome-wide site scores; computed from matrix and
                 the module-level genome when None
    mfl       -- mean fragment length (bp)

    Returns the simulated read-density map.
    """
    if eps is None:
        print "scoring genome"
        eps = score_genome_np(matrix, genome)
    print "solving ps"
    ps = fd_solve_np(eps, mu)
    print "generating reads"
    reads = reads_from_ps(ps, mfl, min_seq_len=75, num_reads=num_reads)
    print "mapping reads"
    # NOTE(review): maps reads against the module-level G rather than len(ps)
    # — confirm the two always agree.
    rdm = density_from_reads(reads, G)
    return rdm

def plot_state(state, num_reads=100000):
    # Accepts either ((matrix, mu), eps) or a bare (matrix, mu) pair.
    a, b = state
    if type(a) is tuple:
        # if state consists of ((matrix, mu), eps)
        matrix, mu = a
        eps = b
        rdm = rdm_from_state((matrix, mu), num_reads, eps)
    else:
        matrix, mu = a, b
        eps = score_genome_np(matrix, genome)
    # NOTE(review): definition appears truncated here in the visible chunk —
    # the else branch computes eps but nothing is plotted; remainder not
    # visible, code left untouched.
def log_dprop(((matp,mup),epsp),((mat,mu),eps)):
    # Log proposal density for the MH move from ((mat, mu), eps) to
    # ((matp, mup), epsp).  A proposal perturbs either the matrix or mu; the
    # 1/2.0 factors below correspond to choosing one of the two move types.
    dmat = sum([xp - x for (rowp,row) in zip(matp,mat) for (xp,x) in zip(rowp,row)])
    dmu = mup - mu
    if dmat != 0:
        # Matrix was perturbed: Gaussian density of the summed matrix change.
        return log(1/2.0 * dnorm(dmat,0,MAT_SIGMA))
    else:
        # mu was perturbed.
        # NOTE(review): uses MAT_SIGMA rather than MU_SIGMA here (the
        # commented-out variant below used MU_SIGMA) — confirm intended.
        return log(1/2.0 * dnorm(dmu,0,MAT_SIGMA))
    #return log(dnorm(dmat,0,MAT_SIGMA)) + log(dnorm(dmu,0,MU_SIGMA))

def capture_state((mat_and_mu,site_scores)):
    # Reduce a full sampler state to just (matrix, mu), dropping the
    # genome-wide site scores before the state is stored in the chain.
    return mat_and_mu

def complete_log_likelihood(((matrix,mu),eps),mapped_reads,num_cells=NUM_CELLS_RECOVERED):
    """Compute log likelihood of matrix, given chip seq data"""
    # NOTE(review): a complete_log_likelihood with a different signature
    # appears earlier in this file; whichever definition runs last shadows the
    # other — confirm which one callers intend.
    print "entering complete log likelihood"
    # Pad occupancies with w-1 zeros — presumably so fragments overlapping the
    # genome end map in-bounds; confirm against map_reads_np.
    ps = np.append(fd_solve_np(eps,mu),[0]*(w-1))
    G = len(ps)
    #print "G=",G
    # if random.random() < 1:#0.01:
    # pprint(matrix)
    print "mean copy number:",np.sum(ps),"mu:",mu
    #print "predicting mapped_reads"
    #predicted_coverage_probability = predict_chip_ps4(ps,MEAN_FRAGMENT_LENGTH,1)
    # XXX HACK
    proposed_reads = map_reads_np(chip_ps_np(ps,MEAN_FRAGMENT_LENGTH,num_cells),G)
    #print "predicted mapped_reads"
    # add laplacian pseudocount: one observation of hit and miss each
    predicted_coverage_probability = (np.array(proposed_reads,dtype=float)+1)/(num_cells+2)
    #print "computing likelihood"
    #print "pearson correlation between true, recovered datasets:",pearsonr(proposed_reads,mapped_reads)
    ans = chip_seq_log_likelihood(predicted_coverage_probability,mapped_reads,NUM_CELLS_ORIGINAL)
    if True:#random.random() < 0.01:
    # NOTE(review): definition truncated at this point in the visible chunk —
    # the body of this `if` is not visible; code left untouched.