def infer_synthetic_energy_model(num_reads=100000): """the whole show: infer the energy model from true reads""" G = len(genome) w = 10 true_matrix = [[-2, 0, 0, 0] for _ in range(w)] true_mu = -20 true_eps = score_genome_np(true_matrix, genome) true_ps = fd_solve_np(true_eps, true_mu) MFL = 250 #mean frag length = 250bp lamb = 1/250.0 true_reads = reads_from_ps(true_ps, MFL, min_seq_len=75, num_reads=num_reads) true_rdm = density_from_reads(true_reads, G) init_matrix = random_energy_matrix(w) init_mu = -20 init_scores = score_genome_np(init_matrix, genome) init_state = ((init_matrix, init_mu), init_scores) logf = lambda state:timestamp(complete_log_likelihood(state, true_rdm, lamb, num_reads=num_reads)) rprop = lambda state:complete_rprop(state, genome) verbose = True iterations = 50000 print "true_ll:", logf(((true_matrix, true_mu), true_eps)) matrix_chain = mh(logf, proposal=rprop, x0=init_state, dprop=log_dprop, capture_state=capture_state, verbose=verbose, use_log=True, iterations=iterations, modulus=100) return matrix_chain
def main(G=5000000,iterations=50000,init_matrix=None,init_mu=None,verbose=True): """Test case for FD-inference""" print "generating genome" genome = random_site(G) print "generating eps" eps = score_genome_np(TRUE_ENERGY_MATRIX,genome) min_mu,max_mu = -40,0 mu = bisect_interval(lambda mu:np.sum(fd_solve_np(eps,mu))-q,min_mu,max_mu,verbose=True,tolerance=1e-1) print "computing ps" true_ps = fd_solve_np(eps,mu) print "true q:",np.sum(true_ps) print "generating chip dataset" mapped_reads = np.array(map_reads_np(chip_ps_np(true_ps,MEAN_FRAGMENT_LENGTH,NUM_CELLS_ORIGINAL),G)) print "finished chip dataset" if init_matrix is None: init_matrix = random_energy_matrix(w) if init_mu is None: init_mu = -20#random.random()*40 - 20 init_scores = score_genome_np(init_matrix,genome) init_state = ((init_matrix,init_mu),init_scores) logf = lambda state:complete_log_likelihood(state,mapped_reads) print "true mu:",mu print "true log_likelihood:",logf(((TRUE_ENERGY_MATRIX,mu),eps)) rprop = lambda state:complete_rprop(state,genome) print "hitting mh loop" matrix_chain = mh(logf,proposal=rprop,x0=init_state,dprop=log_dprop,capture_state=capture_state,verbose=verbose,use_log=True,iterations=iterations,modulus=100) return matrix_chain,genome,mapped_reads
def infer_arca_energy_model(num_reads=1000000):
    """Infer an energy model from reads returned by ``get_arca_reads``.

    Fetches ``num_reads`` reads, converts them to a density over the
    module-level ``genome``, and runs the ``mh`` sampler from a random
    width-10 energy matrix with mu starting at -20.

    Relies on the module-level names ``genome``, ``log_dprop`` and
    ``capture_state``.

    Returns whatever ``mh`` returns for the run.
    """
    arca_reads = get_arca_reads(num_reads)
    genome_length = len(genome)
    lamb = 1 / 250.0
    read_density = density_from_reads(arca_reads, genome_length)

    # Starting point of the chain.
    width = 10
    start_matrix = random_energy_matrix(width)
    start_mu = -20
    start_scores = score_genome_np(start_matrix, genome)
    start_state = ((start_matrix, start_mu), start_scores)

    def logf(state):
        return timestamp(
            complete_log_likelihood(state, read_density, lamb, num_reads))

    def rprop(state):
        return complete_rprop(state, genome)

    be_verbose = True
    n_iterations = 50000
    return mh(
        logf,
        proposal=rprop,
        x0=start_state,
        dprop=log_dprop,
        capture_state=capture_state,
        verbose=be_verbose,
        use_log=True,
        iterations=n_iterations,
        modulus=100)
def gradient_descent_experiment(true_rdm=None, num_reads=100000): #genome = get_ecoli_genome(at_lab=False) G = len(genome) w = 10 mfl = 250 lamb = 1.0/mfl simulating_data = False if true_rdm is None: simulating_data = True true_matrix = [[-2, 0, 0, 0] for i in range(w)] true_mu = -20 true_eps = score_genome_np(true_matrix, genome) true_ps = fd_solve_np(true_eps, true_mu) true_reads = reads_from_ps(true_ps, mfl, min_seq_len=75, num_reads=num_reads) true_rdm = density_from_reads(true_reads, G) true_state = ((true_matrix, true_mu), true_eps) true_ll = logf(true_state) if simulating_data else None matrix = random_energy_matrix(w) mu = -20 eps = score_genome_np(matrix, genome) init_state = ((matrix, mu), eps) logf = lambda state:timestamp(complete_log_likelihood(state, true_rdm, lamb, num_reads=num_reads)) dw = 0.1 dmu = 0.1 old_ll = 0 print "true_ll:", true_ll cur_ll = logf(init_state) eta = 10**-7 # learning rate iterations = 0 while cur_ll > old_ll or iterations == 0: old_ll = cur_ll dmat = [[0]*4 for i in range(w)] for i in range(w): for j in range(4): print "i, j:", i, j new_mat = [row[:] for row in matrix] new_mat[i][j] += dw fwd_eps, rev_eps = eps new_eps = update_scores_np(fwd_eps, rev_eps, i, j, dw, w, genome) new_state = ((new_mat, mu), new_eps) new_ll = logf(new_state) print "cur ll, new_ll:", cur_ll, new_ll, "(improvement)" if new_ll > cur_ll else "(worsening)" delta_w = (new_ll - cur_ll)/dw * eta print "delta_w:", delta_w dmat[i][j] = delta_w new_mu = mu + dmu new_state = ((matrix, new_mu), eps) new_ll = logf(new_state) print "mu:" print "cur ll, new_ll:", cur_ll, new_ll, "(improvement)" if new_ll > cur_ll else "(worsening)" delta_mu = (new_ll - cur_ll)/dmu * eta print "delta_mu:", delta_mu old_matrix = [row[:] for row in matrix] for i in range(w): for j in range(4): matrix[i][j] += dmat[i][j] old_eps = np.array(eps) eps = score_genome_np(matrix, genome) old_mu = mu mu += delta_mu cur_state = ((matrix, mu), eps) cur_ll = logf(cur_state) print "\nresults of 
iteration %s:" % iterations pprint(matrix) print mu print "likelihood:", old_ll, "->", cur_ll iterations += 1 return ((old_matrix, old_mu), old_eps)