Exemple #1
0
def main(G=5000000,iterations=50000,init_matrix=None,init_mu=None,verbose=True):
    """End-to-end test case for FD-inference: generate a random genome,
    simulate a ChIP dataset from the true energy matrix, then run
    Metropolis-Hastings to recover the matrix.

    G           -- genome length in bp
    iterations  -- number of MH iterations
    init_matrix -- starting energy matrix (random w x 4 matrix if None)
    init_mu     -- starting chemical potential (-20 if None)
    verbose     -- passed through to the MH sampler

    Returns (matrix_chain, genome, mapped_reads).

    NOTE(review): depends on module-level globals q, w, TRUE_ENERGY_MATRIX,
    MEAN_FRAGMENT_LENGTH, NUM_CELLS_ORIGINAL -- confirm they are defined.
    """
    print "generating genome"
    genome = random_site(G)
    print "generating eps"
    # Per-site binding energies under the true energy matrix.
    eps = score_genome_np(TRUE_ENERGY_MATRIX,genome)
    min_mu,max_mu = -40,0
    # Bisect for the chemical potential mu whose total occupancy equals q.
    # NOTE(review): verbose=True is hard-coded here rather than forwarding
    # the `verbose` parameter -- confirm this is intentional.
    mu = bisect_interval(lambda mu:np.sum(fd_solve_np(eps,mu))-q,min_mu,max_mu,verbose=True,tolerance=1e-1)
    print "computing ps"
    true_ps = fd_solve_np(eps,mu)
    print "true q:",np.sum(true_ps)
    print "generating chip dataset"
    # Simulate fragments from the true occupancies and map them back.
    mapped_reads = np.array(map_reads_np(chip_ps_np(true_ps,MEAN_FRAGMENT_LENGTH,NUM_CELLS_ORIGINAL),G))
    print "finished chip dataset"
    if init_matrix is None:
        init_matrix = random_energy_matrix(w)
    if init_mu is None:
        init_mu = -20#random.random()*40 - 20
    init_scores = score_genome_np(init_matrix,genome)
    # MH state is ((matrix, mu), per-site scores).
    init_state = ((init_matrix,init_mu),init_scores)
    logf = lambda state:complete_log_likelihood(state,mapped_reads)
    print "true mu:",mu
    print "true log_likelihood:",logf(((TRUE_ENERGY_MATRIX,mu),eps))
    rprop = lambda state:complete_rprop(state,genome)
    print "hitting mh loop"
    matrix_chain = mh(logf,proposal=rprop,x0=init_state,dprop=log_dprop,capture_state=capture_state,verbose=verbose,use_log=True,iterations=iterations,modulus=100)
    return matrix_chain,genome,mapped_reads
Exemple #2
0
def compare(good_mat,good_mu,true_mat,true_mu,genome):
    """Plot a 2x2 diagnostic comparing an inferred (good) energy model
    against the true one: occupancy traces, a true-vs-inferred occupancy
    scatter, and heatmaps of both energy matrices."""
    true_eps = score_genome_np(true_mat,genome)
    good_eps = score_genome_np(good_mat,genome)
    true_ps = fd_solve_np(true_eps,true_mu)
    good_ps = fd_solve_np(good_eps,good_mu)
    fig = plt.figure()
    # (1) occupancy along the genome under both models
    ax1 = fig.add_subplot(221)
    ax1.plot(true_ps)
    ax1.plot(good_ps)
    #axarr[0].set_title('Sharing X axis')
    # (2) per-site scatter of true vs inferred occupancy, with y=x reference
    ax2 = fig.add_subplot(222)
    ax2.scatter(true_ps, good_ps)
    ax2.plot([0,1],[0,1])
    # ax2.xlim(0,1)
    # ax2.ylim(0,1)
    def offset(mat):
            """Shift each row so its maximum element is zero (rows become
            columns after the transpose used for display)."""
            return [[r-max(row) for r in row] for row in mat]
    # (3) heatmap of the true matrix (displayed transposed)
    ax3 = fig.add_subplot(223)
    cax3 = ax3.imshow(transpose(true_mat),interpolation='none')
    fig.colorbar(cax3)
    # (4) heatmap of the inferred matrix.
    # NOTE(review): good_mat is offset-normalized while true_mat is shown
    # raw -- confirm this asymmetry is intentional.
    ax4 = fig.add_subplot(224)
    cax4 = ax4.imshow(transpose(offset(good_mat)),interpolation='none')
    fig.colorbar(cax4)
    #print "Pearson r:",pearsonr(true_ps,good_ps)
    plt.show()
Exemple #3
0
def infer_synthetic_energy_model(num_reads=100000):
    """the whole show: infer the energy model from true reads"""
    G = len(genome)
    w = 10
    true_matrix = [[-2, 0, 0, 0] for _ in range(w)]
    true_mu = -20
    true_eps = score_genome_np(true_matrix, genome)
    true_ps = fd_solve_np(true_eps, true_mu)
    MFL = 250 #mean frag length = 250bp
    lamb = 1/250.0
    true_reads = reads_from_ps(true_ps, MFL, min_seq_len=75, num_reads=num_reads)
    true_rdm = density_from_reads(true_reads, G)
    init_matrix = random_energy_matrix(w)
    init_mu = -20
    init_scores = score_genome_np(init_matrix, genome)
    init_state = ((init_matrix, init_mu), init_scores)
    logf = lambda state:timestamp(complete_log_likelihood(state, true_rdm, lamb, num_reads=num_reads))
    rprop = lambda state:complete_rprop(state, genome)
    verbose = True
    iterations = 50000
    print "true_ll:", logf(((true_matrix, true_mu), true_eps))
    matrix_chain = mh(logf, proposal=rprop, x0=init_state, dprop=log_dprop, 
                      capture_state=capture_state, verbose=verbose, 
                      use_log=True, iterations=iterations, modulus=100)
    return matrix_chain
Exemple #4
0
def complete_log_likelihood(state, true_rdm, lamb, num_reads=100000):
    """Compute log likelihood of true_rdm given energy model (state).
    
    (1) Simulate reads from energy model.
    (2) compare simulated reads to true reads with read_log_likelihood."""
    print "num_reads:", num_reads, "%e" % num_reads
    (matrix, mu), all_eps = state
    ps = fd_solve_np(all_eps, mu)
    print "copy number:", np.sum(ps)
    G = len(ps)
    MFL = 1/lamb
    print "generating reads"
    proposed_reads = reads_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads)
    print "mapping reads"
    proposed_rdm = density_from_reads(proposed_reads, G)
    #proposed_rdm = density_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads)
    return rdm_log_likelihood(true_rdm, proposed_rdm)
Exemple #5
0
def gradient_descent_experiment(true_rdm=None, num_reads=100000):
    #genome = get_ecoli_genome(at_lab=False)
    G = len(genome)
    w = 10
    mfl = 250
    lamb = 1.0/mfl
    simulating_data = False
    if true_rdm is None:
        simulating_data = True
        true_matrix = [[-2, 0, 0, 0] for i in range(w)]
        true_mu = -20
        true_eps = score_genome_np(true_matrix, genome)
        true_ps = fd_solve_np(true_eps, true_mu)
        true_reads = reads_from_ps(true_ps, mfl, min_seq_len=75, num_reads=num_reads)
        true_rdm = density_from_reads(true_reads, G)
        true_state = ((true_matrix, true_mu), true_eps)
    true_ll = logf(true_state) if simulating_data else None
    matrix = random_energy_matrix(w)
    mu = -20
    eps = score_genome_np(matrix, genome)
    init_state = ((matrix, mu), eps)
    logf = lambda state:timestamp(complete_log_likelihood(state, true_rdm, lamb, num_reads=num_reads))
    dw = 0.1
    dmu = 0.1
    old_ll = 0
    print "true_ll:", true_ll
    cur_ll = logf(init_state)
    eta = 10**-7 # learning rate
    iterations = 0
    while cur_ll > old_ll or iterations == 0:
        old_ll = cur_ll
        dmat = [[0]*4 for i in range(w)]
        for i in range(w):
            for j in range(4):
                print "i, j:", i, j
                new_mat = [row[:] for row in matrix]
                new_mat[i][j] += dw
                fwd_eps, rev_eps = eps
                new_eps = update_scores_np(fwd_eps, rev_eps, i, j, dw, w, genome)
                new_state = ((new_mat, mu), new_eps)
                new_ll = logf(new_state)
                print "cur ll, new_ll:",  cur_ll, new_ll, "(improvement)" if new_ll > cur_ll else "(worsening)"
                delta_w = (new_ll - cur_ll)/dw * eta
                print "delta_w:", delta_w
                dmat[i][j] = delta_w
        new_mu = mu + dmu
        new_state = ((matrix, new_mu), eps)
        new_ll = logf(new_state)
        print "mu:"
        print "cur ll, new_ll:",  cur_ll, new_ll, "(improvement)" if new_ll > cur_ll else "(worsening)"
        delta_mu = (new_ll - cur_ll)/dmu * eta
        print "delta_mu:", delta_mu
        old_matrix = [row[:] for row in matrix]
        for i in range(w):
            for j in range(4):
                matrix[i][j] += dmat[i][j]
        old_eps = np.array(eps)
        eps = score_genome_np(matrix, genome)
        old_mu = mu
        mu += delta_mu
        cur_state = ((matrix, mu), eps)
        cur_ll = logf(cur_state)
        print "\nresults of iteration %s:" % iterations
        pprint(matrix)
        print mu
        print "likelihood:", old_ll, "->", cur_ll
        iterations += 1
    return ((old_matrix, old_mu), old_eps)
Exemple #6
0
    print "copy number:", np.sum(ps)
    G = len(ps)
    MFL = 1/lamb
    print "generating reads"
    proposed_reads = reads_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads)
    print "mapping reads"
    proposed_rdm = density_from_reads(proposed_reads, G)
    #proposed_rdm = density_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads)
    return rdm_log_likelihood(true_rdm, proposed_rdm)

def rdm_from_state((matrix, mu), num_reads=100000, eps=None, mfl=250):
    if eps is None:
        print "scoring genome"
        eps = score_genome_np(matrix, genome)
    print "solving ps"
    ps = fd_solve_np(eps, mu)
    print "generating reads"
    reads = reads_from_ps(ps, mfl, min_seq_len=75, num_reads=num_reads)
    print "mapping reads"
    rdm = density_from_reads(reads, G)
    return rdm

def plot_state(state, num_reads=100000):
    """Plot the read-density map implied by an MCMC state.

    Accepts either ((matrix, mu), eps) -- scores precomputed -- or a bare
    (matrix, mu) pair, in which case the module-level `genome` is rescored.

    NOTE(review): the else branch computes eps and rdm but the function body
    appears truncated in this excerpt -- the actual plotting is not visible.
    """
    a, b = state
    if type(a) is tuple: # if state consists of ((matrix, mu), eps)
        matrix, mu = a
        eps = b
        rdm = rdm_from_state((matrix, mu), num_reads, eps)
    else:
        matrix, mu = a, b
        eps = score_genome_np(matrix, genome)
Exemple #7
0
def log_dprop(((matp,mup),epsp),((mat,mu),eps)):
    """Log proposal density of moving from state ((mat,mu),eps) to
    ((matp,mup),epsp).

    The 1/2 factor reflects a proposal that perturbs either the matrix or
    mu, each with probability 1/2 -- presumably matching complete_rprop;
    TODO confirm.
    """
    # Net change summed over all matrix entries; nonzero iff the matrix moved.
    dmat = sum([xp - x for (rowp,row) in zip(matp,mat) for (xp,x) in zip(rowp,row)])
    dmu = mup - mu
    if dmat != 0:
        return log(1/2.0 * dnorm(dmat,0,MAT_SIGMA))
    else:
        # NOTE(review): the mu move uses MAT_SIGMA here, but the commented-out
        # alternative below uses MU_SIGMA -- confirm which sigma the mu
        # proposal actually uses; a mismatch would bias the MH acceptance.
        return log(1/2.0 * dnorm(dmu,0,MAT_SIGMA))
        #return log(dnorm(dmat,0,MAT_SIGMA)) + log(dnorm(dmu,0,MU_SIGMA))
    
def capture_state((mat_and_mu,site_scores)):
    """Project an MCMC state down to its (matrix, mu) component,
    discarding the cached per-site scores."""
    return mat_and_mu

def complete_log_likelihood(((matrix,mu),eps),mapped_reads,num_cells=NUM_CELLS_RECOVERED):
    """Compute log likelihood of matrix, given chip seq data"""
    print "entering complete log likelihood"
    ps = np.append(fd_solve_np(eps,mu),[0]*(w-1))
    G = len(ps)
    #print "G=",G
    # if random.random() < 1:#0.01:
    #     pprint(matrix)
    print "mean copy number:",np.sum(ps),"mu:",mu
    #print "predicting mapped_reads"
    #predicted_coverage_probability = predict_chip_ps4(ps,MEAN_FRAGMENT_LENGTH,1) # XXX HACK
    proposed_reads = map_reads_np(chip_ps_np(ps,MEAN_FRAGMENT_LENGTH,num_cells),G)
    #print "predicted mapped_reads"
    # add laplacian pseudocount: one observation of hit and miss each
    predicted_coverage_probability = (np.array(proposed_reads,dtype=float)+1)/(num_cells+2)
    #print "computing likelihood"
    #print "pearson correlation between true, recovered datasets:",pearsonr(proposed_reads,mapped_reads)
    ans = chip_seq_log_likelihood(predicted_coverage_probability,mapped_reads,NUM_CELLS_ORIGINAL)
    if True:#random.random() < 0.01: