Example #1
0
def infer_synthetic_energy_model(num_reads=100000):
    """the whole show: infer the energy model from true reads"""
    G = len(genome)
    w = 10
    true_matrix = [[-2, 0, 0, 0] for _ in range(w)]
    true_mu = -20
    true_eps = score_genome_np(true_matrix, genome)
    true_ps = fd_solve_np(true_eps, true_mu)
    MFL = 250 #mean frag length = 250bp
    lamb = 1/250.0
    true_reads = reads_from_ps(true_ps, MFL, min_seq_len=75, num_reads=num_reads)
    true_rdm = density_from_reads(true_reads, G)
    init_matrix = random_energy_matrix(w)
    init_mu = -20
    init_scores = score_genome_np(init_matrix, genome)
    init_state = ((init_matrix, init_mu), init_scores)
    logf = lambda state:timestamp(complete_log_likelihood(state, true_rdm, lamb, num_reads=num_reads))
    rprop = lambda state:complete_rprop(state, genome)
    verbose = True
    iterations = 50000
    print "true_ll:", logf(((true_matrix, true_mu), true_eps))
    matrix_chain = mh(logf, proposal=rprop, x0=init_state, dprop=log_dprop, 
                      capture_state=capture_state, verbose=verbose, 
                      use_log=True, iterations=iterations, modulus=100)
    return matrix_chain
Example #2
0
def cumsum_test():
    arca_reads = get_arca_reads(1000000)
    true_rdm = density_from_reads(arca_reads, G)
    pssm = make_pssm(Escherichia_coli.ArcA)
    comb_rdm = true_rdm[0] + true_rdm[1]
    print "fwd_scores"
    fwd_scores = score_genome_np(pssm, genome)
    print "rev_scores"
    rev_scores = score_genome_np(pssm, wc(genome))
    scores = np.log(np.exp(fwd_scores) + np.exp(rev_scores))
    probs = np.exp(scores)/np.sum(np.exp(scores))
    print "sorting scores"
    score_js = sorted_indices(scores)[::-1] # order scores from greatest to least
    print "sorting probs"
    prob_js = sorted_indices(probs)[::-1] # ditto
    plt.plot(cumsum(rslice(comb_rdm, score_js)), label="scores")
    plt.plot(cumsum(rslice(comb_rdm, prob_js)), label="boltzmann probs")
    comb_rdm_copy = list(comb_rdm)
    controls = 5
    for i in range(controls):
        print i
        random.shuffle(comb_rdm_copy)
        plt.plot(cumsum(comb_rdm_copy), color='r')
    plt.legend(loc=0)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.show()
Example #3
0
def gc_bias_test():
    """Scatter-plot combined read density against a fixed-row PSSM score.

    The PSSM repeats the row [0, 1, 1, 0] ten times (presumably scoring
    G/C content per window -- confirm the column order used by
    score_genome_np).
    """
    reads = get_arca_reads(1000000)
    strand_rdm = density_from_reads(reads, G)
    total_rdm = strand_rdm[0] + strand_rdm[1]
    gc_row = [0, 1, 1, 0]
    window_scores = score_genome_np([gc_row] * 10, genome)
    plt.scatter(window_scores, total_rdm, marker='.')
Example #4
0
def regress_on_read_density():
    import statsmodels.api as sm
    """Try to regress read density right off of sequence data"""
    w = 10
    arca_reads = get_arca_reads()
    true_rdm = density_from_reads(arca_reads, G)
    y = true_rdm[0] + true_rdm[1]
    X = dummify_genome(genome, w)
    #sm.add_constant(X)
    model = sm.OLS(y, X)
    results = model.fit()
    print results.summary()
    return results
Example #5
0
def arca_motif_comparison():
    """Overlay ArcA read density, known site locations and motif scores.

    Plots, in order: the two strands of the read density map, a 0/1 track
    marking every position where a known ArcA site (or its wc() counterpart)
    matches the genome, and the combined log motif score.
    """
    reads = get_arca_reads()
    strand_rdm = density_from_reads(reads, G)
    motif_pssm = make_pssm(Escherichia_coli.ArcA)
    for strand in (0, 1):
        plt.plot(strand_rdm[strand])

    forward, reverse = score_genome_np(motif_pssm, genome)
    scores = np.log(np.exp(forward) + np.exp(reverse))

    # Each known site is searched for together with wc(site).
    sites = concat([(site, wc(site)) for site in Escherichia_coli.ArcA])
    hit_track = np.zeros(G)
    for motif in sites:
        # The site string is used directly as a regular expression.
        for match in re.finditer(motif, genome):
            hit_track[match.start(0)] = 1

    plt.plot(hit_track)
    plt.plot(scores)
Example #6
0
def complete_log_likelihood(state, true_rdm, lamb, num_reads=100000):
    """Compute log likelihood of true_rdm given energy model (state).
    
    (1) Simulate reads from energy model.
    (2) compare simulated reads to true reads with read_log_likelihood."""
    print "num_reads:", num_reads, "%e" % num_reads
    (matrix, mu), all_eps = state
    ps = fd_solve_np(all_eps, mu)
    print "copy number:", np.sum(ps)
    G = len(ps)
    MFL = 1/lamb
    print "generating reads"
    proposed_reads = reads_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads)
    print "mapping reads"
    proposed_rdm = density_from_reads(proposed_reads, G)
    #proposed_rdm = density_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads)
    return rdm_log_likelihood(true_rdm, proposed_rdm)
Example #7
0
def infer_arca_energy_model(num_reads=1000000):
    """Infer an energy model from the ArcA reads by running ``mh``.

    The ArcA reads are turned into a read density map, and ``mh`` is started
    from a random 10-row matrix with mu = -20, using
    ``complete_log_likelihood`` against that map as the target.

    num_reads -- number of ArcA reads requested, also passed on to every
                 likelihood evaluation.

    Returns whatever ``mh`` returns.
    """
    observed_reads = get_arca_reads(num_reads)
    genome_len = len(genome)
    frag_rate = 1/250.0  # handed to complete_log_likelihood as lamb
    observed_rdm = density_from_reads(observed_reads, genome_len)

    # Starting state: ((matrix, mu), scores of that matrix on the genome).
    width = 10
    start_matrix = random_energy_matrix(width)
    start_mu = -20
    start_state = ((start_matrix, start_mu),
                   score_genome_np(start_matrix, genome))

    def logf(state):
        # Target function for the sampler, wrapped in timestamp().
        return timestamp(complete_log_likelihood(state, observed_rdm,
                                                 frag_rate, num_reads))

    def rprop(state):
        # Proposal: delegate to complete_rprop on the shared genome.
        return complete_rprop(state, genome)

    return mh(logf, proposal=rprop, x0=start_state, dprop=log_dprop,
              capture_state=capture_state, verbose=True,
              use_log=True, iterations=50000, modulus=100)
Example #8
0
def power_law_exploration():
    """Are the read densities for a given bin power-law distributed?"""
    print "getting arca reads"
    arca_reads = get_arca_reads(1000000)
    print "computing read density map"
    true_rdm = density_from_reads(arca_reads, G)
    comb_rdm = true_rdm[0] + true_rdm[1]
    pssm = make_pssm(Escherichia_coli.ArcA)
    print "scoring"
    fwd_scores, rev_scores = score_genome_np(pssm, genome)
    scores = np.log(np.exp(fwd_scores) + np.exp(rev_scores))
    d = defaultdict(list)
    print "tabulating"
    for i in xrange(G):
        score = int(scores[i])
        d[score].append(comb_rdm[i])
    print "plotting"
    for key in sorted(d.keys()):
        counts = Counter(d[key])
        Z = float(sum(counts.values()))
        plt.plot(sorted(counts.keys()),
                 [counts[k]/Z for k in sorted(counts.keys())], label=key)
    plt.loglog()
    plt.show()
Example #9
0
def gradient_descent_experiment(true_rdm=None, num_reads=100000):
    #genome = get_ecoli_genome(at_lab=False)
    G = len(genome)
    w = 10
    mfl = 250
    lamb = 1.0/mfl
    simulating_data = False
    if true_rdm is None:
        simulating_data = True
        true_matrix = [[-2, 0, 0, 0] for i in range(w)]
        true_mu = -20
        true_eps = score_genome_np(true_matrix, genome)
        true_ps = fd_solve_np(true_eps, true_mu)
        true_reads = reads_from_ps(true_ps, mfl, min_seq_len=75, num_reads=num_reads)
        true_rdm = density_from_reads(true_reads, G)
        true_state = ((true_matrix, true_mu), true_eps)
    true_ll = logf(true_state) if simulating_data else None
    matrix = random_energy_matrix(w)
    mu = -20
    eps = score_genome_np(matrix, genome)
    init_state = ((matrix, mu), eps)
    logf = lambda state:timestamp(complete_log_likelihood(state, true_rdm, lamb, num_reads=num_reads))
    dw = 0.1
    dmu = 0.1
    old_ll = 0
    print "true_ll:", true_ll
    cur_ll = logf(init_state)
    eta = 10**-7 # learning rate
    iterations = 0
    while cur_ll > old_ll or iterations == 0:
        old_ll = cur_ll
        dmat = [[0]*4 for i in range(w)]
        for i in range(w):
            for j in range(4):
                print "i, j:", i, j
                new_mat = [row[:] for row in matrix]
                new_mat[i][j] += dw
                fwd_eps, rev_eps = eps
                new_eps = update_scores_np(fwd_eps, rev_eps, i, j, dw, w, genome)
                new_state = ((new_mat, mu), new_eps)
                new_ll = logf(new_state)
                print "cur ll, new_ll:",  cur_ll, new_ll, "(improvement)" if new_ll > cur_ll else "(worsening)"
                delta_w = (new_ll - cur_ll)/dw * eta
                print "delta_w:", delta_w
                dmat[i][j] = delta_w
        new_mu = mu + dmu
        new_state = ((matrix, new_mu), eps)
        new_ll = logf(new_state)
        print "mu:"
        print "cur ll, new_ll:",  cur_ll, new_ll, "(improvement)" if new_ll > cur_ll else "(worsening)"
        delta_mu = (new_ll - cur_ll)/dmu * eta
        print "delta_mu:", delta_mu
        old_matrix = [row[:] for row in matrix]
        for i in range(w):
            for j in range(4):
                matrix[i][j] += dmat[i][j]
        old_eps = np.array(eps)
        eps = score_genome_np(matrix, genome)
        old_mu = mu
        mu += delta_mu
        cur_state = ((matrix, mu), eps)
        cur_ll = logf(cur_state)
        print "\nresults of iteration %s:" % iterations
        pprint(matrix)
        print mu
        print "likelihood:", old_ll, "->", cur_ll
        iterations += 1
    return ((old_matrix, old_mu), old_eps)
Example #10
0
    proposed_reads = reads_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads)
    print "mapping reads"
    proposed_rdm = density_from_reads(proposed_reads, G)
    #proposed_rdm = density_from_ps(ps, MFL, min_seq_len=75, num_reads=num_reads)
    return rdm_log_likelihood(true_rdm, proposed_rdm)

def rdm_from_state((matrix, mu), num_reads=100000, eps=None, mfl=250):
    if eps is None:
        print "scoring genome"
        eps = score_genome_np(matrix, genome)
    print "solving ps"
    ps = fd_solve_np(eps, mu)
    print "generating reads"
    reads = reads_from_ps(ps, mfl, min_seq_len=75, num_reads=num_reads)
    print "mapping reads"
    rdm = density_from_reads(reads, G)
    return rdm

def plot_state(state, num_reads=100000):
    """Plot both strands of the read density map generated from ``state``.

    ``state`` is either ((matrix, mu), eps) or a bare (matrix, mu) pair; in
    the latter case the genome scores are computed here before generating
    the density map.
    """
    head, tail = state
    if type(head) is tuple:
        # Full state: parameters plus precomputed scores.
        (matrix, mu), eps = head, tail
    else:
        # Bare parameters: score the genome ourselves.
        matrix, mu = head, tail
        eps = score_genome_np(matrix, genome)
    rdm = rdm_from_state((matrix, mu), num_reads, eps)
    for strand in (0, 1):
        plt.plot(rdm[strand])
        
Example #11
0
def frag_density(frags, G):
    """Return the first (forward) map of density_from_reads for ``frags``.

    The fragments are first passed through frag_sabot; the second map
    returned by density_from_reads is discarded.
    """
    prepared = frag_sabot(frags)
    forward_map, _reverse_map = density_from_reads(prepared, G=G)
    return forward_map