def estimate_failures(samples,  # samples from noisy labelers
                      n_samples=10000,  # number of samples to run MCMC for
                      burn=None,  # burn-in. Defaults to n_samples // 2
                      thin=10,  # thinning rate. Sample every k samples from markov chain
                      alpha_p=1, beta_p=1,  # beta parameters for true positive rate
                      alpha_e=1, beta_e=10  # beta parameters for noise rates
                      ):
    """Estimate the positive-label rate and per-labeler error rates by MCMC.

    Builds a model with a Beta prior ``p`` on the true label, one latent
    Bernoulli true label per row of ``samples``, and two Beta-distributed
    error rates per column (``e_pos`` when the true label is 1, ``e_neg``
    when it is 0).  ``samples`` is attached as the observed value of a
    Bernoulli node whose probability is derived from those quantities.

    Args:
        samples: 2-D array with a ``.shape`` of ``(S, N)``; rows index the
            labelled items and columns index the noisy labelers.
            Presumably holds 0/1 labels -- confirm against callers.
        n_samples: Number of MCMC iterations.
        burn: Number of initial iterations to discard.  ``None`` means
            ``n_samples // 2``.
        thin: Keep every ``thin``-th sample of the chain.
        alpha_p, beta_p: Parameters of the Beta prior on ``p``.
        alpha_e, beta_e: Parameters of the Beta priors on the error rates.

    Returns:
        Tuple ``(p, e_pos, e_neg)`` of medians over the retained trace:
        ``p`` over the whole trace, ``e_pos`` and ``e_neg`` along axis 0.

    Side effects:
        Writes a summary of ``p``, ``e_pos`` and ``e_neg`` to ``out.csv``
        and samples with ``verbose=3``.
    """
    if burn is None:
        # Floor division keeps the burn-in an integer iteration count even
        # where ``/`` is true division (Python 3 or ``from __future__
        # import division``).
        burn = n_samples // 2

    S, N = samples.shape

    p = Beta('p', alpha=alpha_p, beta=beta_p)  # prior on true label
    l = Bernoulli('l', p=p, size=S)
    e_pos = Beta('e_pos', alpha_e, beta_e, size=N)  # error rate if label = 1
    e_neg = Beta('e_neg', alpha_e, beta_e, size=N)  # error rate if label = 0

    @deterministic(plot=False)
    def noise_rate(l=l, e_pos=e_pos, e_neg=e_neg):
        # probability that a noisy labeler puts a label 1
        return np.outer(l, 1 - e_pos) + np.outer(1 - l, e_neg)

    noisy_label = Bernoulli('noisy_label', p=noise_rate, size=samples.shape,
                            value=samples, observed=True)

    variables = [l, e_pos, e_neg, p, noisy_label, noise_rate]
    model = MCMC(variables, verbose=3)
    model.sample(iter=n_samples, burn=burn, thin=thin)
    model.write_csv('out.csv', ['p', 'e_pos', 'e_neg'])

    # Separate names for the point estimates so the model nodes above are
    # not shadowed by plain numbers/arrays.
    p_median = np.median(model.trace('p')[:])
    e_pos_median = np.median(model.trace('e_pos')[:], 0)
    e_neg_median = np.median(model.trace('e_neg')[:], 0)
    return p_median, e_pos_median, e_neg_median
def Model_twostage_fit_v2(n_TF, n_gene, p_gene_array, p_TF_gene_array, num_iter, num_burn, num_thin, prior_T, prior_T_method, r_TF_gene, a_TF_gene_h1, a_TF_gene_h0, a_gene):
    """
    Build and sample the two-stage TF-gene model.

    Assumptions:
    We allow learning of parameters

    Parameters, as used by the body:
        n_TF, n_gene: sizes of the TF and gene axes of the model arrays.
        p_gene_array: indexed as ``[j]``; observed value of ``p_gene_j``.
        p_TF_gene_array: indexed as ``[i, j]``; observed value of
            ``p_TF_gene_i_j``.
        num_iter, num_burn, num_thin: cast to ``int`` and passed to
            ``sample`` as ``iter``, ``burn`` and ``thin``.
        prior_T: cast to ``float``; probability of every ``T_i_j`` Bernoulli.
        prior_T_method: not referenced in this function.
        r_TF_gene: the string ``'None'`` selects a Uniform(0, 1) variable,
            anything else is cast to ``float``.
        a_TF_gene_h1: the string ``'None'`` selects a Uniform(0, 0.5)
            variable, anything else is cast to ``float``.
        a_TF_gene_h0: the string ``'None'`` selects a Uniform(0.5, 1)
            variable, anything else is cast to ``float``.
        a_gene: always cast to ``float``.

    Returns the ``pymc.MCMC`` object after ``sample`` has been called.
    """
    a_gp = float(a_gene)
    # "Not provided" is signalled by the *string* 'None' for the three
    # parameters below; in that case the value becomes a Uniform variable.
    if a_TF_gene_h0 == 'None':
        a_tg0 = Uniform('a_tg0', lower=0.5, upper=1)
    else:
        a_tg0 = float(a_TF_gene_h0)
    if a_TF_gene_h1 == 'None':
        a_tg1 = Uniform('a_tg1', lower=0, upper=0.5)
    else:
        a_tg1 = float(a_TF_gene_h1)
    p_T = float(prior_T)
    if r_TF_gene == 'None':
        r_tg = Uniform('r_tg', lower=0, upper=1)
    else:
        r_tg = float(r_TF_gene)
    # Object arrays used as containers for the model nodes.
    p_gene = np.zeros(n_gene, dtype=object)  #the observed variables
    T = np.zeros((n_TF, n_gene), dtype=object)  #variables showing TF-gene-pheno relationship
    T_sum = np.zeros(n_gene, dtype=object)
    p_TF_gene = np.zeros((n_TF, n_gene), dtype=object)  #p-value of correlation of gene TF
    for j in range(n_gene):
        for i in range(n_TF):
            T[i, j] = Bernoulli('T_%i_%i' %(i, j), p=p_T)
            #If T[i, j] is truthy: beta likelihood with alpha=a1, beta=1.
            #Otherwise: mixture of the alpha=a1 and alpha=a0 beta likelihoods
            #with weights r and 1 - r (r is the mixture param).
            # Default arguments bind the current i, j nodes at definition time.
            @pymc.stochastic(name='p_TF_gene_%i_%i' %(i, j), dtype=float, observed=True)
            def temp_p_TF_gene(value=p_TF_gene_array[i, j], TF_gene_ind=T[i, j], a0=a_tg0, a1=a_tg1, r=r_tg) :
                if TF_gene_ind:
                    out = pymc.distributions.beta_like(value, alpha=a1, beta=1)
                else:
                    # beta_like results are exponentiated, mixed, then logged again.
                    out = np.log(r * np.exp(pymc.distributions.beta_like(value, alpha=a1, beta=1))
                                 + (1 - r) * np.exp(pymc.distributions.beta_like(value, alpha=a0, beta=1)))
                return out
            p_TF_gene[i, j] = temp_p_TF_gene
        #we define a deterministic function to find values of T
        # True when at least one T[i, j] in column j is non-zero.
        @pymc.deterministic(name='T_sum_%i' %j, plot=False)
        def temp_T_sum(ind_vec=T[:,j]):
            return (np.sum(ind_vec)>0)
        T_sum[j] = temp_T_sum
        #If T_sum[j] is falsy: then p_gene is coming from a uniform; else, beta
        @pymc.stochastic(name='p_gene_%i' %j, dtype=float, observed=True)
        def temp_p_gene(value=p_gene_array[j], ind=T_sum[j], a=a_gp):
            if ind:
                out = pymc.distributions.beta_like(value, alpha=a, beta=1)
            else:
                out = pymc.distributions.uniform_like(value, 0, 1)
            return out
        p_gene[j] = temp_p_gene
    # NOTE(review): this test compares against the None object, whereas the
    # checks above use the string 'None', and float(a_gene) at the top would
    # already have raised for a_gene=None -- so the first branch appears
    # unreachable. Confirm the intended sentinel before relying on it.
    if a_gene == None and a_TF_gene_h0 == None and a_TF_gene_h1 == None:
        M5 = pymc.MCMC([T, T_sum, a_gp, a_tg0, a_tg1])
    else:
        M5 = pymc.MCMC([T, T_sum])
    M5.sample(iter=int(num_iter), burn=int(num_burn), thin=int(num_thin))
    return(M5)
def set_models(self):
    """Attach an observed Bernoulli node for each experiment group.

    For ``'control'`` and ``'variant'`` in turn, the node is parameterised
    by ``self.stochastics['<group>_p']``, takes the attribute of the same
    name on ``self`` as its observed value, and is stored under
    ``self.stochastics['<group>']``.

    :return: None
    """
    for label in ('control', 'variant'):
        success_prob = self.stochastics[label + '_p']
        observations = getattr(self, label)
        self.stochastics[label] = Bernoulli(
            label, success_prob, value=observations, observed=True)
print c,v print "num_pathways:", len(pathways) print "num_features:", len(features) print "num_evidence:", len(evidence) print "num_metfrag: ", len(metfrag_evidence) rate_prior = 0.5 #eps = Beta('eps', 0.005, 1) eps = 0.0001 ap = {p : Gamma('p_' + p, rate_prior, 1) for p in pathways} bmp = {p : {feat : Gamma('b_{' + p + ',' + feat + '}', ap[p],1) for feat in path_dict[p]} for p in pathways} y_bmp = {} g = {} def logp_f(f, b, eps): if f in evidence: return math.log(1 - math.e ** (-1 * b) + epsilon) if f in metfrag_evidence: a_p = (1.0 / (1 - metfrag_evidence[f])) - 1 return a_p * math.log(1 - math.e ** (-1 * b) + epsilon) - b return math.log(eps) - b psi = {} for feat, pathways in reverse_path_dict.iteritems(): y_bmp[feat] = sum([bmp[pname][feat] for pname in pathways]) g[feat] = Bernoulli('g_' + feat, 1 - math.e ** (-y_bmp[feat])) psi[feat] = pymc.Potential(logp = logp_f, name = 'psi_' + feat, parents = {'f' : feat, 'b' : y_bmp[feat], 'eps' : eps}, doc = 'hello world potential' )
# Scored MetFrag observations restricted to a candidate set.  ``-`` binds
# tighter than ``&``, so the set expression is
# ``metfrag & ((features - cofactors) - evidence)``.
metfrag_evidence = read.dict_of_set(
    read.metfrag_with_scores(observation_file, keep_zero_scores=False),
    metfrag & features - cofactors - evidence)
# Turn the evidence collection into a dict mapping each element to 1.
evidence = {e: 1 for e in evidence}
rate_prior = 0.5
# One Gamma variable per pathway ...
ap = {p: Gamma('p_' + p, rate_prior, 1) for p in pathways}
# ... and one per (pathway, feature) pair, parameterised by the pathway's
# variable.
bmp = {
    p: {
        feat: Gamma('b_{' + p + ',' + feat + '}', ap[p], 1)
        for feat in path_dict[p]
    }
    for p in pathways
}
y_bmp = {}
virtual = {}
se_count = 0
# NOTE: the loop variable rebinds the module-level name ``pathways``.
for feat, pathways in reverse_path_dict.iteritems():
    #g_bmp[feat] = Poisson('g_' + feat, sum([bmp[pname][feat] for pname in pathways]))
    # Bernoulli with probability 1 - e**(-sum of the feature's pathway variables).
    y_bmp[feat] = Bernoulli(
        'y_' + feat,
        1 - math.e**-sum([bmp[pname][feat] for pname in pathways]))
    # if feat in evidence:
    #     virtual[feat] = Bernoulli('ve_' + feat, ONE if (g_bmp[feat] != 0) else ZERO, value = 1, observed = True)
    # elif feat in metfrag_evidence:
    #     se_count += 1
    #     e = metfrag_evidence[feat]
    #     virtual[feat] = Bernoulli('vs_' + feat, e if (g_bmp[feat] != 0) else 1 - e, value = 1, observed = True)
# Keep only evidence entries that are also in ``features``.
evidence &= features
reverse_path_dict = read.reverse_dict(path_dict)
metfrag = read.metfrag(observation_file)
# ``-`` binds tighter than ``&``, so the set expression is
# ``metfrag & ((features - cofactors) - evidence)``.
metfrag_evidence = read.dict_of_set(
    read.metfrag_with_scores(observation_file, keep_zero_scores=False),
    metfrag & features - cofactors - evidence)
# Turn the evidence collection into a dict (every element maps to ``one``),
# then merge in the MetFrag entries.
evidence = {e: one for e in evidence}
evidence.update(metfrag_evidence)
features = list(features)
# Debug output for one specific key; the second print raises KeyError when
# the key is absent.
print 'C05381' in evidence
print evidence['C05381']
pi = 0.1
#l = [Beta('lambda_'+p, alpha = 1, beta = 1, value = 0.5) for p in pathways]
l = 0.5
# One Bernoulli variable per pathway, named after the pathway.
a_ps = [Bernoulli(path, p=l) for i, path in enumerate(pathways)]
#a_ps = [Bernoulli(path, p = l[i]) for i, path in enumerate(pathways)]
for i, p in enumerate(pathways):
    O[p] = {}
    # The default argument binds this pathway's variable; the lambda yields
    # u[1] when it is truthy and u[0] otherwise.
    active_path = (lambda x=a_ps[i]: u[1] if x else u[0])
    u_ap = Lambda('u_ap' + str(i), active_path)
    for f in path_dict[p]:
        # Each entry is a (Bernoulli variable, its probability node) pair.
        O[p][f] = (Bernoulli('o_{p=' + p + ',f=' + f + '}', p=u_ap), u_ap)
def is_present(f_id, parents):
    """ Calculates y_f, the probability that a features appears in our sample.
    Args:
        f_id (int): feature id
        O (dict): O is a dict of dicts representing probability of each feature
def __init__(self, G=cycle_graph(9), beta=0.0):
    """Set up one binary variable and one potential per node of ``G``.

    ``G`` and ``beta`` are stored on the instance.  Every node gets a
    Bernoulli(0.5) variable named ``str(node)`` with initial value 0, then
    every node gets ``self.IndepSetPotential(node, G[node])``.  Both lists
    are handed to ``MCMC.__init__``.
    """
    self.G = G
    self.beta = beta

    # All node variables are created before any potential is built.
    node_vars = []
    for v in G.nodes_iter():
        node_vars.append(Bernoulli(str(v), 0.5, value=0))
    self.x = node_vars

    potentials = []
    for v in G.nodes_iter():
        potentials.append(self.IndepSetPotential(v, G[v]))
    self.psi = potentials

    MCMC.__init__(self, [self.x, self.psi])
from pymc import MCMC, Matplot, Beta, Bernoulli, Lambda, Poisson, Uniform, deterministic, logp_of_set, logp_gradient_of_set

# Synthetic data: each count is a Poisson(theta) draw kept with probability
# ``pi`` and zero otherwise.
n = 100000
theta = 2
pi = 0.4
y = [(random.random() < pi) * random.poisson(theta) for i in range(n)]


def remcache(s):
    """Disable value caching on node ``s`` and rebuild its lazy function."""
    s._cache_depth = 0
    s.gen_lazy_function()


p = Beta('p', 1, 1)
# Indicator initialised to "count is positive".
z = Bernoulli('z', p, value=array(y) > 0, plot=False)
theta_hat = Uniform('theta_hat', 0, 100, value=3)
# The rate uses the model parameter ``theta_hat``.  The original multiplied
# by ``theta``, the constant used to generate the data, which left
# ``theta_hat`` disconnected from the likelihood.
t = z * theta_hat
counts = Poisson('counts', t, value=y, observed=True)
model = [p, z, theta_hat, counts]

#disable caching for all the nodes
v = model + [t]
for s in v:
    remcache(s)


def pymc_logp():
    """Return the joint log-probability of the nodes in ``model``."""
    return logp_of_set(model)
from pymc import Bernoulli, Lambda
import pymc
import numpy as np
import parser

# Probabilities selected by a pathway's state: u[0] when the pathway
# variable is falsy, u[1] when it is truthy (see ``active_path`` below).
u = [0.01, 0.8]
# Probability of each pathway's Bernoulli variable.
l = 0.5
# Filled below as O[pathway index][feature] = (Bernoulli variable, probability node).
O = {}
pathways = parser.pathways()
features = parser.features()
detected = parser.detected_features()
evidence = parser.evidence()
# One Bernoulli variable per pathway, named 'a_<index>'.
a_ps = [Bernoulli('a_' + str(i), p=l) for i in xrange(len(pathways))]
for i, p in enumerate(pathways):
    O[i] = {}
    # The default argument binds this pathway's variable at definition time.
    active_path = (lambda x=a_ps[i]: u[1] if x else u[0])
    u_ap = Lambda('u_ap' + str(i), active_path)
    for f in p.mets:
        O[i][f] = (Bernoulli('o_{p=' + str(i) + ',f=' + str(f) + '}', p=u_ap), u_ap)
def is_present(f_id, O):
    """ Calculates y_f, the probability that a features appears in our sample.
    Args:
        f_id (int): feature id
        O (dict): O is a dict of dicts representing probability of each feature in each pathway
    Returns: