def make_model(kappa_a=1., kappa_b=1., sigma_a=1., sigma_b=.1):
    """
    A PyMC version of the dimensionless catalysis model.
    """
    gamma = 1.  # exponent that scales the log-likelihood (tempering)
    log_kappa = pm.Normal('log_kappa', 0., 1., size=5)
    log_sigma = pm.Normal('log_sigma', -1., 1.)
    kappa = pm.exp(log_kappa)
    sigma = pm.exp(log_sigma)
    y = load_dimensionless_catalysis_data()
    f = CatalysisModelDMNLESS()

    @pm.deterministic
    def model_output(log_kappa=log_kappa):
        return f(log_kappa)['f']

    @pm.stochastic(observed=True)
    def output(value=y, model_output=model_output, sigma=sigma, gamma=gamma):
        # log-likelihood of the observed data under the forward model;
        # use the stochastic's own `value` rather than the closed-over `y`
        return gamma * pm.normal_like(value, model_output, 1. / (sigma ** 2.))

    return locals()
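# Minimal usage sketch (sampler settings are illustrative, not from the
# original): since make_model returns locals(), the resulting dict can be
# handed straight to pymc2's MCMC driver.
m = pm.MCMC(make_model())
m.sample(iter=10000, burn=1000, thin=5)
log_kappa_samples = m.trace('log_kappa')[:]  # posterior draws of the log rate constants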
def _mu_lognorm(self, mu, logsigma):
    """
    Transform a gaussian mu to the lognormal mu parameter.

    Parameters
    ----------
    mu - float
        the mean of a gaussian variable
    logsigma - float
        log of the sigma of a gaussian variable

    Returns
    -------
    float
        the lognormal mu parameter
    """
    sigma = pymc.exp(logsigma)
    return pymc.log(mu ** 2 / pymc.sqrt(sigma ** 2 + mu ** 2))
def _tau_lognorm(self, mu, logsigma):
    """
    Get the lognormal tau (precision) from gaussian parameters.

    Parameters
    ----------
    mu - float
        the mean of a gaussian variable
    logsigma - float
        log of the sigma of a gaussian variable

    Returns
    -------
    float
        the lognormal tau parameter
    """
    sigma = pymc.exp(logsigma)
    return pymc.sqrt(pymc.log(1.0 + (sigma / mu) ** 2)) ** (-2)
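# These two helpers implement the standard lognormal moment-matching
# identities: for a lognormal with arithmetic mean m and standard deviation s,
#     mu  = ln(m**2 / sqrt(s**2 + m**2))
#     tau = 1 / sigma_LN**2 = 1 / ln(1 + (s / m)**2)
# A quick numpy sanity check of the same algebra (a sketch, names made up):
import numpy as np

def mu_lognorm(m, s):
    return np.log(m ** 2 / np.sqrt(s ** 2 + m ** 2))

def tau_lognorm(m, s):
    return 1.0 / np.log(1.0 + (s / m) ** 2)

m, s = 5.0, 2.0
draws = np.random.lognormal(mean=mu_lognorm(m, s),
                            sigma=tau_lognorm(m, s) ** -0.5,
                            size=1000000)
print(draws.mean(), draws.std())  # should print approximately 5.0 and 2.0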
# $$ y \sim \textrm{Normal}(\textrm{exp}(x), 2)$$
# $$ z \sim \textrm{Normal}(x + y, 0.75)$$
#
# The aim here is to get posteriors over $x$ and $y$ given the data we have about $z$ (`zdata`).
#
# We create a new `Model` object and do operations within its context. The `with` statement lets PyMC know this model is the current model of interest.
#
# We construct new random variables with the constructor for their prior distribution, such as `Normal`, while within a model context (inside the `with`). When you make a random variable it is automatically added to the model. The constructor returns a Theano variable.
#
# The constructor may specify the name of the random variable, the parameters of its prior distribution, as well as its shape. We mark a random variable as observed by supplying the data that was observed.

# In[3]:

with pm.Model() as model:
    x = pm.Normal('x', mu=0., sd=1)
    y = pm.Normal('y', mu=pm.exp(x), sd=2., shape=(ndims, 1))  # here, shape is telling us it's a vector rather than a scalar
    z = pm.Normal('z', mu=x + y, sd=.75, observed=zdata)  # shape is inferred from zdata

# A parenthetical note on the parameters for the normal. Variance can be encoded as `tau`, the precision, which is simply the inverse variance (so $\tau=\sigma^{-2}$). This is used because the gamma distribution is the conjugate prior for the precision, and it must be inverted to get the variance. Encoding in terms of precision saves the inversion step in cases where the variance is actually modeled with a gamma prior.

# Fit Model
# ---------
# We need a starting point for our sampling. The `find_MAP` function finds the maximum a posteriori point (MAP), which is often a good choice for a starting point. `find_MAP` uses an optimization algorithm (`scipy.optimize.fmin_l_bfgs_b`, or [BFGS](http://en.wikipedia.org/wiki/BFGS_method), by default) to find the local maximum of the log posterior.
#
# Note that the `with` construction is used again. Functions like `find_MAP` and `HamiltonianMC` need to have a model in their context; `with` activates the context of a particular model within its block.

# In[4]:

with model:
    start = pm.find_MAP()
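# A minimal sketch (not from the original notebook): `sd` and `tau` are two
# parameterizations of the same prior, related by tau = 1 / sd**2, so the two
# variables below have identical distributions.
import pymc as pm

with pm.Model() as parameterization_demo:  # model name made up
    a = pm.Normal('a', mu=0., sd=2.)
    b = pm.Normal('b', mu=0., tau=0.25)  # 0.25 == 1 / 2.**2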
import matplotlib.pyplot as plt
from plot_post import plot_post

# THE DATA.
N = 30
z = 8
y = np.repeat([1, 0], [z, N - z])

# THE MODEL.
with pm.Model() as model:
    # Hyperprior on model index:
    model_index = pm.DiscreteUniform('model_index', lower=0, upper=1)
    # Priors
    nu = pm.Normal('nu', mu=0, tau=0.1)  # it is possible to use tau or sd
    eta = pm.Gamma('eta', .1, .1)
    theta0 = 1 / (1 + pm.exp(-nu))  # theta from model index 0
    theta1 = pm.exp(-eta)           # theta from model index 1
    theta = pm.switch(pm.eq(model_index, 0), theta0, theta1)
    # Likelihood
    y = pm.Bernoulli('y', p=theta, observed=y)
    # Sampling
    start = pm.find_MAP()
    steps = [pm.Metropolis([i]) for i in model.unobserved_RVs[1:]]
    steps.append(pm.ElemwiseCategoricalStep(var=model_index, values=[0, 1]))
    trace = pm.sample(10000, steps, start=start, progressbar=False)

# EXAMINE THE RESULTS.
burnin = 1000
thin = 5

## Print summary for each trace
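# A plausible continuation (not in the original snippet): the posterior
# probability of each model is the fraction of post-burn-in, thinned
# model_index samples that landed on it.
model_idx_sample = trace['model_index'][burnin::thin]
p_model0 = np.mean(model_idx_sample == 0)
print('p(model 0 | data) = %.3f' % p_model0)
print('p(model 1 | data) = %.3f' % (1. - p_model0))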
from __future__ import division

import pymc
import numpy
from scipy.integrate import odeint

W = 110  # body weight, kg

# Lognormal priors on the model parameters, via exponentiated normals
log_t_max = pymc.Normal('log_t_max', mu=numpy.log(60), tau=100)
t_max = pymc.exp(log_t_max)

log_MCR_sub_I = pymc.Normal('log_MCR_sub_I', mu=numpy.log(0.01), tau=100)
MCR_sub_I = pymc.exp(log_MCR_sub_I)

log_ins_sub_c = pymc.Normal('log_ins_sub_c', mu=numpy.log(36), tau=100)
ins_sub_c = pymc.exp(log_ins_sub_c)

# Insulin delivery: basal rate (U/min) and bolus at t = 0
u_of_t_min_0_15 = 1.4 / 60
bolus_delivery_at_t_equals_0 = 0
IOB = 1.3  # insulin on board

# Initial conditions for the two insulin compartments
i_sub_1_of_t_0 = (u_of_t_min_0_15 * t_max) + bolus_delivery_at_t_equals_0
i_sub_2_of_t_0 = i_sub_1_of_t_0 + IOB

tspan_0_15 = 0, 15
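# Sketch only: the snippet imports odeint but stops before integrating. An
# assumed two-compartment absorption model (not stated in the original),
#     di1/dt = u(t) - i1 / t_max
#     di2/dt = (i1 - i2) / t_max
# is at least consistent with the steady-state initial condition i1(0) = u * t_max.
def insulin_rhs(state, t, u, t_max_val):
    i1, i2 = state
    return [u - i1 / t_max_val, (i1 - i2) / t_max_val]

t_max_val = float(t_max.value)  # current value of the stochastic parameter
y0 = [u_of_t_min_0_15 * t_max_val + bolus_delivery_at_t_equals_0,
      u_of_t_min_0_15 * t_max_val + bolus_delivery_at_t_equals_0 + IOB]
t_grid = numpy.linspace(tspan_0_15[0], tspan_0_15[1], 16)
trajectory = odeint(insulin_rhs, y0, t_grid, args=(u_of_t_min_0_15, t_max_val))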
def build_model(data: pd.DataFrame):
    data['Date'] = [pd.to_datetime(date) for date in data['Date']]

    # setting hyper-parameters: a_i, b_i, c_i, d_i, g, h
    # N = 20 --> number of teams
    teams = sorted(data.HomeTeam.unique())
    n = len(teams)
    ab_hyper, cd_hyper = [(1, 1)] * n, [(1, 1)] * n
    g, h = 1, 1

    # prior for alpha_i, attack
    alpha = np.empty(n, dtype=object)
    for i in range(n):
        alpha[i] = pymc.Gamma(
            name='alpha_%i' % (i + 1), alpha=ab_hyper[i][0],
            beta=ab_hyper[i][1], doc=teams[i] + '(attack)')

    # prior for beta_i, defence
    beta = np.empty(n, dtype=object)
    for i in range(n):
        beta[i] = pymc.Gamma(
            name='beta_%i' % (i + 1), alpha=cd_hyper[i][0],
            beta=cd_hyper[i][1], doc=teams[i] + '(defence)')

    # prior for lambda_value --> default: assumes a home advantage exists
    lambda_value = pymc.Gamma(
        name='lambda_value', alpha=g, beta=h, doc='home advantage')

    # Scoring strengths: alpha_i * beta_j * lambda_value for the home team,
    # beta_i * alpha_j for the away team, for each match in the dataset.
    i_s = [teams.index(t) for t in data.HomeTeam]  # home team index
    j_s = [teams.index(t) for t in data.AwayTeam]  # away team index

    # deterministic, determined by alpha_i, alpha_j, beta_i, beta_j, lambda_value
    home_scoring_strength = np.array([alpha[i] for i in i_s]) * \
        np.array([beta[j] for j in j_s]) * \
        np.array(lambda_value)
    away_scoring_strength = np.array([beta[i] for i in i_s]) * \
        np.array([alpha[j] for j in j_s])
    # params = zip(home_scoring_strength, away_scoring_strength)

    # likelihood
    home_score = pymc.Poisson('home_score', home_scoring_strength,
                              value=data.FTHG, observed=True)
    away_score = pymc.Poisson('away_score', away_scoring_strength,
                              value=data.FTAG, observed=True)

    # down-weight older matches exponentially with their age in days
    t_now = data.Date[data.index[-1]] + pd.Timedelta('1 days 00:00:00')
    t_diff = np.array([item.days for item in (t_now - data.Date)])
    time_weighting = pymc.exp(-t_diff * 0.01)
    likelihood = (home_score * away_score) ** time_weighting

    # wrap the model and run the simulation
    model = pymc.MCMC([likelihood, alpha, beta, lambda_value])
    model.sample(iter=5000, burn=100, thin=10, verbose=False, progress_bar=True)

    # posterior means of each team's attack and defence parameters
    estimated_params = pd.DataFrame({
        'team': teams,
        'alpha(attack)': [0.0] * n,
        'beta(defence)': [0.0] * n},
        columns=['team', 'alpha(attack)', 'beta(defence)'])
    for p in alpha:
        estimated_params.loc[
            estimated_params['team'] == p.__doc__.split('(')[0],
            'alpha(attack)'] = round(model.trace(p.__name__)[:].mean(), 2)
    for p in beta:
        estimated_params.loc[
            estimated_params['team'] == p.__doc__.split('(')[0],
            'beta(defence)'] = round(model.trace(p.__name__)[:].mean(), 2)
    # posterior mean of the home advantage (the original returned the raw node)
    estimated_gamma = model.trace(lambda_value.__name__)[:].mean()
    logger.info(estimated_params)
    return estimated_params, estimated_gamma
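# Hypothetical usage sketch (data values made up): build_model only needs the
# columns it reads (Date, HomeTeam, AwayTeam, FTHG, FTAG), plus a module-level
# `logger` for the final logging call.
import logging
logger = logging.getLogger(__name__)

toy_matches = pd.DataFrame({
    'Date': ['2024-08-01', '2024-08-08'],
    'HomeTeam': ['Arsenal', 'Chelsea'],
    'AwayTeam': ['Chelsea', 'Arsenal'],
    'FTHG': [2, 0],  # full-time home goals
    'FTAG': [1, 1],  # full-time away goals
})
estimated_params, home_advantage = build_model(toy_matches)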
# coding: utf-8

import pymc as pm
import numpy as np

ndims = 2
nobs = 20

# Generate synthetic data from the true generative process
xtrue = np.random.normal(scale=2., size=1)
ytrue = np.random.normal(loc=np.exp(xtrue), scale=1, size=(ndims, 1))
zdata = np.random.normal(loc=xtrue + ytrue, scale=.75, size=(ndims, nobs))

with pm.Model() as model:
    x = pm.Normal('x', mu=0., sd=1)
    y = pm.Normal('y', mu=pm.exp(x), sd=2., shape=(ndims, 1))  # here, shape is telling us it's a vector rather than a scalar
    z = pm.Normal('z', mu=x + y, sd=.75, observed=zdata)  # shape is inferred from zdata

with model:
    start = pm.find_MAP()

print("MAP found:")
print("x:", start['x'])
print("y:", start['y'])

print("Compare with true values:")
print("ytrue", ytrue)
print("xtrue", xtrue)

with model:
    step = pm.NUTS()
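# The script stops after constructing the NUTS step; a plausible continuation
# in the same early-PyMC3 API would be to draw samples with that step:
with model:
    trace = pm.sample(3000, step, start)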
def __init__(self, df):
    """
    Parameters
    ----------
    df - pandas dataframe
    """
    assert type(df) == pd.DataFrame
    self.logd = dict()
    sigma_guess = 0.2

    # Uniform priors on the log of each error scale
    logsigma_chx = pymc.Uniform("Sigma cyclohexane", -4., 4., numpy.log(sigma_guess))
    logsigma_pbs = pymc.Uniform("Sigma buffer", -4., 4., numpy.log(sigma_guess))
    logsigma_ms_chx = pymc.Uniform("Sigma MS cyclohexane", -4., 4., numpy.log(sigma_guess))
    logsigma_ms_pbs = pymc.Uniform("Sigma MS buffer", -4., 4., numpy.log(sigma_guess))
    self.model = dict(logsigma_chx=logsigma_chx, logsigma_pbs=logsigma_pbs,
                      logsigma_ms_chx=logsigma_ms_chx, logsigma_ms_pbs=logsigma_ms_pbs)

    # Every compound
    for compound, compound_group in df.groupby("Sample Name"):
        # Concentration in each solvent phase
        for phase, phase_group in compound_group.groupby("Solvent"):
            phase = phase.lower()
            parameter_name = 'log10_{0}_{1}'.format(compound, phase)
            mean_concentration = phase_group["Area/Volume"].mean()
            # logsig = numpy.log(phase_group["Area/Volume"].std())
            min_concentration = 1 / 2.0
            max_concentration = 1.e8
            # The log10 of the concentration is modelled with a uniform prior
            self.model[parameter_name] = pymc.Uniform(
                parameter_name,
                lower=numpy.log10(min_concentration),
                upper=numpy.log10(max_concentration),
                value=numpy.log10(mean_concentration))

            # Corresponds to independent repeats
            for (batch, repeat), repeat_group in phase_group.groupby(["Set", "Repeat"]):
                repeat_parameter_name = '{0}_{1}_{2}-{3}'.format(compound, phase, batch, repeat)
                mu = pymc.Lambda(
                    repeat_parameter_name + "-MU",
                    lambda mu=pow(10.0, self.model[parameter_name]),
                           ls=pow(10.0, self.model[parameter_name]) * pymc.exp(self.model["logsigma_{}".format(phase)]):
                    self._mu_lognorm(mu, ls))
                tau = pymc.Lambda(
                    repeat_parameter_name + "-TAU",
                    lambda mu=pow(10.0, self.model[parameter_name]),
                           ls=pow(10.0, self.model[parameter_name]) * pymc.exp(self.model["logsigma_{}".format(phase)]):
                    self._tau_lognorm(mu, ls))
                # True concentration of independent repeats
                self.model[repeat_parameter_name] = pymc.Lognormal(
                    repeat_parameter_name, mu=mu, tau=tau, value=mean_concentration)

                # likelihood of each observation
                for replicate, repl_group in repeat_group.groupby("Replicate"):
                    replicate_parameter_name = '{0}_{1}_{2}-{3}_{4}'.format(
                        compound, phase, batch, repeat, replicate)
                    # Extract the observed concentration
                    assert len(repl_group) == 1  # failsafe
                    value = repl_group["Area/Volume"]
                    mu = pymc.Lambda(
                        replicate_parameter_name + "-MU",
                        lambda mu=self.model[repeat_parameter_name],
                               ls=self.model[repeat_parameter_name] * pymc.exp(self.model["logsigma_ms_{}".format(phase)]):
                        self._mu_lognorm(mu, ls))
                    tau = pymc.Lambda(
                        replicate_parameter_name + "-TAU",
                        lambda mu=self.model[repeat_parameter_name],
                               ls=self.model[repeat_parameter_name] * pymc.exp(self.model["logsigma_ms_{}".format(phase)]):
                        self._tau_lognorm(mu, ls))
                    # Observed concentration from replicate experiment
                    self.model[replicate_parameter_name] = pymc.Lognormal(
                        replicate_parameter_name, mu=mu, tau=tau,
                        value=value, observed=True)

        # LogD is the difference of the log10 concentrations in the two phases
        self.logd[compound] = pymc.Lambda(
            "LogD_{}".format(compound),
            lambda c=self.model["log10_{}_chx".format(compound)],
                   p=self.model["log10_{}_pbs".format(compound)]:
            c - p)
returns.plot(title='return of NIKKEI index close price', figsize=(30, 8))
nreturns = np.array(returns[1:])[::-1]

import pymc as pm
from pymc.distributions.timeseries import GaussianRandomWalk
from scipy.sparse import csc_matrix
from scipy import optimize

with pm.Model() as model:
    sigma, log_sigma = model.TransformedVar(
        'sigma', pm.Exponential.dist(1. / .02, testval=.1),
        pm.logtransform)
    nu = pm.Exponential('nu', 1. / 10)
    s = GaussianRandomWalk('s', sigma ** -2, shape=len(nreturns))
    r = pm.T('r', nu, lam=pm.exp(-2 * s), observed=nreturns)

with model:
    start = pm.find_MAP(vars=[s], fmin=optimize.fmin_l_bfgs_b)
    step = pm.NUTS(scaling=start)
    trace = pm.sample(2000, step, start, progressbar=False)

plt.plot(trace[s][::10].T, 'b', alpha=.03)
plt.title('log volatility')

with model:
    pm.traceplot(trace, model.vars[:2])

exps = np.exp(trace[s][::10].T)
plt.plot(returns[:600][::-1])
plt.plot(exps, 'r', alpha=.03)
def __init__(self, df, stock_concentration=10, stock_volume=10,
             chx_dilution_factor=0.1, chx_volume=500, pbs_volume=500,
             dilution=False):
    """
    Parameters
    ----------
    df - pd.DataFrame
        Dataframe from preprocessing
    stock_concentration - float
        concentration of the DMSO stock in mM
    stock_volume - float
        volume of the DMSO stock used, in uL (initial guess)
    chx_dilution_factor - float
        dilution factor for cyclohexane into octanol as prep for the MS step
    chx_volume - float
        volume of the chx phase in uL
    pbs_volume - float
        volume of the buffer phase in uL
    dilution - bool
        Placeholder for a possible dilution parameter

    Notes
    -----
    For internal consistency, specify all volumes in uL and all concentrations in mM.
    """
    # TODO sampl_67 is the same internal standard, adjust concentrations/volumes accordingly
    self.model = dict()
    self.chx_volume = chx_volume
    self.pbs_volume = pbs_volume
    self.stock_concentration = stock_concentration
    sigma_guess = 0.1

    # Steady hand metrics for Bas' right hand
    # Note: Assume no left hand pipetting operations
    # self.model["Log Sigma volume"] = pymc.Uniform("Log Sigma volume", -4., 4., numpy.log(dispense_sigma_guess))

    # Measurement error
    self.model["logsigma_ms_chx"] = pymc.Uniform("logsigma_ms_chx", -4., 4., numpy.log(sigma_guess))
    self.model["logsigma_ms_pbs"] = pymc.Uniform("logsigma_ms_pbs", -4., 4., numpy.log(sigma_guess))

    # Every compound
    for compound, compound_group in df.groupby("Sample Name"):
        # Get initial guesses
        mean_counts = dict()
        mean_concentrations = dict()
        for phase, phase_group in compound_group.groupby("Solvent"):
            phase = phase.lower()
            mean_counts[phase] = phase_group["Area/Volume"].mean()
        mean_logd = numpy.log10(mean_counts['chx'] / mean_counts['pbs'])
        mean_concentrations["pbs"] = self._pbs_conc(self.chx_volume, mean_logd, self.pbs_volume,
                                                    stock_volume, self.stock_concentration)
        mean_concentrations["chx"] = self._chx_conc(self.chx_volume, mean_logd, self.pbs_volume,
                                                    stock_volume, self.stock_concentration)
        avg_log_mrm_factor = numpy.log((mean_counts["chx"] + mean_counts["pbs"]) /
                                       (mean_concentrations["chx"] + mean_concentrations["pbs"]))

        logd_name = "LogD_{}".format(compound)
        fragmentation_param_name = "log_MRM_{}".format(compound)
        self.model[logd_name] = pymc.Uniform(logd_name, lower=-10, upper=10, value=mean_logd)
        self.model[fragmentation_param_name] = pymc.Uniform(
            fragmentation_param_name, lower=0.0, upper=numpy.log(1.e8 / 0.2),
            value=avg_log_mrm_factor)

        for (batch, repeat), repeat_group in compound_group.groupby(["Set", "Repeat"]):
            # One pipetting operation per repeat experiment
            # mu = self._mu_lognorm(stock_volume, self.model["Log Sigma volume"])
            # tau = self._tau_lognorm(stock_volume, self.model["Log Sigma volume"])
            # TODO remove artificial sigma constraint (made up).
            mu_vol = self._mu_lognorm(stock_volume, pymc.log(0.1 * stock_volume))
            tau_vol = self._tau_lognorm(stock_volume, pymc.log(0.1 * stock_volume))
            vol_parameter_name = 'vol_{0}_{1}-{2}'.format(compound, batch, repeat)
            self.model[vol_parameter_name] = pymc.Lognormal(
                vol_parameter_name, mu=mu_vol, tau=tau_vol, value=stock_volume)

            for phase, phase_group in repeat_group.groupby("Solvent"):
                phase = phase.lower()
                if phase == "pbs":
                    pbs_concentration = pymc.exp(self.model[fragmentation_param_name]) * \
                        self._pbs_conc(self.chx_volume, self.model[logd_name], self.pbs_volume,
                                       self.model[vol_parameter_name], self.stock_concentration)
                    mu = self._mu_lognorm(pbs_concentration,
                                          pymc.log(pbs_concentration) + self.model["logsigma_ms_pbs"])
                    tau = self._tau_lognorm(pbs_concentration,
                                            pymc.log(pbs_concentration) + self.model["logsigma_ms_pbs"])
                    conc_name = "pbs_{0}_{1}-{2}".format(compound, batch, repeat)
                    self.model[conc_name] = pymc.Lambda(conc_name, lambda x=pbs_concentration: x)
                elif phase == "chx":
                    chx_concentration = pymc.exp(self.model[fragmentation_param_name]) * \
                        self._chx_conc(self.chx_volume, self.model[logd_name], self.pbs_volume,
                                       self.model[vol_parameter_name], self.stock_concentration)
                    mu = self._mu_lognorm(chx_concentration,
                                          pymc.log(chx_concentration) + self.model["logsigma_ms_chx"])
                    tau = self._tau_lognorm(chx_concentration,
                                            pymc.log(chx_concentration) + self.model["logsigma_ms_chx"])
                    conc_name = "chx_{0}_{1}-{2}".format(compound, batch, repeat)
                    self.model[conc_name] = pymc.Lambda(conc_name, lambda x=chx_concentration: x)
                else:
                    raise ValueError("Unknown phase: {}".format(phase))

                # likelihood of each observation
                for replicate, repl_group in repeat_group.groupby("Replicate"):
                    replicate_parameter_name = 'C{0}_{1}_{2}-{3}_{4}'.format(
                        compound, phase, batch, repeat, replicate)
                    # The observed concentration for this replicate
                    self.model[replicate_parameter_name] = pymc.Lognormal(
                        replicate_parameter_name, mu=mu, tau=tau,
                        observed=True, value=repl_group["Area/Volume"])
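# Hypothetical usage sketch (class, dataframe, and trace names are made up):
# because the model is stored as a dict of pymc nodes, it can be handed
# directly to pymc2's MCMC driver.
analysis = LogDModel(preprocessed_df)  # builds analysis.model
mcmc = pymc.MCMC(analysis.model)
mcmc.sample(iter=50000, burn=5000, thin=10)
logd_samples = mcmc.trace('LogD_compoundA')[:]  # trace name depends on the compound
print(logd_samples.mean(), logd_samples.std())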