# models.py
from pymc3.distributions import StudentT, Exponential, Uniform, HalfCauchy
from pymc3 import Model
from pymc3.variational import advi, sample_vp
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
class BEST(object):
    """BEST Model, based on Kruschke (2013).

    Bayesian ESTimation of per-sample means: each sample gets its own
    mean ("fold") and noise scale, and replicate measurements are
    modelled with a Student-T likelihood so outliers are handled
    robustly.

    Parameters
    ----------
    data : pandas DataFrame
        A pandas dataframe which has the following data:
        - Each row is one replicate measurement.
        - There is a column that records the treatment name.
        - There is a column that records the measured value for that
          replicate.
    sample_col : str
        The name of the column containing sample names.
    output_col : str
        The name of the column containing values to estimate.
    baseline_name : str
        The name of the "control" or "baseline".

    Output
    ------
    model : PyMC3 model
        Returns the BEST model containing
    """
    def __init__(self, data, sample_col, output_col, baseline_name):
        super(BEST, self).__init__()
        self.data = data
        self.sample_col = sample_col
        self.output_col = output_col
        self.baseline_name = baseline_name
        self.trace = None
        # Populated by fit(); initialized here so the attribute always
        # exists even before fitting.
        self.model = None
        self._convert_to_indices()

    def _convert_to_indices(self):
        """
        Adds an 'indices' column to ``self.data``, mapping each sample
        name to a 0-based integer index.  Ordering follows
        ``np.unique`` (i.e. sorted sample names), and the mapping is
        echoed to stdout so the user can relate posterior entries back
        to sample names.

        NOTE: this mutates the caller's DataFrame in place.
        """
        sample_names = dict()
        for i, name in enumerate(
                list(np.unique(self.data[self.sample_col].values))):
            print('Sample name {0} has the index {1}'.format(name, i))
            sample_names[name] = i
        self.data['indices'] = self.data[self.sample_col].apply(
            lambda x: sample_names[x])

    def fit(self, n_steps=30000):
        """
        Creates a Bayesian Estimation model for replicate measurements of
        treatment(s) vs. control, fits it with ADVI, and stores the model
        and posterior trace on ``self.model`` / ``self.trace``.

        Parameters
        ----------
        n_steps : int
            The number of steps to run ADVI.

        Raises
        ------
        ValueError
            If ``baseline_name`` is not present in the sample column.
        """
        sample_names = set(self.data[self.sample_col].values)
        if self.baseline_name not in sample_names:
            raise ValueError(
                'baseline name {0} not found in column {1}'.format(
                    self.baseline_name, self.sample_col))
        # BUGFIX: 'indices' enumerates *every* unique sample, baseline
        # included, so the per-sample parameter vectors need one entry
        # per sample.  The previous code sized them as
        # len(samples) - 1 (baseline removed), which sent the fancy
        # indexing below out of bounds for the highest-indexed sample.
        num_samples = len(sample_names)
        with Model() as model:
            # Hyperpriors
            upper = Exponential('upper', lam=0.05)
            # Kruschke's shifted-exponential prior on the T degrees of
            # freedom (expected value 30).
            nu = Exponential('nu_minus_one', 1/29.) + 1
            # "fold", which is the estimated fold change.
            fold = Uniform('fold', lower=1E-10, upper=upper,
                           shape=num_samples)
            # Assume that data have heteroskedastic (i.e. variable) error but
            # are drawn from the same HalfCauchy distribution.
            sigma = HalfCauchy('sigma', beta=1, shape=num_samples)
            # Model prediction: pick each replicate's parameters via its
            # sample index.
            mu = fold[self.data['indices']]
            sig = sigma[self.data['indices']]
            # Data likelihood.
            # BUGFIX: sigma is a scale, so it is passed directly as
            # ``sd``; the old code passed ``sig**-2`` (a precision) as
            # ``sd``, mis-specifying the noise model.
            like = StudentT('like', nu=nu, mu=mu, sd=sig,
                            observed=self.data[self.output_col])
        self.model = model
        with model:
            params = advi(n=n_steps)
            trace = sample_vp(params, draws=2000)
        self.trace = trace

    def plot_posterior(self):
        """
        Plots a swarm plot of the data overlaid on top of the 95% HPD and IQR
        of the posterior distribution.

        Returns
        -------
        fig : matplotlib Figure
            The figure containing the summary plot.

        Raises
        ------
        RuntimeError
            If called before ``fit()`` has produced a trace.
        """
        if self.trace is None:
            raise RuntimeError(
                'plot_posterior() called before fit(); run fit() first.')
        # Make summary plot #
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # 1. Get the lower error and upper errorbars for 95% HPD and IQR.
        lower, lower_q, upper_q, upper = np.percentile(self.trace['fold'],
                                                       [2.5, 25, 75, 97.5],
                                                       axis=0)
        summary_stats = pd.DataFrame()
        summary_stats['mean'] = self.trace['fold'].mean(axis=0)
        err_low = summary_stats['mean'] - lower
        err_high = upper - summary_stats['mean']
        iqr_low = summary_stats['mean'] - lower_q
        iqr_high = upper_q - summary_stats['mean']
        # 2. Plot the swarmplot and errorbars.
        # Thin default-colored bars span the 95% interval; thick red
        # bars span the IQR.
        summary_stats['mean'].plot(rot=90, ls='', ax=ax,
                                   yerr=[err_low, err_high])
        summary_stats['mean'].plot(rot=90, ls='', ax=ax,
                                   yerr=[iqr_low, iqr_high],
                                   elinewidth=4, color='red')
        sns.swarmplot(data=self.data, x=self.sample_col, y=self.output_col,
                      orient='v', ax=ax, alpha=0.5)
        plt.xticks(rotation='vertical')
        plt.ylabel(self.output_col)
        return fig