def get_city_data():
    """Load Minnesota radon measurements joined with county-level data.

    Reads ``data/srrs2.dat`` and ``data/cty.dat`` from the ``pymc3.examples``
    package data, keeps the rows whose ``state`` is ``'MN'``, and inner-joins
    the two tables on a combined FIPS code (state code * 1000 + county code).

    Returns
    -------
    pandas.DataFrame
        The joined table with three added columns: ``fips``, ``lradon``
        (natural log of ``activity``, with zero readings replaced by 0.1
        before the log is taken) and ``group`` (consecutive integers from 0,
        one per distinct ``fips`` value in order of first appearance).
    """
    data = pd.read_csv(pm.get_data_file('pymc3.examples', 'data/srrs2.dat'))
    cty_data = pd.read_csv(pm.get_data_file('pymc3.examples', 'data/cty.dat'))

    # Copy the filtered rows so the column assignments below write to an
    # independent frame rather than to a slice of the full table.
    data = data[data.state == 'MN'].copy()

    data['fips'] = data.stfips * 1000 + data.cntyfips
    cty_data['fips'] = cty_data.stfips * 1000 + cty_data.ctfips
    # log(0) is -inf, so zero activity is replaced by 0.1 first.
    data['lradon'] = np.log(np.where(data.activity == 0, .1, data.activity))
    data = data.merge(cty_data, 'inner', on='fips')

    # One row per distinct county code; 'fips' stays a regular column so it
    # can serve as the merge key below.
    unique = data[['fips']].drop_duplicates()
    unique = unique.assign(group=np.arange(len(unique)))
    return data.merge(unique, 'inner', on='fips')
def build_model(self):
    """Build a logistic-regression model on the wells data set.

    Reads ``data/wells.dat`` (space-delimited, indexed by ``id``) from the
    ``pymc3.examples`` package data, divides ``dist`` by 100 and ``educ``
    by 4, and uses every column after the first as a predictor.
    Predictors are mean-centred and a constant column named ``'1'`` is
    appended as the intercept.

    Returns
    -------
    pm.Model
        A model with a vector ``effects`` of Normal(mu=0, tau=100**-2)
        coefficients (one per predictor column, intercept included) and a
        Bernoulli variable ``s`` observed on ``switch``.
    """
    wells = pm.get_data_file('pymc3.examples', 'data/wells.dat')
    data = pd.read_csv(wells, delimiter=u' ', index_col=u'id',
                       dtype={u'switch': np.int8})
    data['dist'] = data['dist'] / 100
    data['educ'] = data['educ'] / 4

    # NOTE(review): the first column is skipped -- presumably it is the
    # response `switch`; confirm against the data file's column order.
    col = data.columns
    # Explicit copy: the predictors are modified below and must not be
    # treated as a slice of `data`.
    P = data[col[1:]].copy()
    P -= P.mean()
    P['1'] = 1

    with pm.Model() as model:
        effects = pm.Normal('effects', mu=0, tau=100. ** -2,
                            shape=len(P.columns))
        p = pm.sigmoid(pm.dot(np.array(P), effects))
        pm.Bernoulli('s', p, observed=np.array(data.switch))
    return model
def build_model(self):
    """Build a logistic-regression model on the wells data set.

    Reads ``data/wells.dat`` (space-delimited, indexed by ``id``) from the
    ``pymc3.examples`` package data, divides ``dist`` by 100 and ``educ``
    by 4, and uses every column after the first as a predictor.
    Predictors are mean-centred and a constant column named ``'1'`` is
    appended as the intercept. The linear predictor is built with Theano
    (``tt.dot``) and squashed with ``tt.nnet.sigmoid``.

    Returns
    -------
    pm.Model
        A model with a vector ``effects`` of Normal(mu=0, tau=100**-2)
        coefficients (one per predictor column, intercept included) and a
        Bernoulli variable ``s`` observed on ``switch``.
    """
    wells = pm.get_data_file('pymc3.examples', 'data/wells.dat')
    data = pd.read_csv(wells, delimiter=u' ', index_col=u'id',
                       dtype={u'switch': np.int8})
    data['dist'] = data['dist'] / 100
    data['educ'] = data['educ'] / 4

    # NOTE(review): the first column is skipped -- presumably it is the
    # response `switch`; confirm against the data file's column order.
    col = data.columns
    # Explicit copy: the predictors are modified below and must not be
    # treated as a slice of `data`.
    P = data[col[1:]].copy()
    P -= P.mean()
    P['1'] = 1

    with pm.Model() as model:
        effects = pm.Normal('effects', mu=0, tau=100.**-2,
                            shape=len(P.columns))
        p = tt.nnet.sigmoid(tt.dot(np.array(P), effects))
        pm.Bernoulli('s', p, observed=np.array(data.switch))
    return model
# <codecell>
import pandas as pd
from pylab import *
from pymc3 import StudentT, Model, NUTS, Normal, find_MAP, trace, get_data_file
from pymc3.distributions.timeseries import GaussianRandomWalk

# <markdowncell>
# Data
# ----

# <codecell>
data = pd.read_csv(get_data_file("pymc3.examples", "data/pancreatitis.csv"))
countries = ["CYP", "DNK", "ESP", "FIN", "GBR", "ISL"]
# Copy the filtered rows so the column assignments below write to an
# independent frame rather than to a slice of the full table.
data = data[data.area.isin(countries)].copy()

# Midpoint of each row's age interval (`np` is provided by the pylab import).
age = data["age"] = np.array(data.age_start + data.age_end) / 2
# Scale the observed value by 1000.
rate = data["value"] = data["value"] * 1000
# `group` is an integer code per row; `countries` is rebound to the distinct
# areas in order of first appearance. No `order=` keyword: pandas ignored it
# and removed it in 1.0.
group, countries = pd.factorize(data.area)
ncountries = len(countries)

# <codecell>
# One panel per country on a 2 x 3 grid.
for i, country in enumerate(countries):
    subplot(2, 3, i + 1)
    title(country)
import pymc3 as pm
import pandas as pd
from numpy.ma import masked_values

# Load the test-score table; missing entries become the sentinel -999 so they
# can be masked again when the observed arrays are handed to the model.
scores_path = pm.get_data_file('pymc3.examples', 'data/test_scores.csv')
test_scores = pd.read_csv(scores_path).fillna(-999)

# Columns pulled out, in this order: test score, gender, number of siblings,
# previous disability, age at test, mother with HS education or better,
# hearing loss identified by 3 months of age.
selected_columns = ['score', 'male', 'siblings', 'prev_disab', 'age_test',
                    'mother_hs', 'early_ident']
column_values = test_scores[selected_columns].astype(float).values.T
(score, male, siblings, disability, age, mother_hs,
 early_ident) = column_values

# Sentinel entries are masked so they are treated as missing observations.
siblings_observed = masked_values(siblings, value=-999)
disability_observed = masked_values(disability, value=-999)
mother_hs_observed = masked_values(mother_hs, value=-999)

with pm.Model() as model:
    # Imputation sub-models for the partially observed covariates.

    # Number of siblings: Poisson with an Exponential(1) prior on its mean.
    sib_mean = pm.Exponential('sib_mean', 1)
    siblings_imp = pm.Poisson('siblings_imp', sib_mean,
                              observed=siblings_observed)

    # Previous disability: Bernoulli with a Beta(1, 1) prior on p.
    p_disab = pm.Beta('p_disab', 1, 1)
    disability_imp = pm.Bernoulli('disability_imp', p_disab,
                                  observed=disability_observed)

    # Mother's education indicator: Bernoulli with a Beta(1, 1) prior on p.
    p_mother = pm.Beta('p_mother', 1, 1)
    mother_imp = pm.Bernoulli('mother_imp', p_mother,
                              observed=mother_hs_observed)
# <codecell>
import pandas as pd
from pylab import *
from pymc3 import StudentT, Model, NUTS, Normal, find_MAP, trace, get_data_file
from pymc3.distributions.timeseries import GaussianRandomWalk

# <markdowncell>
# Data
# ----

# <codecell>
data = pd.read_csv(get_data_file('pymc3.examples', 'data/pancreatitis.csv'))
countries = ['CYP', 'DNK', 'ESP', 'FIN', 'GBR', 'ISL']
# Copy the filtered rows so the column assignments below write to an
# independent frame rather than to a slice of the full table.
data = data[data.area.isin(countries)].copy()

# Midpoint of each row's age interval (`np` is provided by the pylab import).
age = data['age'] = np.array(data.age_start + data.age_end) / 2
# Scale the observed value by 1000.
rate = data['value'] = data['value'] * 1000
# `group` is an integer code per row; `countries` is rebound to the distinct
# areas in order of first appearance. No `order=` keyword: pandas ignored it
# and removed it in 1.0.
group, countries = pd.factorize(data.area)
ncountries = len(countries)

# <codecell>
# One panel per country on a 2 x 3 grid.
for i, country in enumerate(countries):
    subplot(2, 3, i + 1)
    title(country)
# Source: https://github.com/pymc-devs/pymc3/blob/master/pymc3/examples/stochastic_volatility.py
import numpy as np
import matplotlib.pyplot as plt
import pymc3
import pymc3.distributions.timeseries as ts
import pandas as pd
import scipy

# Keep only the last `n` rows of the packaged S&P 500 returns file.
n = 400
sp500_path = pymc3.get_data_file('pymc3.examples', "data/SP500.csv")
returns = np.genfromtxt(sp500_path)[-n:]
returns[:5]  # bare expression: has no effect when run as a script

plt.plot(returns)
plt.ylabel('daily returns in %')

with pymc3.Model() as sp500_model:
    # Priors: Student-t degrees of freedom and random-walk step scale.
    nu = pymc3.Exponential('nu', 1. / 10, testval=5.)
    sigma = pymc3.Exponential('sigma', 1. / .02, testval=.1)

    # Latent random walk `s`, one value per observed return.
    n_returns = len(returns)
    walk_precision = sigma**-2
    s = ts.GaussianRandomWalk('s', walk_precision, shape=n_returns)

    # exp(-2 * s) is recorded as a deterministic; its reciprocal is the
    # `lam` argument of the Student-t likelihood on the observed returns.
    volatility_process = pymc3.Deterministic('volatility_process',
                                             pymc3.exp(-2 * s))
    r = pymc3.StudentT('r', nu, lam=1 / volatility_process,
                       observed=returns)

with sp500_model:
    print('optimizing...')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pymc3 import HalfCauchy, Model, Normal, get_data_file, sample
from pymc3.distributions.timeseries import GaussianRandomWalk

data = pd.read_csv(get_data_file('pymc3.examples', 'data/pancreatitis.csv'))
countries = ['CYP', 'DNK', 'ESP', 'FIN', 'GBR', 'ISL']
# Copy the filtered rows so the column assignments below write to an
# independent frame rather than to a slice of the full table.
data = data[data.area.isin(countries)].copy()

# Midpoint of each row's age interval.
age = data['age'] = np.array(data.age_start + data.age_end) / 2
# Scale the observed value by 1000.
rate = data['value'] = data['value'] * 1000
# `group` is an integer code per row; `countries` is rebound to the distinct
# areas in order of first appearance. No `order=` keyword: pandas ignored it
# and removed it in 1.0.
group, countries = pd.factorize(data.area)
ncountries = len(countries)

# One scatter panel per country on a 2 x 3 grid, all sharing the same y-range.
for i, country in enumerate(countries):
    plt.subplot(2, 3, i + 1)
    plt.title(country)
    d = data[data.area == country]
    plt.plot(d.age, d.value, '.')
    plt.ylim(0, rate.max())

# Evenly spaced knots spanning the observed age range.
nknots = 10
knots = np.linspace(data.age_start.min(), data.age_end.max(), nknots)
# Source: https://github.com/pymc-devs/pymc3/blob/master/pymc3/examples/stochastic_volatility.py
import numpy as np
import matplotlib.pyplot as plt
import pymc3
import pymc3.distributions.timeseries as ts
import pandas as pd
import scipy

# Keep only the last `n` rows of the packaged S&P 500 returns file.
n = 400
returns = np.genfromtxt(pymc3.get_data_file('pymc3.examples', "data/SP500.csv"))[-n:]
returns[:5]  # bare expression: has no effect when run as a script

plt.plot(returns)
plt.ylabel('daily returns in %')

with pymc3.Model() as sp500_model:
    # Priors: Student-t degrees of freedom and random-walk step scale.
    nu = pymc3.Exponential('nu', 1./10, testval=5.)
    sigma = pymc3.Exponential('sigma', 1./.02, testval=.1)

    # Latent random walk `s`, one value per observed return.
    s = ts.GaussianRandomWalk('s', sigma**-2, shape=len(returns))

    # exp(-2 * s) is recorded as a deterministic; its reciprocal is the
    # `lam` argument of the Student-t likelihood on the observed returns.
    volatility_process = pymc3.Deterministic('volatility_process', pymc3.exp(-2*s))
    r = pymc3.StudentT('r', nu, lam=1/volatility_process, observed=returns)

with sp500_model:
    # Call form of print: valid on Python 3 and prints the same text on
    # Python 2.
    print('optimizing...')