import itertools as it
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from baxcat.engine import Engine


def run(n_models=10, n_iter=200, iter_step=10, n_needles=2, n_distractors=8,
        n_rows=100, pairtype=None, pair_kws=None):
    # Each needle is a pair of adjacent, dependent columns; the remaining
    # columns are independent distractors.
    needle_idxs = [(2*i, 2*i+1,) for i in range(n_needles)]
    needle_cols = list(range(n_needles*2))
    distractor_cols = list(range(n_needles*2, n_needles*2+n_distractors))

    # Track up to 32 needle-distractor pairs for comparison.
    combs = list(it.product(needle_cols, distractor_cols))
    distractor_idxs = random.sample(combs, min(len(combs), 32))

    df = _gen_data(n_needles, n_distractors, n_rows, pairtype, pair_kws)

    engine = Engine(df, n_models=n_models)
    engine.init_models()

    # for model in engine._models:
    #     # XXX: emulates the log grid expected alpha
    #     # e.g. mean(exp(linspace(log(1/n_rows), log(n_rows))))
    #     # model['state_alpha'] = .5*(n_needles*2. + n_distractors)
    #     model['state_alpha'] = 100.

    # no column_alpha transition
    tlist = [b'row_assignment', b'column_assignment', b'row_alpha',
             b'column_hypers']

    n_steps = int(n_iter/iter_step)
    needle_dps = np.zeros((n_needles, n_steps+1,))
    distractor_dps = np.zeros((len(distractor_idxs), n_steps+1,))
    for i in range(n_steps+1):
        engine.run(iter_step, trans_kwargs={'transition_list': tlist})
        for nidx, (a, b) in enumerate(needle_idxs):
            a = df.columns[a]
            b = df.columns[b]
            needle_dps[nidx, i] = engine.dependence_probability(a, b)

        for didx, (a, b) in enumerate(distractor_idxs):
            a = df.columns[a]
            b = df.columns[b]
            distractor_dps[didx, i] = engine.dependence_probability(a, b)

    # Distractor pairs in gray; needle pairs in crimson.
    iter_count = np.cumsum([1] + [iter_step]*n_steps)
    for y in distractor_dps:
        plt.plot(iter_count, y, color='gray', alpha=.3)
    for y in needle_dps:
        plt.plot(iter_count, y, color='crimson')
    # plt.gca().set_xscale('log')
    plt.ylim([-.05, 1.05])
    plt.xlim([1, iter_count[-1]])
    plt.show()

    engine.heatmap('dependence_probability')
    plt.show()
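# `_gen_data` is defined elsewhere in this module and is not shown here. For
# context, a minimal sketch of a generator with the same signature, assuming
# each needle is a pair of correlated Gaussian columns and each distractor is
# independent noise. The real helper presumably uses `pairtype`/`pair_kws` to
# choose the dependence type; this sketch ignores them.
def _gen_data_sketch(n_needles, n_distractors, n_rows, pairtype=None,
                     pair_kws=None):
    rho = .9  # correlation within each needle pair
    cols = []
    for _ in range(n_needles):
        x = np.random.randn(n_rows)
        y = rho*x + np.sqrt(1. - rho**2)*np.random.randn(n_rows)
        cols += [x, y]
    for _ in range(n_distractors):
        cols.append(np.random.randn(n_rows))
    data = np.column_stack(cols)
    return pd.DataFrame(data, columns=['c_%d' % i for i in range(data.shape[1])])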
# We take several models so that estimates average over many posterior
# samples.
print('Initializing 32 models...')
engine.init_models()

print('Running models for 200 iterations...')
engine.run(200, checkpoint=5)

# To check whether inference has converged, we plot the log score for each
# model as a function of time and make sure they have all leveled out.
engine.convergence_plot()
plt.show()

# We can view which columns are dependent on which other columns by plotting
# an n_cols-by-n_cols matrix where each cell is the dependence probability
# between two columns. Note that the dependence probability is simply the
# probability that a dependence exists, not the strength of the dependence.
engine.heatmap('dependence_probability', plot_kwargs={'figsize': (10, 10,)})
plt.show()

engine.heatmap('row_similarity', plot_kwargs={'figsize': (10, 10,)})
plt.show()

# The paint job is an important part of what makes a pinewood derby car fast,
# but does it matter for animals? We'll use the Linfoot information to
# determine how predictive variables are of whether an animal is fast. Linfoot
# is basically the information-theoretic counterpart to correlation.
linfoot_lean = engine.mutual_information('fast', 'lean', linfoot=True)
linfoot_stripes = engine.mutual_information('fast', 'stripes', linfoot=True)

print('Linfoot(fast, lean) = %f' % (linfoot_lean,))
print('Linfoot(fast, stripes) = %f' % (linfoot_stripes,))
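# For reference, the Linfoot information is a fixed transform of mutual
# information onto [0, 1], which is what makes it comparable to a correlation
# coefficient. A minimal sketch of the textbook definition (Linfoot, 1957);
# baxcat's internal computation may differ in estimation details.
def linfoot_from_mi(mi):
    # L = sqrt(1 - exp(-2*I)) for mutual information I in nats; L = 0 iff
    # the variables are independent, and L -> 1 as I -> infinity.
    return np.sqrt(1. - np.exp(-2.*mi))

# e.g., assuming the raw estimate is in nats:
# linfoot_from_mi(engine.mutual_information('fast', 'lean', linfoot=False))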
    std = 1.
    for i in range(n_rows):
        a = np.random.randint(3)
        b = np.random.randint(3)
        mu = mus[a, b]
        x = np.random.randn()*std + mu
        data.append([x, a, b])

    return pd.DataFrame(data)


n_rows = 100
n_cols = 32

# Phenotype data: a continuous trait T whose mean depends on two discrete
# markers, A and B, plus 32 independent discrete noise columns.
da = gen_phenotype_data(n_rows)
db = pd.DataFrame(np.random.randint(3, size=(n_rows, n_cols,)))

df = pd.concat([da, db], axis=1)
df.columns = ['T', 'A', 'B'] + ['x_%d' % i for i in range(n_cols)]

engine = Engine(df, n_models=32)
engine.init_models()
engine.run(100)

# Columns that share information with T have low conditional entropy given T,
# so 1/H(col|T) should stand out for A and B.
for col in df.columns:
    if col != 'T':
        print("1/H(%s|T) = %f" % (col, 1/engine.conditional_entropy(col, 'T')))

engine.heatmap('dependence_probability')
plt.show()
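# For intuition about the 1/H(col|T) scores: a plug-in conditional entropy
# computed directly from empirical frequencies gives a rough, model-free
# sanity check for pairs of discrete columns. `plugin_conditional_entropy`
# is a hypothetical helper, not part of the baxcat API; the engine's estimate
# is model-based and also handles the continuous target.
def plugin_conditional_entropy(frame, col, given):
    # H(col | given) = H(col, given) - H(given), with entropies (in nats)
    # estimated from empirical cell frequencies.
    def entropy(counts):
        p = counts/counts.sum()
        p = p[p > 0]
        return -np.sum(p*np.log(p))

    h_joint = entropy(frame.groupby([col, given]).size().values)
    h_given = entropy(frame.groupby(given).size().values)
    return h_joint - h_given

# e.g., H(x_0 | A) should be close to the marginal entropy of x_0, since the
# x_* columns are independent noise:
print(plugin_conditional_entropy(df, 'x_0', 'A'))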