def run(n_times=5, n_grid=5, n=200, n_iter=200, vartype="continuous", ax=None): rhos = [0.1, 0.25, 0.4, 0.5, 0.75, 0.9] true_mis = np.zeros(len(rhos)) mis = np.zeros((n_times, len(rhos))) for i, rho in enumerate(rhos): print("Rho: %1.1f" % (rho,)) if vartype == "categorical": p, true_mi = _gen_categorical_joint_dist(rho, n_grid) metadata = { "x_1": {"dtype": "categorical", "values": [i for i in range(n_grid)]}, "x_2": {"dtype": "categorical", "values": [i for i in range(n_grid)]}, } elif vartype == "continuous": true_mi = -0.5 * log(1.0 - rho ** 2.0) metadata = {} else: raise ValueError("invalid vartype") for t in range(n_times): if vartype == "categorical": x = _sample_from_bivariate_discrete(p, n) elif vartype == "continuous": sigma = np.array([[1, rho], [rho, 1]]) mu = np.zeros(2) x = np.random.multivariate_normal(mu, sigma, size=n) else: raise ValueError("invalid vartype") df = pd.DataFrame(x, columns=["x_1", "x_2"]) engine = Engine(df, n_models=1, metadata=metadata, use_mp=False) engine.init_models() engine.run(n_iter) true_mis[i] = true_mi mis[t, i] = engine.mutual_information("x_1", "x_2", n_samples=500, normed=False) if ax is not None: ax.errorbar(rhos, y=np.mean(mis, axis=0), yerr=np.std(mis, axis=0), label="BaxCat") ax.plot(rhos, true_mis, label="True") ax.set_xlabel("rho") ax.set_ylabel("Mutual Information") ax.set_title(vartype) ax.legend(loc=0) else: return mis, true_mis
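# A minimal usage sketch, not part of the original script: run both variable
# types and plot the estimated MI against the true MI on side-by-side axes.
# Assumes matplotlib is available; the figure size is arbitrary.
if __name__ == "__main__":
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    run(vartype="continuous", ax=axes[0])
    run(vartype="categorical", ax=axes[1])
    plt.show()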
# We can view which columns are dependent on which other columns by plotting
# an n_cols by n_cols matrix where each cell is the dependence probability
# between two columns. Note that the dependence probability is simply the
# probability that a dependence exists, not the strength of the dependence.
engine.heatmap('dependence_probability', plot_kwargs={'figsize': (10, 10)})
plt.show()

engine.heatmap('row_similarity', plot_kwargs={'figsize': (10, 10)})
plt.show()

# The paint job is an important part of what makes a pinewood derby car fast,
# but does it matter for animals? We'll use the Linfoot information to
# determine how predictive variables are of whether an animal is fast. Linfoot
# is basically the information-theoretic counterpart to correlation.
linfoot_lean = engine.mutual_information('fast', 'lean', linfoot=True)
linfoot_stripes = engine.mutual_information('fast', 'stripes', linfoot=True)

print('Linfoot(fast, lean) = %f' % (linfoot_lean,))
print('Linfoot(fast, stripes) = %f' % (linfoot_stripes,))

# We can also figure out which animals are more similar. Is a Chihuahua more
# similar to a wolf or to a rat?
sim_wolves = engine.row_similarity('chihuahua', 'wolf')
sim_rats = engine.row_similarity('chihuahua', 'rat')

print('Similarity between Chihuahuas and wolves is %f' % (sim_wolves,))
print('Similarity between Chihuahuas and rats is %f' % (sim_rats,))

# Which animals are outliers with respect to being fast? We can find out
# by calculating the surprisal (self-information).
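# A hedged sketch of that step, under the assumption that the Engine exposes a
# `surprisal` method returning a per-row table with a 'surprisal' column; the
# call and column name below are assumptions, not code from the original.
s = engine.surprisal('fast')
print(s.sort_values(by='surprisal', ascending=False).head(10))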