Example #1
0
def run(n_times=5, n_grid=5, n=200, n_iter=200, vartype="continuous", ax=None):
    """Estimate mutual information at several correlation strengths and
    compare the estimates against the analytic truth.

    Parameters
    ----------
    n_times : int
        Number of independent repetitions per correlation value.
    n_grid : int
        Number of categories per variable (categorical case only).
    n : int
        Number of samples drawn per repetition.
    n_iter : int
        Number of inference iterations run on each Engine.
    vartype : str
        Either "continuous" (bivariate Gaussian) or "categorical".
    ax : matplotlib axes, optional
        If given, results are plotted onto it and nothing is returned.

    Returns
    -------
    (mis, true_mis) : tuple of np.ndarray
        Only when ``ax is None``: the (n_times, len(rhos)) matrix of MI
        estimates and the length-len(rhos) vector of true MI values.

    Raises
    ------
    ValueError
        If ``vartype`` is neither "continuous" nor "categorical".
    """
    rhos = [0.1, 0.25, 0.4, 0.5, 0.75, 0.9]

    true_mis = np.zeros(len(rhos))
    mis = np.zeros((n_times, len(rhos)))

    for i, rho in enumerate(rhos):
        print("Rho: %1.1f" % (rho,))

        if vartype == "categorical":
            p, true_mi = _gen_categorical_joint_dist(rho, n_grid)
            metadata = {
                "x_1": {"dtype": "categorical", "values": list(range(n_grid))},
                "x_2": {"dtype": "categorical", "values": list(range(n_grid))},
            }
        elif vartype == "continuous":
            # Analytic MI of a bivariate Gaussian with correlation rho.
            true_mi = -0.5 * log(1.0 - rho ** 2.0)
            metadata = {}
        else:
            raise ValueError("invalid vartype")

        # Loop-invariant: record the true MI once per rho, not once per trial.
        true_mis[i] = true_mi

        for t in range(n_times):
            if vartype == "categorical":
                x = _sample_from_bivariate_discrete(p, n)
            else:
                # vartype == "continuous"; invalid values were rejected above,
                # so the former unreachable `raise` branch is dropped.
                sigma = np.array([[1, rho], [rho, 1]])
                mu = np.zeros(2)
                x = np.random.multivariate_normal(mu, sigma, size=n)

            df = pd.DataFrame(x, columns=["x_1", "x_2"])

            engine = Engine(df, n_models=1, metadata=metadata, use_mp=False)
            engine.init_models()
            engine.run(n_iter)

            mis[t, i] = engine.mutual_information("x_1", "x_2", n_samples=500, normed=False)

    if ax is not None:
        ax.errorbar(rhos, y=np.mean(mis, axis=0), yerr=np.std(mis, axis=0), label="BaxCat")
        ax.plot(rhos, true_mis, label="True")

        ax.set_xlabel("rho")
        ax.set_ylabel("Mutual Information")
        ax.set_title(vartype)
        ax.legend(loc=0)
    else:
        return mis, true_mis
Example #2
0
def run(n_times=5, n_grid=5, n=200, n_iter=200, vartype='continuous', ax=None):
    """Sweep a list of correlation values, estimate the mutual information
    between the two variables with a single-model Engine, and either plot
    the estimates against the analytic truth (when *ax* is given) or return
    the raw estimate matrix and the true values.
    """
    rhos = [.1, .25, .4, .5, .75, .9]
    n_rhos = len(rhos)

    true_mis = np.zeros(n_rhos)
    mis = np.zeros((n_times, n_rhos))

    for col, rho in enumerate(rhos):
        print('Rho: %1.1f' % (rho, ))

        if vartype == 'categorical':
            p, true_mi = _gen_categorical_joint_dist(rho, n_grid)
            # One fresh value list per column, as the original builds.
            metadata = {
                col_name: {
                    'dtype': 'categorical',
                    'values': [k for k in range(n_grid)],
                }
                for col_name in ('x_1', 'x_2')
            }
        elif vartype == 'continuous':
            # Closed-form MI for a bivariate Gaussian with correlation rho.
            true_mi = -.5 * log(1. - rho**2.)
            metadata = {}
        else:
            raise ValueError('invalid vartype')

        for trial in range(n_times):
            if vartype == 'categorical':
                sample = _sample_from_bivariate_discrete(p, n)
            elif vartype == 'continuous':
                cov = np.array([[1, rho], [rho, 1]])
                sample = np.random.multivariate_normal(np.zeros(2), cov, size=n)
            else:
                raise ValueError('invalid vartype')

            frame = pd.DataFrame(sample, columns=['x_1', 'x_2'])

            engine = Engine(frame, n_models=1, metadata=metadata, use_mp=False)
            engine.init_models()
            engine.run(n_iter)

            true_mis[col] = true_mi
            mis[trial, col] = engine.mutual_information(
                'x_1', 'x_2', n_samples=500, normed=False)

    # Without an axes object, hand the raw numbers back to the caller.
    if ax is None:
        return mis, true_mis

    ax.errorbar(rhos, y=np.mean(mis, axis=0), yerr=np.std(mis, axis=0),
                label='BaxCat')
    ax.plot(rhos, true_mis, label='True')
    ax.set_xlabel('rho')
    ax.set_ylabel('Mutual Information')
    ax.set_title(vartype)
    ax.legend(loc=0)
Example #3
0
# Plot an n_cols-by-n_cols matrix whose cell (i, j) is the dependence
# probability between columns i and j.  Note that this is the probability
# that a dependence exists, not the strength of that dependence.
engine.heatmap('dependence_probability', plot_kwargs=dict(figsize=(10, 10)))
plt.show()

# Same style of heatmap, but cell (i, j) is the similarity between rows.
engine.heatmap('row_similarity', plot_kwargs=dict(figsize=(10, 10)))
plt.show()

# The paint job is an important part of what makes a pine wood derby car
# fast, but does it matter for animals?  Linfoot information is basically
# the information-theoretic counterpart to correlation, so use it to see
# how predictive each variable is of whether an animal is fast.
# NOTE(review): linfoot=False is passed despite the Linfoot-named variables
# and labels below -- confirm whether linfoot=True was intended.
mi_lean = engine.mutual_information('fast', 'lean', linfoot=False)
mi_stripes = engine.mutual_information('fast', 'stripes', linfoot=False)

print('Linfoot(fast, lean) = %f' % (mi_lean,))
print('Linfoot(fast, stripes) = %f' % (mi_stripes,))

# Which animals are more similar?  Is a wolf more similar to a chihuahua
# than a rat is?
wolf_similarity = engine.row_similarity('chihuahua', 'wolf')
rat_similarity = engine.row_similarity('chihuahua', 'rat')

print('Similarity between Chihuahuas and wolves is %f' % (wolf_similarity,))
print('Similarity between Chihuahuas and rats is %f' % (rat_similarity,))


# Which animals are outliers with respect to their being fast? We can find out
Example #4
0
    10,
    10,
)})
plt.show()

# Row-similarity heatmap for the same engine, same figure size.
engine.heatmap('row_similarity', plot_kwargs=dict(figsize=(10, 10)))
plt.show()

# The paint job matters for a pine wood derby car -- does anything analogous
# matter for animals?  Linfoot information is basically the
# information-theoretic counterpart to correlation; use it to measure how
# predictive 'lean' and 'stripes' are of 'fast'.
# NOTE(review): linfoot=False is passed despite the Linfoot-named variables
# and labels below -- confirm whether linfoot=True was intended.
linfoot_lean = engine.mutual_information('fast', 'lean', linfoot=False)
linfoot_stripes = engine.mutual_information('fast', 'stripes', linfoot=False)

print('Linfoot(fast, lean) = %f' % (linfoot_lean,))
print('Linfoot(fast, stripes) = %f' % (linfoot_stripes,))

# Which animals are more similar -- is a chihuahua closer to a wolf or
# to a rat?
sim_wolves = engine.row_similarity('chihuahua', 'wolf')
sim_rats = engine.row_similarity('chihuahua', 'rat')

print('Similarity between Chihuahuas and wolves is %f' % (sim_wolves,))
print('Similarity between Chihuahuas and rats is %f' % (sim_rats,))

# Which animals are outliers with respect to their being fast? We can find
# out by calculating the surprisal (self information).