def calculate_MI_bounded_discrete(X, Y, M_c, X_L, X_D):
    """Exactly compute the mutual information between discrete columns X and Y
    under a single model state.

    Sums P(x, y) * (log P(x, y) - log P(x)P(y)) over the full (finite) grid of
    column codes, marginalizing over the view's clusters.

    Parameters
    ----------
    X, Y : int
        Column indices.
    M_c : dict
        Column metadata; 'code_to_value' enumerates each column's codes.
    X_L : dict
        Model latent state (column partition and view states).
    X_D : unused
        Kept for signature compatibility with the sibling MI estimators.

    Returns
    -------
    float
        Mutual information, clamped to be non-negative.
    """
    get_view_index = lambda which_column: X_L['column_partition']['assignments'][which_column]
    view_X = get_view_index(X)
    view_Y = get_view_index(Y)
    # Columns in different views are modeled as independent: MI is exactly 0.
    if view_X != view_Y:
        return 0.0
    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    # FIX: take n_clusters straight from the logps; the exp'ed copy was unused.
    n_clusters = len(cluster_logps)
    # outcome grids for both columns
    x_values = M_c['column_metadata'][X]['code_to_value'].values()
    y_values = M_c['column_metadata'][Y]['code_to_value'].values()
    # get component models for each cluster for columns X and Y
    component_models_X = [0] * n_clusters
    component_models_Y = [0] * n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    # PERF FIX: the original recomputed every per-cluster predictive logp
    # inside the x-by-y double loop; compute each value's marginals once.
    def _marginal_logps(value, component_models):
        # [log P(value | c) + log P(c)] for each cluster c
        return numpy.array(
            [component_models[j].calc_element_predictive_logp(value)
             + cluster_logps[j] for j in range(n_clusters)])

    x_marginals = [_marginal_logps(x, component_models_X) for x in x_values]
    y_marginals = [_marginal_logps(y, component_models_Y) for y in y_values]
    x_logps = [logsumexp(m) for m in x_marginals]  # log P(x) = log sum_c P(x|c)P(c)
    y_logps = [logsumexp(m) for m in y_marginals]  # log P(y) = log sum_c P(y|c)P(c)

    cluster_logps_arr = numpy.array(cluster_logps)
    MI = 0.0
    for i in range(len(x_marginals)):
        for j in range(len(y_marginals)):
            # The cluster prior is counted once in each marginal; subtract one
            # copy so the sum is sum_c P(x|c)P(y|c)P(c) — the joint.
            Pxy = logsumexp(x_marginals[i] + y_marginals[j] - cluster_logps_arr)
            MI += numpy.exp(Pxy) * (Pxy - (x_logps[i] + y_logps[j]))
    # MI is non-negative in theory; clamp numerical negatives to 0.
    if MI <= 0.0:
        MI = 0.0
    return MI
def sample_from_view(M_c, X_L, X_D, get_next_seed):
    """Draw one (x, y) sample from columns 0 and 1 of a single model state.

    A cluster is drawn from each column's view CRP, then one value is drawn
    from each column's component model in that cluster. If both columns share
    a view, the same cluster is used for both draws.

    Parameters
    ----------
    M_c : dict
        Column metadata.
    X_L : dict
        Model latent state.
    X_D : unused
        Kept for signature compatibility.
    get_next_seed : callable
        Returns a fresh RNG seed per draw.

    Returns
    -------
    (x, y) : drawn values for columns 0 and 1.
    """
    view_col = X_L['column_partition']['assignments'][0]
    view_col2 = X_L['column_partition']['assignments'][1]
    same_view = True
    if view_col2 != view_col:
        same_view = False
    view_state = X_L['view_state'][view_col]
    view_state2 = X_L['view_state'][view_col2]
    cluster_crps = numpy.exp(su.determine_cluster_crp_logps(view_state))
    cluster_crps2 = numpy.exp(su.determine_cluster_crp_logps(view_state2))
    # sanity check: CRP cluster probabilities must sum to 1
    assert (math.fabs(numpy.sum(cluster_crps) - 1) < .00000001)
    # BUG FIX: removed `samples = numpy.zeros((n, 2))` — `n` was never defined
    # (NameError at runtime) and the array was never used.
    cluster_idx1 = numpy.nonzero(numpy.random.multinomial(1, cluster_crps))[0][0]
    cluster_model1 = su.create_cluster_model_from_X_L(M_c, X_L, view_col, cluster_idx1)
    if same_view:
        # same view: both columns must be sampled from the same cluster
        cluster_idx2 = cluster_idx1
        cluster_model2 = cluster_model1
    else:
        cluster_idx2 = numpy.nonzero(numpy.random.multinomial(
            1, cluster_crps2))[0][0]
        cluster_model2 = su.create_cluster_model_from_X_L(
            M_c, X_L, view_col2, cluster_idx2)
    component_model1 = cluster_model1[0]
    x = component_model1.get_draw(get_next_seed())
    component_model2 = cluster_model2[1]
    y = component_model2.get_draw(get_next_seed())
    return x, y
def sample_from_view(M_c, X_L, X_D, get_next_seed):
    """Draw one (x, y) sample from columns 0 and 1 of a single model state.

    NOTE(review): duplicate definition — an identical `sample_from_view`
    appears earlier in this file; this later one shadows it at import time.

    Parameters are as in the earlier copy; `X_D` is unused and kept only for
    signature compatibility.
    """
    view_col = X_L['column_partition']['assignments'][0]
    view_col2 = X_L['column_partition']['assignments'][1]
    same_view = view_col2 == view_col
    view_state = X_L['view_state'][view_col]
    view_state2 = X_L['view_state'][view_col2]
    cluster_crps = numpy.exp(su.determine_cluster_crp_logps(view_state))
    cluster_crps2 = numpy.exp(su.determine_cluster_crp_logps(view_state2))
    # sanity check: CRP cluster probabilities must sum to 1
    assert (math.fabs(numpy.sum(cluster_crps) - 1) < .00000001)
    # BUG FIX: dropped `samples = numpy.zeros((n, 2))` — `n` was undefined
    # (guaranteed NameError) and `samples` was never used.
    cluster_idx1 = numpy.nonzero(numpy.random.multinomial(1, cluster_crps))[0][0]
    cluster_model1 = su.create_cluster_model_from_X_L(M_c, X_L, view_col, cluster_idx1)
    if same_view:
        # shared view: one cluster draw covers both columns
        cluster_idx2 = cluster_idx1
        cluster_model2 = cluster_model1
    else:
        cluster_idx2 = numpy.nonzero(numpy.random.multinomial(1, cluster_crps2))[0][0]
        cluster_model2 = su.create_cluster_model_from_X_L(M_c, X_L, view_col2, cluster_idx2)
    component_model1 = cluster_model1[0]
    x = component_model1.get_draw(get_next_seed())
    component_model2 = cluster_model2[1]
    y = component_model2.get_draw(get_next_seed())
    return x, y
def estimiate_MI_sample(X, Y, M_c, X_L, X_D, get_next_seed, n_samples=1000):
    """Monte-Carlo estimate of the mutual information between columns X and Y.

    Draws (x, y) pairs from the model's joint and averages the pointwise
    log-ratio log P(x, y) - log P(x)P(y).

    NOTE(review): the name is misspelled ("estimiate") but kept for callers.
    `X_D` is unused; kept for signature compatibility.

    Returns
    -------
    float
        MI estimate, clamped to be non-negative.
    """
    def view_of(col):
        return X_L['column_partition']['assignments'][col]

    # Columns in different views are modeled as independent: MI is exactly 0.
    if view_of(X) != view_of(Y):
        return 0.0

    view = view_of(X)
    cluster_logps = su.determine_cluster_crp_logps(X_L['view_state'][view])
    cluster_crps = numpy.exp(cluster_logps)  # probabilities for multinomial
    n_clusters = len(cluster_crps)

    # one component model per cluster, for each column
    models_X = []
    models_Y = []
    for c in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view, c)
        models_X.append(cluster_models[X])
        models_Y.append(cluster_models[Y])

    total = 0.0
    for _ in range(n_samples):
        # pick a cluster, then draw one value per column from it
        c = numpy.nonzero(numpy.random.multinomial(1, cluster_crps))[0][0]
        x = models_X[c].get_draw(get_next_seed())
        y = models_Y[c].get_draw(get_next_seed())
        # per-cluster predictive logps for the drawn values
        lp_x = numpy.array([m.calc_element_predictive_logp(x) for m in models_X])
        lp_y = numpy.array([m.calc_element_predictive_logp(y) for m in models_Y])
        log_joint = logsumexp(lp_x + lp_y + cluster_logps)  # sum_c P(x|c)P(y|c)P(c)
        log_px = logsumexp(lp_x + cluster_logps)            # sum_c P(x|c)P(c)
        log_py = logsumexp(lp_y + cluster_logps)            # sum_c P(y|c)P(c)
        total += log_joint - (log_px + log_py)

    MI = total / float(n_samples)
    # MI is non-negative in theory; clamp sampling noise below zero.
    return MI if MI > 0.0 else 0.0
def estimiate_MI_sample_hist(X, Y, M_c, X_L, X_D, get_next_seed, n_samples=10000):
    """Histogram-based Monte-Carlo estimate of MI between columns X and Y.

    Samples (x, y) pairs from the model, bins them with a correlation-derived
    bin count, and computes MI from the empirical 2-D histogram.

    NOTE(review): the name is misspelled ("estimiate") but kept for callers;
    `X_D` is unused. `corr` is an external helper — presumably Pearson's r
    with a second return value (p-value?); confirm against its definition.

    Returns
    -------
    float
        MI estimate, clamped to be non-negative.
    """
    get_view_index = lambda which_column: X_L['column_partition'][
        'assignments'][which_column]
    view_X = get_view_index(X)
    view_Y = get_view_index(Y)
    # Columns in different views are modeled as independent: MI is exactly 0.
    if view_X != view_Y:
        return 0.0
    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    cluster_crps = numpy.exp(cluster_logps)
    n_clusters = len(cluster_crps)
    # get component models for each cluster for columns X and Y
    component_models_X = [0] * n_clusters
    component_models_Y = [0] * n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    # FIX: dropped the redundant (n_samples, 2) array that duplicated
    # samples_x/samples_y, and the dead `MI = 0.0` before sampling.
    samples_x = numpy.zeros(n_samples, dtype=float)
    samples_y = numpy.zeros(n_samples, dtype=float)
    for i in range(n_samples):
        # draw a cluster, then one value per column from that cluster
        cluster_idx = numpy.nonzero(numpy.random.multinomial(
            1, cluster_crps))[0][0]
        samples_x[i] = component_models_X[cluster_idx].get_draw(get_next_seed())
        samples_y[i] = component_models_Y[cluster_idx].get_draw(get_next_seed())

    # bin count k from the sample correlation (Bendat & Piersol style rule)
    N = float(n_samples)
    r, _ = corr(samples_x, samples_y)
    r_sq = r ** 2.
    # FIX: r**2 == 1 (perfectly correlated samples) divided by zero; clamp
    # just below 1 so k stays finite instead of raising ZeroDivisionError.
    if r_sq >= 1.0:
        r_sq = 1.0 - 1e-12
    k = round(.5 + .5 * (1 + 4 * ((6 * N * r_sq) / (1 - r_sq)) ** .5) ** .5) + 1

    # histogram ranges span +/- 3 standard deviations around each mean
    sigma_x = numpy.std(samples_x)
    mu_x = numpy.mean(samples_x)
    sigma_y = numpy.std(samples_y)
    mu_y = numpy.mean(samples_y)
    range_x = numpy.linspace(mu_x - 3. * sigma_x, mu_x + 3 * sigma_x, k)
    range_y = numpy.linspace(mu_y - 3. * sigma_y, mu_y + 3 * sigma_y, k)
    PXY, _, _ = numpy.histogram2d(samples_x, samples_y,
                                  bins=[range_x, range_y])
    PX, _ = numpy.histogram(samples_x, bins=range_x)
    PY, _ = numpy.histogram(samples_y, bins=range_y)

    MI = 0.0
    for i_x in range(PXY.shape[0]):
        for i_y in range(PXY.shape[1]):
            Pxy = PXY[i_x, i_y]
            Px = PX[i_x]
            Py = PY[i_y]
            # skip empty cells: 0 * log(0) contributes nothing
            if Pxy > 0.0 and Px > 0.0 and Py > 0.0:
                MI += (Pxy / N) * math.log(Pxy * N / (Px * Py))
    # MI is non-negative in theory; clamp estimator noise below zero.
    if MI <= 0.0:
        MI = 0.0
    return MI
def calculate_MI_bounded_discrete(X, Y, M_c, X_L, _X_D):
    """Exactly compute the mutual information between discrete columns X and Y
    under a single model state.

    Enumerates the full grid of column codes and sums
    P(x, y) * (log P(x, y) - log P(x)P(y)), marginalizing over the view's
    clusters. `_X_D` is unused; kept for signature compatibility.

    Returns
    -------
    float
        Mutual information, clamped to be non-negative.
    """
    assignments = X_L['column_partition']['assignments']
    view_X = assignments[X]
    view_Y = assignments[Y]
    # Columns in different views are modeled as independent: MI is exactly 0.
    if view_X != view_Y:
        return 0.0

    cluster_logps = numpy.array(
        su.determine_cluster_crp_logps(X_L['view_state'][view_X]))
    n_clusters = len(cluster_logps)

    # enumerable outcome grids for the two discrete columns
    x_values = M_c['column_metadata'][X]['code_to_value'].values()
    y_values = M_c['column_metadata'][Y]['code_to_value'].values()

    # one component model per cluster, for each column
    models_X = []
    models_Y = []
    for c in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, c)
        models_X.append(cluster_models[X])
        models_Y.append(cluster_models[Y])

    def per_cluster_marginals(value, models):
        # [log P(value | c) + log P(c)] for each cluster c
        return numpy.array(
            [m.calc_element_predictive_logp(value) + lp
             for m, lp in zip(models, cluster_logps)])

    x_marginals = [per_cluster_marginals(x, models_X) for x in x_values]
    y_marginals = [per_cluster_marginals(y, models_Y) for y in y_values]
    x_net = [logsumexp(m) for m in x_marginals]  # log P(x) = log sum_c P(x|c)P(c)
    y_net = [logsumexp(m) for m in y_marginals]  # log P(y) = log sum_c P(y|c)P(c)

    MI = 0.0
    for i in range(len(x_marginals)):
        for j in range(len(y_marginals)):
            # the cluster prior appears in both marginals; subtract one copy
            # so the sum is sum_c P(x|c)P(y|c)P(c) — the joint
            joint = logsumexp(x_marginals[i] + y_marginals[j] - cluster_logps)
            MI += math.exp(joint) * (joint - (x_net[i] + y_net[j]))
    # MI is non-negative in theory; clamp numerical negatives to 0.
    return MI if MI > 0.0 else 0.0
def estimate_MI_sample(X, Y, M_c, X_L, _X_D, get_next_seed, n_samples=1000):
    """Weighted Monte-Carlo estimate of the mutual information between
    columns X and Y.

    Draws (x, y) pairs from the model's joint, computes the pointwise
    log-ratio log P(x, y) - log P(x)P(y) per sample, and averages with
    weights proportional to each sample's joint probability, normalized in
    log space to avoid underflow. `_X_D` is unused; kept for compatibility.

    Returns
    -------
    float
        MI estimate, clamped to be non-negative.
    """
    random_state = numpy.random.RandomState(get_next_seed())
    assignments = X_L['column_partition']['assignments']
    view_X = assignments[X]
    view_Y = assignments[Y]
    # Columns in different views are modeled as independent: MI is exactly 0.
    if view_X != view_Y:
        return 0.0

    cluster_logps = su.determine_cluster_crp_logps(X_L['view_state'][view_X])
    cluster_crps = numpy.exp(cluster_logps)  # probabilities for multinomial
    n_clusters = len(cluster_crps)

    # one component model per cluster, for each column
    models_X = []
    models_Y = []
    for c in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, c)
        models_X.append(cluster_models[X])
        models_Y.append(cluster_models[Y])

    log_ratios = numpy.zeros(n_samples)   # per-sample log P(x,y) - log P(x)P(y)
    log_weights = numpy.zeros(n_samples)  # per-sample log P(x,y)
    for s in range(n_samples):
        # pick a cluster, then draw one value per column from it
        c = numpy.nonzero(random_state.multinomial(1, cluster_crps))[0][0]
        x = models_X[c].get_draw(get_next_seed())
        y = models_Y[c].get_draw(get_next_seed())
        # per-cluster predictive logps for the drawn values
        lp_x = numpy.array([m.calc_element_predictive_logp(x) for m in models_X])
        lp_y = numpy.array([m.calc_element_predictive_logp(y) for m in models_Y])
        log_joint = logsumexp(lp_x + lp_y + cluster_logps)  # sum_c P(x|c)P(y|c)P(c)
        log_px = logsumexp(lp_x + cluster_logps)            # sum_c P(x|c)P(c)
        log_py = logsumexp(lp_y + cluster_logps)            # sum_c P(y|c)P(c)
        log_ratios[s] = log_joint - (log_px + log_py)
        log_weights[s] = log_joint

    # normalize the weights in log space before exponentiating (underflow-safe)
    Z = logsumexp(log_weights)
    MI_ret = numpy.sum(log_ratios * numpy.exp(log_weights - Z))
    # MI is non-negative in theory; clamp sampling noise below zero.
    if MI_ret <= 0.0:
        MI_ret = 0.0
    return MI_ret
def estimiate_MI_sample_hist(X, Y, M_c, X_L, X_D, get_next_seed, n_samples=10000):
    """Histogram-based Monte-Carlo estimate of MI between columns X and Y.

    NOTE(review): duplicate definition — an identical
    `estimiate_MI_sample_hist` appears earlier in this file; this later one
    shadows it at import time. The misspelled name is kept for callers;
    `X_D` is unused. `corr` is an external helper — presumably Pearson's r
    with a second return value; confirm against its definition.

    Returns
    -------
    float
        MI estimate, clamped to be non-negative.
    """
    get_view_index = lambda which_column: X_L['column_partition']['assignments'][which_column]
    view_X = get_view_index(X)
    view_Y = get_view_index(Y)
    # Columns in different views are modeled as independent: MI is exactly 0.
    if view_X != view_Y:
        return 0.0
    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    cluster_crps = numpy.exp(cluster_logps)
    n_clusters = len(cluster_crps)
    # get component models for each cluster for columns X and Y
    component_models_X = [0] * n_clusters
    component_models_Y = [0] * n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    # FIX: removed the redundant (n_samples, 2) array that duplicated
    # samples_x/samples_y, and the dead `MI = 0.0` before sampling.
    samples_x = numpy.zeros(n_samples, dtype=float)
    samples_y = numpy.zeros(n_samples, dtype=float)
    for i in range(n_samples):
        # draw a cluster, then one value per column from that cluster
        cluster_idx = numpy.nonzero(numpy.random.multinomial(1, cluster_crps))[0][0]
        samples_x[i] = component_models_X[cluster_idx].get_draw(get_next_seed())
        samples_y[i] = component_models_Y[cluster_idx].get_draw(get_next_seed())

    # bin count k from the sample correlation
    N = float(n_samples)
    r, _ = corr(samples_x, samples_y)
    r_sq = r ** 2.
    # FIX: r**2 == 1 (perfectly correlated samples) divided by zero; clamp
    # just below 1 so k stays finite instead of raising ZeroDivisionError.
    if r_sq >= 1.0:
        r_sq = 1.0 - 1e-12
    k = round(.5 + .5 * (1 + 4 * ((6 * N * r_sq) / (1 - r_sq)) ** .5) ** .5) + 1

    # histogram ranges span +/- 3 standard deviations around each mean
    sigma_x = numpy.std(samples_x)
    mu_x = numpy.mean(samples_x)
    sigma_y = numpy.std(samples_y)
    mu_y = numpy.mean(samples_y)
    range_x = numpy.linspace(mu_x - 3. * sigma_x, mu_x + 3 * sigma_x, k)
    range_y = numpy.linspace(mu_y - 3. * sigma_y, mu_y + 3 * sigma_y, k)
    PXY, _, _ = numpy.histogram2d(samples_x, samples_y, bins=[range_x, range_y])
    PX, _ = numpy.histogram(samples_x, bins=range_x)
    PY, _ = numpy.histogram(samples_y, bins=range_y)

    MI = 0.0
    for i_x in range(PXY.shape[0]):
        for i_y in range(PXY.shape[1]):
            Pxy = PXY[i_x, i_y]
            Px = PX[i_x]
            Py = PY[i_y]
            # skip empty cells: 0 * log(0) contributes nothing
            if Pxy > 0.0 and Px > 0.0 and Py > 0.0:
                MI += (Pxy / N) * math.log(Pxy * N / (Px * Py))
    # MI is non-negative in theory; clamp estimator noise below zero.
    if MI <= 0.0:
        MI = 0.0
    return MI