def calculate_MI_bounded_discrete(X, Y, M_c, X_L, X_D):
    get_view_index = lambda which_column: X_L['column_partition']['assignments'][which_column]

    view_X = get_view_index(X)
    view_Y = get_view_index(Y)

    # independent
    if view_X != view_Y:
        return 0.0

    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    cluster_crps = numpy.exp(cluster_logps)  # get exp'ed values for multinomial
    n_clusters = len(cluster_crps)

    # get X values
    x_values = M_c['column_metadata'][X]['code_to_value'].values()

    # get Y values
    y_values = M_c['column_metadata'][Y]['code_to_value'].values()

    # get component models for each cluster for columns X and Y
    component_models_X = [0]*n_clusters
    component_models_Y = [0]*n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    MI = 0.0

    for x in x_values:
        for y in y_values:
            # calculate marginal logs
            Pxy = numpy.zeros(n_clusters)  # P(x,y), Joint distribution
            Px = numpy.zeros(n_clusters)   # P(x)
            Py = numpy.zeros(n_clusters)   # P(y)

            # get logp of x and y in each cluster. add cluster logp's
            for j in range(n_clusters):
                Px[j] = component_models_X[j].calc_element_predictive_logp(x)
                Py[j] = component_models_Y[j].calc_element_predictive_logp(y)
                Pxy[j] = Px[j] + Py[j] + cluster_logps[j]  # \sum_c P(x|c)P(y|c)P(c), Joint distribution
                Px[j] += cluster_logps[j]                  # \sum_c P(x|c)P(c)
                Py[j] += cluster_logps[j]                  # \sum_c P(y|c)P(c)

            # sum over clusters
            Px = logsumexp(Px)
            Py = logsumexp(Py)
            Pxy = logsumexp(Pxy)

            MI += numpy.exp(Pxy)*(Pxy - (Px + Py))

    # ignore MI < 0
    if MI <= 0.0:
        MI = 0.0

    return MI
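# For reference, the nested loops above compute the discrete mutual information
#   I(X;Y) = sum_{x,y} P(x,y) * (log P(x,y) - log P(x) - log P(y)),
# where P(x,y) = sum_c P(x|c) P(y|c) P(c) under the per-cluster independence
# assumption. The toy sketch below shows the same computation with hypothetical
# two-cluster categorical models and scipy's logsumexp standing in for the
# project's helper; it is illustrative, not the crosscat implementation.
import numpy
from scipy.special import logsumexp

cluster_logps = numpy.log([0.6, 0.4])          # log P(c)
logp_x_given_c = numpy.log([[0.9, 0.1],        # log P(x|c), one row per cluster
                            [0.2, 0.8]])
logp_y_given_c = numpy.log([[0.7, 0.3],        # log P(y|c)
                            [0.1, 0.9]])

MI = 0.0
for x in range(2):
    for y in range(2):
        Pxy = logsumexp(logp_x_given_c[:, x] + logp_y_given_c[:, y] + cluster_logps)
        Px = logsumexp(logp_x_given_c[:, x] + cluster_logps)
        Py = logsumexp(logp_y_given_c[:, y] + cluster_logps)
        MI += numpy.exp(Pxy) * (Pxy - (Px + Py))
# MI is positive for these toy numbers, as expected for a dependent pair.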
def simple_predictive_probability_unobserved(M_c, X_L, X_D, Y, query_row, query_columns, elements):
    n_queries = len(query_columns)

    answer = numpy.zeros(n_queries)
    # answers = numpy.array([])

    for n in range(n_queries):
        query_column = query_columns[n]
        x = elements[n]

        # get the view to which this column is assigned
        view_idx = X_L['column_partition']['assignments'][query_column]

        # get the logps for all the clusters (plus a new one) in this view
        cluster_logps = determine_cluster_logps(M_c, X_L, X_D, Y, query_row, view_idx)

        answers_n = numpy.zeros(len(cluster_logps))

        # cluster_logps should logsumexp to log(1)
        assert (numpy.abs(logsumexp(cluster_logps)) < .0000001)

        # enumerate over the clusters
        for cluster_idx in range(len(cluster_logps)):

            # get the cluster model for this cluster
            cluster_model = create_cluster_model_from_X_L(
                M_c, X_L, view_idx, cluster_idx)

            # get the specific cluster model for this column
            component_model = cluster_model[query_column]

            # construct draw constraints
            draw_constraints = get_draw_constraints(X_L, X_D, Y, query_row, query_column)

            # return the PDF value (exp)
            p_x = component_model.calc_element_predictive_logp_constrained(
                x, draw_constraints)

            answers_n[cluster_idx] = p_x + cluster_logps[cluster_idx]

        answer[n] = logsumexp(answers_n)

    return answer
def simple_predictive_probability_unobserved(
        M_c, X_L, X_D, Y, query_row, query_columns, elements):
    n_queries = len(query_columns)
    answer = numpy.zeros(n_queries)

    for n in range(n_queries):
        query_column = query_columns[n]
        x = elements[n]

        # Get the view to which this column is assigned.
        view_idx = X_L['column_partition']['assignments'][query_column]

        # Get the logps for all the clusters (plus a new one) in this view.
        cluster_logps = determine_cluster_logps(
            M_c, X_L, X_D, Y, query_row, view_idx)

        answers_n = numpy.zeros(len(cluster_logps))

        # `cluster_logps` should logsumexp to log(1).
        assert numpy.abs(logsumexp(cluster_logps)) < .0000001

        # Enumerate over the clusters.
        for cluster_idx in range(len(cluster_logps)):
            # Get the cluster model for this cluster.
            cluster_model = create_cluster_model_from_X_L(
                M_c, X_L, view_idx, cluster_idx)

            # Get the specific cluster model for this column.
            component_model = cluster_model[query_column]

            # Construct draw constraints.
            draw_constraints = get_draw_constraints(
                X_L, X_D, Y, query_row, query_column)

            # Return the PDF value (exp).
            p_x = component_model.calc_element_predictive_logp_constrained(
                x, draw_constraints)

            answers_n[cluster_idx] = p_x + cluster_logps[cluster_idx]

        answer[n] = logsumexp(answers_n)

    return answer
def determine_cluster_logps(M_c, X_L, X_D, Y, query_row, view_idx):
    view_state_i = X_L['view_state'][view_idx]
    cluster_crp_logps = determine_cluster_crp_logps(view_state_i)
    cluster_crp_logps = numpy.array(cluster_crp_logps)
    cluster_data_logps = determine_cluster_data_logps(M_c, X_L, X_D, Y, query_row, view_idx)
    cluster_data_logps = numpy.array(cluster_data_logps)

    # We need to compute the vector of probabilities log[P(Z=j|Y)] where `Z`
    # is the row cluster, `Y` are the constraints, and `j` iterates from 1 to
    # the number of clusters (plus 1 for a new cluster) in the row partition of
    # `view_idx`. Mathematically:
    #   log{P(Z=j|Y)} = log{P(Z=j)P(Y|Z=j) / P(Y)}
    #                 = log{P(Z=j)} + log{P(Y|Z=j)} - log{sum_k(P(Z=k)P(Y|Z=k))}
    #                 = cluster_crp_logps + cluster_data_logps - BAZ
    # The final term BAZ is computed by:
    #   log{sum_k(P(Z=k)P(Y|Z=k))}
    #     = log{sum_k(exp(log{P(Z=k)} + log{P(Y|Z=k)}))}
    #     = logsumexp(cluster_crp_logps + cluster_data_logps)
    cluster_logps = cluster_crp_logps + cluster_data_logps - \
        logsumexp(cluster_crp_logps + cluster_data_logps)

    return cluster_logps
def determine_cluster_logps(M_c, X_L, X_D, Y, query_row, view_idx):
    view_state_i = X_L['view_state'][view_idx]
    cluster_crp_logps = determine_cluster_crp_logps(view_state_i)
    cluster_crp_logps = numpy.array(cluster_crp_logps)
    cluster_data_logps = determine_cluster_data_logps(
        M_c, X_L, X_D, Y, query_row, view_idx)
    cluster_data_logps = numpy.array(cluster_data_logps)

    # We need to compute the vector of probabilities log[P(Z=j|Y)] where `Z`
    # is the row cluster, `Y` are the constraints, and `j` iterates from 1 to
    # the number of clusters (plus 1 for a new cluster) in the row partition of
    # `view_idx`. Mathematically:
    #   log{P(Z=j|Y)} = log{P(Z=j)P(Y|Z=j) / P(Y)}
    #                 = log{P(Z=j)} + log{P(Y|Z=j)} - log{sum_k(P(Z=k)P(Y|Z=k))}
    #                 = cluster_crp_logps + cluster_data_logps - BAZ
    # The final term BAZ is computed by:
    #   log{sum_k(P(Z=k)P(Y|Z=k))}
    #     = log{sum_k(exp(log{P(Z=k)} + log{P(Y|Z=k)}))}
    #     = logsumexp(cluster_crp_logps + cluster_data_logps)
    cluster_logps = cluster_crp_logps + cluster_data_logps - \
        logsumexp(cluster_crp_logps + cluster_data_logps)

    return cluster_logps
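# A quick numerical check of the normalization above, using hypothetical
# values standing in for the outputs of determine_cluster_crp_logps and
# determine_cluster_data_logps (three existing clusters plus a new one).
# The normalized posterior logsumexps to log(1) = 0, which is exactly what
# the assertion in simple_predictive_probability_unobserved relies on.
import numpy
from scipy.special import logsumexp

cluster_crp_logps = numpy.log([0.5, 0.3, 0.15, 0.05])        # log P(Z=j)
cluster_data_logps = numpy.array([-2.1, -0.4, -3.0, -1.2])   # log P(Y|Z=j)

cluster_logps = cluster_crp_logps + cluster_data_logps - \
    logsumexp(cluster_crp_logps + cluster_data_logps)

assert numpy.abs(logsumexp(cluster_logps)) < 1e-12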
def calculate_MI_bounded_discrete(X, Y, M_c, X_L, _X_D):
    get_view_index = lambda which_column: X_L['column_partition']['assignments'][which_column]

    view_X = get_view_index(X)
    view_Y = get_view_index(Y)

    # independent
    if view_X != view_Y:
        return 0.0

    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = numpy.array(su.determine_cluster_crp_logps(view_state))
    n_clusters = len(cluster_logps)

    # get X values
    x_values = M_c['column_metadata'][X]['code_to_value'].values()

    # get Y values
    y_values = M_c['column_metadata'][Y]['code_to_value'].values()

    # get component models for each cluster for columns X and Y
    component_models_X = [0]*n_clusters
    component_models_Y = [0]*n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    def marginal_predictive_logps_by_cluster(value, component_models):
        return numpy.array([
            component_models[j].calc_element_predictive_logp(value) + cluster_logps[j]
            for j in range(n_clusters)])

    x_marginal_predictive_logps_by_cluster = \
        [marginal_predictive_logps_by_cluster(x, component_models_X)
         for x in x_values]
    # \sum_c P(x|c)P(c)
    x_net_marginal_predictive_logps = \
        [logsumexp(ps) for ps in x_marginal_predictive_logps_by_cluster]

    y_marginal_predictive_logps_by_cluster = \
        [marginal_predictive_logps_by_cluster(y, component_models_Y)
         for y in y_values]
    # \sum_c P(y|c)P(c)
    y_net_marginal_predictive_logps = \
        [logsumexp(ps) for ps in y_marginal_predictive_logps_by_cluster]

    MI = 0.0

    for (i, x) in enumerate(x_values):
        x_marginals = x_marginal_predictive_logps_by_cluster[i]
        for (j, y) in enumerate(y_values):
            y_marginals = y_marginal_predictive_logps_by_cluster[j]

            # cluster prob is double-counted in sum of marginals
            joint_predictive_logp_by_cluster = \
                x_marginals + y_marginals - cluster_logps

            # \sum_c P(x|c)P(y|c)P(c), Joint distribution
            joint_predictive_logp = logsumexp(joint_predictive_logp_by_cluster)

            MI += math.exp(joint_predictive_logp) * \
                (joint_predictive_logp -
                 (x_net_marginal_predictive_logps[i] +
                  y_net_marginal_predictive_logps[j]))

    # ignore MI < 0
    if MI <= 0.0:
        MI = 0.0

    return MI
def estimate_MI_sample(X, Y, M_c, X_L, _X_D, get_next_seed, n_samples=1000):
    random_state = numpy.random.RandomState(get_next_seed())

    get_view_index = lambda which_column: X_L['column_partition']['assignments'][which_column]

    view_X = get_view_index(X)
    view_Y = get_view_index(Y)

    # independent
    if view_X != view_Y:
        return 0.0

    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    cluster_crps = numpy.exp(cluster_logps)  # get exp'ed values for multinomial
    n_clusters = len(cluster_crps)

    # get component models for each cluster for columns X and Y
    component_models_X = [0]*n_clusters
    component_models_Y = [0]*n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    # MI = 0.0  # mutual information
    MI = numpy.zeros(n_samples)
    weights = numpy.zeros(n_samples)

    for i in range(n_samples):
        # draw a cluster
        cluster_idx = numpy.nonzero(random_state.multinomial(1, cluster_crps))[0][0]

        # get a sample from each cluster
        x = component_models_X[cluster_idx].get_draw(get_next_seed())
        y = component_models_Y[cluster_idx].get_draw(get_next_seed())

        # calculate marginal logs
        Pxy = numpy.zeros(n_clusters)  # P(x,y), Joint distribution
        Px = numpy.zeros(n_clusters)   # P(x)
        Py = numpy.zeros(n_clusters)   # P(y)

        # get logp of x and y in each cluster. add cluster logp's
        for j in range(n_clusters):
            Px[j] = component_models_X[j].calc_element_predictive_logp(x)
            Py[j] = component_models_Y[j].calc_element_predictive_logp(y)
            Pxy[j] = Px[j] + Py[j] + cluster_logps[j]  # \sum_c P(x|c)P(y|c)P(c), Joint distribution
            Px[j] += cluster_logps[j]                  # \sum_c P(x|c)P(c)
            Py[j] += cluster_logps[j]                  # \sum_c P(y|c)P(c)

        # pdb.set_trace()
        # sum over clusters
        Px = logsumexp(Px)
        Py = logsumexp(Py)
        Pxy = logsumexp(Pxy)

        # add to MI
        # MI += Pxy - (Px + Py)
        MI[i] = Pxy - (Px + Py)
        weights[i] = Pxy

    # do weighted average with underflow protection
    # MI /= float(n_samples)
    Z = logsumexp(weights)
    weights = numpy.exp(weights - Z)
    MI_ret = numpy.sum(MI*weights)

    # ignore MI < 0
    if MI_ret <= 0.0:
        MI_ret = 0.0

    return MI_ret
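# The weighted average above normalizes the log-weights through logsumexp
# before exponentiating, so the weights remain usable even when every raw
# weight would underflow to zero. A toy illustration with made-up log-weights
# and values (not tied to the estimator above):
import numpy
from scipy.special import logsumexp

log_weights = numpy.array([-1050.0, -1049.0, -1052.0])  # numpy.exp() of these underflows to 0
values = numpy.array([0.2, 0.5, 0.1])

weights = numpy.exp(log_weights - logsumexp(log_weights))  # normalized, sums to 1
estimate = numpy.sum(values * weights)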
def test_logsumexp():
    inf = float('inf')
    nan = float('nan')
    with pytest.raises(OverflowError):
        math.log(sum(map(math.exp, range(1000))))
    assert relerr(999.4586751453871, gu.logsumexp(range(1000))) < 1e-15
    assert gu.logsumexp([]) == -inf
    assert gu.logsumexp([-1000.]) == -1000.
    assert gu.logsumexp([-1000., -1000.]) == -1000. + math.log(2.)
    assert relerr(math.log(2.), gu.logsumexp([0., 0.])) < 1e-15
    assert gu.logsumexp([-inf, 1]) == 1
    assert gu.logsumexp([-inf, -inf]) == -inf
    assert gu.logsumexp([+inf, +inf]) == +inf
    assert math.isnan(gu.logsumexp([-inf, +inf]))
    assert math.isnan(gu.logsumexp([nan, inf]))
    assert math.isnan(gu.logsumexp([nan, -3]))
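# The test above pins down the edge-case behaviour of gu.logsumexp. Below is a
# minimal sketch consistent with those cases (the max-shift trick plus explicit
# handling of empty input, nan, and infinities); it is illustrative and not the
# actual gu.logsumexp implementation.
import math

def logsumexp_sketch(xs):
    xs = list(xs)
    if not xs:
        return -float('inf')
    # Any nan input poisons the result.
    if any(math.isnan(x) for x in xs):
        return float('nan')
    m = max(xs)
    if math.isinf(m):
        # All -inf sums to -inf; mixing +inf with -inf is indeterminate (nan);
        # otherwise a +inf term dominates.
        if m > 0 and any(x == -float('inf') for x in xs):
            return float('nan')
        return m
    # Shifting by the max keeps every exponent <= 0, so exp cannot overflow.
    return m + math.log(sum(math.exp(x - m) for x in xs))

assert logsumexp_sketch(range(1000)) > 999  # finite where the naive form overflows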
def calculate_MI_bounded_discrete(X, Y, M_c, X_L, X_D):
    get_view_index = lambda which_column: X_L['column_partition'][
        'assignments'][which_column]

    view_X = get_view_index(X)
    view_Y = get_view_index(Y)

    # independent
    if view_X != view_Y:
        return 0.0

    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    cluster_crps = numpy.exp(
        cluster_logps)  # get exp'ed values for multinomial
    n_clusters = len(cluster_crps)

    # get X values
    x_values = M_c['column_metadata'][X]['code_to_value'].values()

    # get Y values
    y_values = M_c['column_metadata'][Y]['code_to_value'].values()

    # get component models for each cluster for columns X and Y
    component_models_X = [0] * n_clusters
    component_models_Y = [0] * n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    MI = 0.0

    for x in x_values:
        for y in y_values:
            # calculate marginal logs
            Pxy = numpy.zeros(n_clusters)  # P(x,y), Joint distribution
            Px = numpy.zeros(n_clusters)  # P(x)
            Py = numpy.zeros(n_clusters)  # P(y)

            # get logp of x and y in each cluster. add cluster logp's
            for j in range(n_clusters):
                Px[j] = component_models_X[j].calc_element_predictive_logp(x)
                Py[j] = component_models_Y[j].calc_element_predictive_logp(y)
                Pxy[j] = Px[j] + Py[j] + cluster_logps[
                    j]  # \sum_c P(x|c)P(y|c)P(c), Joint distribution
                Px[j] += cluster_logps[j]  # \sum_c P(x|c)P(c)
                Py[j] += cluster_logps[j]  # \sum_c P(y|c)P(c)

            # sum over clusters
            Px = logsumexp(Px)
            Py = logsumexp(Py)
            Pxy = logsumexp(Pxy)

            MI += numpy.exp(Pxy) * (Pxy - (Px + Py))

    # ignore MI < 0
    if MI <= 0.0:
        MI = 0.0

    return MI