Exemple #1
0
def calculate_MI_bounded_discrete(X, Y, M_c, X_L, X_D):
    """Compute the mutual information of two discrete columns by enumeration.

    Every pair of values listed in the two columns' ``code_to_value``
    metadata is visited and ``P(x,y) * (log P(x,y) - log P(x) - log P(y))``
    is accumulated, where each probability is a mixture over the clusters of
    the view that both columns belong to.

    Args:
        X: index of the first column.
        Y: index of the second column.
        M_c: metadata; ``M_c['column_metadata'][col]['code_to_value']``
            supplies the values that are enumerated for each column.
        X_L: latent state; ``column_partition``/``assignments`` and
            ``view_state`` are read from it.
        X_D: unused; kept so the signature stays compatible with callers.

    Returns:
        ``0.0`` when the columns are assigned to different views (treated as
        independent) or when the accumulated sum is not positive; otherwise
        the accumulated mutual information.
    """
    assignments = X_L['column_partition']['assignments']
    view_X = assignments[X]
    view_Y = assignments[Y]

    # columns in different views are treated as independent
    if view_X != view_Y:
        return 0.0

    # log weight of each cluster in the shared view
    view_state = X_L['view_state'][view_X]
    cluster_logps = numpy.asarray(
        su.determine_cluster_crp_logps(view_state), dtype=float)
    n_clusters = len(cluster_logps)

    x_values = M_c['column_metadata'][X]['code_to_value'].values()
    y_values = M_c['column_metadata'][Y]['code_to_value'].values()

    # component models of columns X and Y, one per cluster
    cluster_models = [
        su.create_cluster_model_from_X_L(M_c, X_L, view_X, c)
        for c in range(n_clusters)]
    component_models_X = [models[X] for models in cluster_models]
    component_models_Y = [models[Y] for models in cluster_models]

    def conditional_logps(value, component_models):
        # log P(value | c) for every cluster c
        return numpy.array(
            [model.calc_element_predictive_logp(value)
             for model in component_models],
            dtype=float)

    # The per-value terms do not depend on the other column, so they are
    # computed once here instead of once per (x, y) pair.
    x_conditionals = [conditional_logps(x, component_models_X)
                      for x in x_values]
    y_conditionals = [conditional_logps(y, component_models_Y)
                      for y in y_values]

    # log \sum_c P(x|c)P(c) and log \sum_c P(y|c)P(c)
    x_marginals = [logsumexp(logps + cluster_logps)
                   for logps in x_conditionals]
    y_marginals = [logsumexp(logps + cluster_logps)
                   for logps in y_conditionals]

    MI = 0.0
    for x_logps, Px in zip(x_conditionals, x_marginals):
        for y_logps, Py in zip(y_conditionals, y_marginals):
            # log \sum_c P(x|c)P(y|c)P(c), joint distribution
            Pxy = logsumexp(x_logps + y_logps + cluster_logps)
            MI += numpy.exp(Pxy) * (Pxy - (Px + Py))

    # ignore MI < 0
    if MI <= 0.0:
        return 0.0

    return MI
Exemple #2
0
def calculate_MI_bounded_discrete(X, Y, M_c, X_L, X_D):
    """Compute the mutual information of two discrete columns by enumeration.

    Every pair of values listed in the two columns' ``code_to_value``
    metadata is visited and ``P(x,y) * (log P(x,y) - log P(x) - log P(y))``
    is accumulated, where each probability is a mixture over the clusters of
    the view that both columns belong to.

    Args:
        X: index of the first column.
        Y: index of the second column.
        M_c: metadata; ``M_c['column_metadata'][col]['code_to_value']``
            supplies the values that are enumerated for each column.
        X_L: latent state; ``column_partition``/``assignments`` and
            ``view_state`` are read from it.
        X_D: unused; kept so the signature stays compatible with callers.

    Returns:
        ``0.0`` when the columns are assigned to different views (treated as
        independent) or when the accumulated sum is not positive; otherwise
        the accumulated mutual information.
    """
    assignments = X_L['column_partition']['assignments']
    view_X = assignments[X]
    view_Y = assignments[Y]

    # columns in different views are treated as independent
    if view_X != view_Y:
        return 0.0

    # log weight of each cluster in the shared view
    view_state = X_L['view_state'][view_X]
    cluster_logps = numpy.asarray(
        su.determine_cluster_crp_logps(view_state), dtype=float)
    n_clusters = len(cluster_logps)

    x_values = M_c['column_metadata'][X]['code_to_value'].values()
    y_values = M_c['column_metadata'][Y]['code_to_value'].values()

    # component models of columns X and Y, one per cluster
    cluster_models = [
        su.create_cluster_model_from_X_L(M_c, X_L, view_X, c)
        for c in range(n_clusters)]
    component_models_X = [models[X] for models in cluster_models]
    component_models_Y = [models[Y] for models in cluster_models]

    def conditional_logps(value, component_models):
        # log P(value | c) for every cluster c
        return numpy.array(
            [model.calc_element_predictive_logp(value)
             for model in component_models],
            dtype=float)

    # The per-value terms do not depend on the other column, so they are
    # computed once here instead of once per (x, y) pair.
    x_conditionals = [conditional_logps(x, component_models_X)
                      for x in x_values]
    y_conditionals = [conditional_logps(y, component_models_Y)
                      for y in y_values]

    # log \sum_c P(x|c)P(c) and log \sum_c P(y|c)P(c)
    x_marginals = [logsumexp(logps + cluster_logps)
                   for logps in x_conditionals]
    y_marginals = [logsumexp(logps + cluster_logps)
                   for logps in y_conditionals]

    MI = 0.0
    for x_logps, Px in zip(x_conditionals, x_marginals):
        for y_logps, Py in zip(y_conditionals, y_marginals):
            # log \sum_c P(x|c)P(y|c)P(c), joint distribution
            Pxy = logsumexp(x_logps + y_logps + cluster_logps)
            MI += numpy.exp(Pxy) * (Pxy - (Px + Py))

    # ignore MI < 0
    if MI <= 0.0:
        return 0.0

    return MI
def sample_from_view(M_c, X_L, X_D, get_next_seed):
    """Draw one (x, y) pair for columns 0 and 1 from the latent state.

    A cluster is drawn for column 0 from the CRP probabilities of its view.
    Column 1 reuses that cluster when it lives in the same view; otherwise a
    cluster is drawn independently from column 1's own view. Each value is
    then drawn from the chosen cluster's component model.

    Args:
        M_c: metadata, forwarded to ``su.create_cluster_model_from_X_L``.
        X_L: latent state; ``column_partition``/``assignments`` and
            ``view_state`` are read from it.
        X_D: unused; kept so the signature stays compatible with callers.
        get_next_seed: zero-argument callable; called once per drawn value
            and its result is passed to the component model's ``get_draw``.

    Returns:
        Tuple ``(x, y)`` of the draws for column 0 and column 1.
    """
    assignments = X_L['column_partition']['assignments']
    view_col = assignments[0]
    view_col2 = assignments[1]

    view_state = X_L['view_state'][view_col]
    cluster_crps = numpy.exp(su.determine_cluster_crp_logps(view_state))

    # sanity check: the cluster probabilities must be normalised
    assert math.fabs(numpy.sum(cluster_crps) - 1) < .00000001

    # The original allocated an unused ``samples`` array sized by an
    # undefined name ``n``; that statement has been removed.

    cluster_idx1 = numpy.nonzero(
        numpy.random.multinomial(1, cluster_crps))[0][0]
    cluster_model1 = su.create_cluster_model_from_X_L(
        M_c, X_L, view_col, cluster_idx1)

    if view_col2 == view_col:
        # same view: both columns share the drawn cluster
        cluster_model2 = cluster_model1
    else:
        view_state2 = X_L['view_state'][view_col2]
        cluster_crps2 = numpy.exp(su.determine_cluster_crp_logps(view_state2))
        cluster_idx2 = numpy.nonzero(
            numpy.random.multinomial(1, cluster_crps2))[0][0]
        cluster_model2 = su.create_cluster_model_from_X_L(
            M_c, X_L, view_col2, cluster_idx2)

    x = cluster_model1[0].get_draw(get_next_seed())
    y = cluster_model2[1].get_draw(get_next_seed())

    return x, y
def sample_from_view(M_c, X_L, X_D, get_next_seed):
    """Draw one (x, y) pair for columns 0 and 1 from the latent state.

    A cluster is drawn for column 0 from the CRP probabilities of its view.
    Column 1 reuses that cluster when it lives in the same view; otherwise a
    cluster is drawn independently from column 1's own view. Each value is
    then drawn from the chosen cluster's component model.

    Args:
        M_c: metadata, forwarded to ``su.create_cluster_model_from_X_L``.
        X_L: latent state; ``column_partition``/``assignments`` and
            ``view_state`` are read from it.
        X_D: unused; kept so the signature stays compatible with callers.
        get_next_seed: zero-argument callable; called once per drawn value
            and its result is passed to the component model's ``get_draw``.

    Returns:
        Tuple ``(x, y)`` of the draws for column 0 and column 1.
    """
    assignments = X_L['column_partition']['assignments']
    view_col = assignments[0]
    view_col2 = assignments[1]

    view_state = X_L['view_state'][view_col]
    cluster_crps = numpy.exp(su.determine_cluster_crp_logps(view_state))

    # sanity check: the cluster probabilities must be normalised
    assert math.fabs(numpy.sum(cluster_crps) - 1) < .00000001

    # The original allocated an unused ``samples`` array sized by an
    # undefined name ``n``; that statement has been removed.

    cluster_idx1 = numpy.nonzero(
        numpy.random.multinomial(1, cluster_crps))[0][0]
    cluster_model1 = su.create_cluster_model_from_X_L(
        M_c, X_L, view_col, cluster_idx1)

    if view_col2 == view_col:
        # same view: both columns share the drawn cluster
        cluster_model2 = cluster_model1
    else:
        view_state2 = X_L['view_state'][view_col2]
        cluster_crps2 = numpy.exp(su.determine_cluster_crp_logps(view_state2))
        cluster_idx2 = numpy.nonzero(
            numpy.random.multinomial(1, cluster_crps2))[0][0]
        cluster_model2 = su.create_cluster_model_from_X_L(
            M_c, X_L, view_col2, cluster_idx2)

    x = cluster_model1[0].get_draw(get_next_seed())
    y = cluster_model2[1].get_draw(get_next_seed())

    return x, y
def estimiate_MI_sample(X, Y, M_c, X_L, X_D, get_next_seed, n_samples=1000):
    """Estimate the mutual information of two columns by sampling.

    Draws ``n_samples`` (x, y) pairs from the cluster mixture of the shared
    view and averages ``log P(x,y) - log P(x) - log P(y)`` over the draws.

    Args:
        X: index of the first column.
        Y: index of the second column.
        M_c: metadata, forwarded to ``su.create_cluster_model_from_X_L``.
        X_L: latent state; ``column_partition``/``assignments`` and
            ``view_state`` are read from it.
        X_D: unused.
        get_next_seed: zero-argument callable; called once for every drawn
            value and handed to the component model's ``get_draw``.
        n_samples: number of (x, y) pairs to draw.

    Returns:
        ``0.0`` if the columns are in different views or the average is not
        positive; otherwise the averaged estimate.
    """
    column_views = X_L['column_partition']['assignments']
    view_X = column_views[X]
    view_Y = column_views[Y]

    # different views => independent
    if view_X != view_Y:
        return 0.0

    # cluster log weights, and their exp'ed form for the multinomial draw
    cluster_logps = su.determine_cluster_crp_logps(X_L['view_state'][view_X])
    cluster_crps = numpy.exp(cluster_logps)
    n_clusters = len(cluster_crps)
    log_prior = numpy.asarray(cluster_logps, dtype=float)

    # per-cluster component models for the two columns
    component_models_X = []
    component_models_Y = []
    for c in range(n_clusters):
        models_for_cluster = su.create_cluster_model_from_X_L(
            M_c, X_L, view_X, c)
        component_models_X.append(models_for_cluster[X])
        component_models_Y.append(models_for_cluster[Y])

    total = 0.0  # running sum of the per-sample log ratios

    for _ in range(n_samples):
        # pick a cluster, then one value per column from that cluster
        one_hot = numpy.random.multinomial(1, cluster_crps)
        chosen = numpy.nonzero(one_hot)[0][0]
        x = component_models_X[chosen].get_draw(get_next_seed())
        y = component_models_Y[chosen].get_draw(get_next_seed())

        # log P(x|c) and log P(y|c) under every cluster
        per_cluster = [
            (model_x.calc_element_predictive_logp(x),
             model_y.calc_element_predictive_logp(y))
            for model_x, model_y in zip(component_models_X,
                                        component_models_Y)]
        x_given_c = numpy.array([pair[0] for pair in per_cluster],
                                dtype=float)
        y_given_c = numpy.array([pair[1] for pair in per_cluster],
                                dtype=float)

        # mix over clusters: joint, then the two marginals
        log_pxy = logsumexp(x_given_c + y_given_c + log_prior)
        log_px = logsumexp(x_given_c + log_prior)
        log_py = logsumexp(y_given_c + log_prior)

        total += log_pxy - (log_px + log_py)

    # average over the draws
    estimate = total / float(n_samples)

    # ignore MI < 0
    return 0.0 if estimate <= 0.0 else estimate
def estimiate_MI_sample_hist(X,
                             Y,
                             M_c,
                             X_L,
                             X_D,
                             get_next_seed,
                             n_samples=10000):
    """Estimate the mutual information of two columns from a 2-D histogram.

    Draws ``n_samples`` (x, y) pairs from the cluster mixture of the shared
    view, bins them on a grid spanning mean +/- 3 standard deviations in each
    dimension, and sums ``(n_xy / N) * log(n_xy * N / (n_x * n_y))`` over the
    bins whose counts are all non-zero.

    Args:
        X: index of the first column.
        Y: index of the second column.
        M_c: metadata, forwarded to ``su.create_cluster_model_from_X_L``.
        X_L: latent state; ``column_partition``/``assignments`` and
            ``view_state`` are read from it.
        X_D: unused.
        get_next_seed: zero-argument callable; called once for every drawn
            value and handed to the component model's ``get_draw``.
        n_samples: number of (x, y) pairs to draw.

    Returns:
        ``0.0`` if the columns are in different views or the estimate is not
        positive; otherwise the histogram estimate.
    """
    assignments = X_L['column_partition']['assignments']
    view_X = assignments[X]
    view_Y = assignments[Y]

    # columns in different views are treated as independent
    if view_X != view_Y:
        return 0.0

    # cluster probabilities of the shared view
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    cluster_crps = numpy.exp(cluster_logps)
    n_clusters = len(cluster_crps)

    # component models for each cluster for columns X and Y
    component_models_X = []
    component_models_Y = []
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X.append(cluster_models[X])
        component_models_Y.append(cluster_models[Y])

    samples_x = numpy.zeros(n_samples, dtype=float)
    samples_y = numpy.zeros(n_samples, dtype=float)

    # draw the samples: pick a cluster, then one value per column from it
    for i in range(n_samples):
        cluster_idx = numpy.nonzero(numpy.random.multinomial(
            1, cluster_crps))[0][0]
        samples_x[i] = component_models_X[cluster_idx].get_draw(
            get_next_seed())
        samples_y[i] = component_models_Y[cluster_idx].get_draw(
            get_next_seed())

    # number of bin edges, derived from N and the sample correlation r
    N = float(n_samples)
    r, _ = corr(samples_x, samples_y)
    # round() can hand back a float (Python 2, older NumPy scalar types) and
    # numpy.linspace needs an integer count, so cast explicitly.
    # NOTE(review): r == +/-1 or a NaN r (constant samples) makes this
    # expression non-finite -- confirm whether callers can hit that case.
    k = int(round(
        .5 + .5 * (1 + 4 * ((6 * N * r**2.) / (1 - r**2.))**.5)**.5)) + 1

    sigma_x = numpy.std(samples_x)
    mu_x = numpy.mean(samples_x)
    sigma_y = numpy.std(samples_y)
    mu_y = numpy.mean(samples_y)
    range_x = numpy.linspace(mu_x - 3. * sigma_x, mu_x + 3 * sigma_x, k)
    range_y = numpy.linspace(mu_y - 3. * sigma_y, mu_y + 3 * sigma_y, k)

    # joint and marginal bin counts
    PXY, _, _ = numpy.histogram2d(samples_x, samples_y,
                                  bins=[range_x, range_y])
    PX, _ = numpy.histogram(samples_x, bins=range_x)
    PY, _ = numpy.histogram(samples_y, bins=range_y)

    MI = 0

    for i_x in range(PXY.shape[0]):
        for i_y in range(PXY.shape[1]):
            Pxy = PXY[i_x, i_y]
            Px = PX[i_x]
            Py = PY[i_y]

            # empty bins contribute nothing (and would break the log)
            if Pxy > 0.0 and Px > 0.0 and Py > 0.0:
                MI += (Pxy / N) * math.log(Pxy * N / (Px * Py))

    # ignore MI < 0
    if MI <= 0.0:
        MI = 0.0

    return MI
def calculate_MI_bounded_discrete(X, Y, M_c, X_L, _X_D):
    """Compute the mutual information of two discrete columns by enumeration.

    Accumulates ``P(x,y) * (log P(x,y) - log P(x) - log P(y))`` over all
    pairs of values found in the columns' ``code_to_value`` metadata. Every
    probability is a mixture over the clusters of the view the two columns
    share.

    Args:
        X: index of the first column.
        Y: index of the second column.
        M_c: metadata; supplies ``code_to_value`` for each column.
        X_L: latent state; ``column_partition``/``assignments`` and
            ``view_state`` are read from it.
        _X_D: unused.

    Returns:
        ``0.0`` if the columns are in different views or the sum is not
        positive; otherwise the accumulated mutual information.
    """
    column_views = X_L['column_partition']['assignments']
    view_X = column_views[X]
    view_Y = column_views[Y]

    # different views => independent
    if view_X != view_Y:
        return 0.0

    # log weight of every cluster in the shared view
    log_prior = numpy.array(
        su.determine_cluster_crp_logps(X_L['view_state'][view_X]))
    n_clusters = len(log_prior)

    column_metadata = M_c['column_metadata']
    x_values = column_metadata[X]['code_to_value'].values()
    y_values = column_metadata[Y]['code_to_value'].values()

    # per-cluster component models for the two columns
    models_X = []
    models_Y = []
    for c in range(n_clusters):
        models_for_cluster = su.create_cluster_model_from_X_L(
            M_c, X_L, view_X, c)
        models_X.append(models_for_cluster[X])
        models_Y.append(models_for_cluster[Y])

    def weighted_logps(value, models):
        # log P(value|c) + log P(c) for each cluster c
        return numpy.array([
            models[c].calc_element_predictive_logp(value) + log_prior[c]
            for c in range(n_clusters)])

    # per-value, per-cluster terms and their sums over clusters
    x_terms = [weighted_logps(x, models_X) for x in x_values]
    x_totals = [logsumexp(terms) for terms in x_terms]   # \sum_c P(x|c)P(c)
    y_terms = [weighted_logps(y, models_Y) for y in y_values]
    y_totals = [logsumexp(terms) for terms in y_terms]   # \sum_c P(y|c)P(c)

    MI = 0.0
    for x_by_cluster, log_px in zip(x_terms, x_totals):
        for y_by_cluster, log_py in zip(y_terms, y_totals):
            # both terms already contain log P(c); remove one copy
            joint_by_cluster = x_by_cluster + y_by_cluster - log_prior
            # \sum_c P(x|c)P(y|c)P(c)
            log_pxy = logsumexp(joint_by_cluster)
            MI += math.exp(log_pxy) * (log_pxy - (log_px + log_py))

    # ignore MI < 0
    return 0.0 if MI <= 0.0 else MI
def estimate_MI_sample(X, Y, M_c, X_L, _X_D, get_next_seed, n_samples=1000):
    """Estimate the mutual information of two columns by weighted sampling.

    Draws ``n_samples`` (x, y) pairs from the cluster mixture of the shared
    view, computes ``log P(x,y) - log P(x) - log P(y)`` for each, and returns
    their average weighted by the normalised joint probabilities.

    Args:
        X: index of the first column.
        Y: index of the second column.
        M_c: metadata, forwarded to ``su.create_cluster_model_from_X_L``.
        X_L: latent state; ``column_partition``/``assignments`` and
            ``view_state`` are read from it.
        _X_D: unused.
        get_next_seed: zero-argument callable; called once up front to seed
            the cluster sampler (even when the columns turn out to be in
            different views) and once for every drawn value.
        n_samples: number of (x, y) pairs to draw.

    Returns:
        ``0.0`` if the columns are in different views or the weighted average
        is not positive; otherwise the weighted average.
    """
    # seeded generator used only for the cluster draws
    rng = numpy.random.RandomState(get_next_seed())

    column_views = X_L['column_partition']['assignments']
    view_X = column_views[X]
    view_Y = column_views[Y]

    # different views => independent
    if view_X != view_Y:
        return 0.0

    # cluster log weights, and their exp'ed form for the multinomial draw
    cluster_logps = su.determine_cluster_crp_logps(X_L['view_state'][view_X])
    cluster_crps = numpy.exp(cluster_logps)
    n_clusters = len(cluster_crps)
    log_prior = numpy.asarray(cluster_logps, dtype=float)

    # per-cluster component models for the two columns
    component_models_X = []
    component_models_Y = []
    for c in range(n_clusters):
        models_for_cluster = su.create_cluster_model_from_X_L(
            M_c, X_L, view_X, c)
        component_models_X.append(models_for_cluster[X])
        component_models_Y.append(models_for_cluster[Y])

    log_ratios = []   # log P(x,y) - (log P(x) + log P(y)) per sample
    log_joints = []   # log P(x,y) per sample, used as log weights

    for _ in range(n_samples):
        # pick a cluster, then one value per column from that cluster
        chosen = numpy.nonzero(rng.multinomial(1, cluster_crps))[0][0]
        x = component_models_X[chosen].get_draw(get_next_seed())
        y = component_models_Y[chosen].get_draw(get_next_seed())

        # log P(x|c) and log P(y|c) under every cluster
        per_cluster = [
            (model_x.calc_element_predictive_logp(x),
             model_y.calc_element_predictive_logp(y))
            for model_x, model_y in zip(component_models_X,
                                        component_models_Y)]
        x_given_c = numpy.array([pair[0] for pair in per_cluster],
                                dtype=float)
        y_given_c = numpy.array([pair[1] for pair in per_cluster],
                                dtype=float)

        # mix over clusters: joint, then the two marginals
        log_pxy = logsumexp(x_given_c + y_given_c + log_prior)
        log_px = logsumexp(x_given_c + log_prior)
        log_py = logsumexp(y_given_c + log_prior)

        log_ratios.append(log_pxy - (log_px + log_py))
        log_joints.append(log_pxy)

    # Weighted average; the weights are normalised in log space to protect
    # against underflow.
    # NOTE(review): the draws already come from the joint, so weighting them
    # by P(x,y) again is not a plain sample mean -- confirm this is intended.
    log_ratios = numpy.array(log_ratios, dtype=float)
    log_joints = numpy.array(log_joints, dtype=float)
    normalised = numpy.exp(log_joints - logsumexp(log_joints))
    MI_ret = numpy.sum(log_ratios * normalised)

    # ignore MI < 0
    return 0.0 if MI_ret <= 0.0 else MI_ret
Exemple #9
0
def estimiate_MI_sample_hist(X, Y, M_c, X_L, X_D, get_next_seed, n_samples=10000):
    """Estimate the mutual information of two columns from a 2-D histogram.

    Draws ``n_samples`` (x, y) pairs from the cluster mixture of the shared
    view, bins them on a grid spanning mean +/- 3 standard deviations in each
    dimension, and sums ``(n_xy / N) * log(n_xy * N / (n_x * n_y))`` over the
    bins whose counts are all non-zero.

    Args:
        X: index of the first column.
        Y: index of the second column.
        M_c: metadata, forwarded to ``su.create_cluster_model_from_X_L``.
        X_L: latent state; ``column_partition``/``assignments`` and
            ``view_state`` are read from it.
        X_D: unused.
        get_next_seed: zero-argument callable; called once for every drawn
            value and handed to the component model's ``get_draw``.
        n_samples: number of (x, y) pairs to draw.

    Returns:
        ``0.0`` if the columns are in different views or the estimate is not
        positive; otherwise the histogram estimate.
    """
    assignments = X_L['column_partition']['assignments']
    view_X = assignments[X]
    view_Y = assignments[Y]

    # columns in different views are treated as independent
    if view_X != view_Y:
        return 0.0

    # cluster probabilities of the shared view
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    cluster_crps = numpy.exp(cluster_logps)
    n_clusters = len(cluster_crps)

    # component models for each cluster for columns X and Y
    component_models_X = []
    component_models_Y = []
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X.append(cluster_models[X])
        component_models_Y.append(cluster_models[Y])

    samples_x = numpy.zeros(n_samples, dtype=float)
    samples_y = numpy.zeros(n_samples, dtype=float)

    # draw the samples: pick a cluster, then one value per column from it
    for i in range(n_samples):
        cluster_idx = numpy.nonzero(
            numpy.random.multinomial(1, cluster_crps))[0][0]
        samples_x[i] = component_models_X[cluster_idx].get_draw(
            get_next_seed())
        samples_y[i] = component_models_Y[cluster_idx].get_draw(
            get_next_seed())

    # number of bin edges, derived from N and the sample correlation r
    N = float(n_samples)
    r, _ = corr(samples_x, samples_y)
    # round() can hand back a float (Python 2, older NumPy scalar types) and
    # numpy.linspace needs an integer count, so cast explicitly.
    # NOTE(review): r == +/-1 or a NaN r (constant samples) makes this
    # expression non-finite -- confirm whether callers can hit that case.
    k = int(round(
        .5 + .5 * (1 + 4 * ((6 * N * r**2.) / (1 - r**2.))**.5)**.5)) + 1

    sigma_x = numpy.std(samples_x)
    mu_x = numpy.mean(samples_x)
    sigma_y = numpy.std(samples_y)
    mu_y = numpy.mean(samples_y)
    range_x = numpy.linspace(mu_x - 3. * sigma_x, mu_x + 3 * sigma_x, k)
    range_y = numpy.linspace(mu_y - 3. * sigma_y, mu_y + 3 * sigma_y, k)

    # joint and marginal bin counts
    PXY, _, _ = numpy.histogram2d(samples_x, samples_y,
                                  bins=[range_x, range_y])
    PX, _ = numpy.histogram(samples_x, bins=range_x)
    PY, _ = numpy.histogram(samples_y, bins=range_y)

    MI = 0

    for i_x in range(PXY.shape[0]):
        for i_y in range(PXY.shape[1]):
            Pxy = PXY[i_x, i_y]
            Px = PX[i_x]
            Py = PY[i_y]

            # empty bins contribute nothing (and would break the log)
            if Pxy > 0.0 and Px > 0.0 and Py > 0.0:
                MI += (Pxy / N) * math.log(Pxy * N / (Px * Py))

    # ignore MI < 0
    if MI <= 0.0:
        MI = 0.0

    return MI
Exemple #10
0
def calculate_MI_bounded_discrete(X, Y, M_c, X_L, _X_D):
    """Compute the mutual information of two discrete columns by enumeration.

    Accumulates ``P(x,y) * (log P(x,y) - log P(x) - log P(y))`` over all
    pairs of values found in the columns' ``code_to_value`` metadata. Every
    probability is a mixture over the clusters of the view the two columns
    share.

    Args:
        X: index of the first column.
        Y: index of the second column.
        M_c: metadata; supplies ``code_to_value`` for each column.
        X_L: latent state; ``column_partition``/``assignments`` and
            ``view_state`` are read from it.
        _X_D: unused.

    Returns:
        ``0.0`` if the columns are in different views or the sum is not
        positive; otherwise the accumulated mutual information.
    """
    column_views = X_L['column_partition']['assignments']
    view_X = column_views[X]
    view_Y = column_views[Y]

    # different views => independent
    if view_X != view_Y:
        return 0.0

    # log weight of every cluster in the shared view
    log_prior = numpy.array(
        su.determine_cluster_crp_logps(X_L['view_state'][view_X]))
    n_clusters = len(log_prior)

    column_metadata = M_c['column_metadata']
    x_values = column_metadata[X]['code_to_value'].values()
    y_values = column_metadata[Y]['code_to_value'].values()

    # per-cluster component models for the two columns
    models_X = []
    models_Y = []
    for c in range(n_clusters):
        models_for_cluster = su.create_cluster_model_from_X_L(
            M_c, X_L, view_X, c)
        models_X.append(models_for_cluster[X])
        models_Y.append(models_for_cluster[Y])

    def weighted_logps(value, models):
        # log P(value|c) + log P(c) for each cluster c
        return numpy.array([
            models[c].calc_element_predictive_logp(value) + log_prior[c]
            for c in range(n_clusters)])

    # per-value, per-cluster terms and their sums over clusters
    x_terms = [weighted_logps(x, models_X) for x in x_values]
    x_totals = [logsumexp(terms) for terms in x_terms]   # \sum_c P(x|c)P(c)
    y_terms = [weighted_logps(y, models_Y) for y in y_values]
    y_totals = [logsumexp(terms) for terms in y_terms]   # \sum_c P(y|c)P(c)

    MI = 0.0
    for x_by_cluster, log_px in zip(x_terms, x_totals):
        for y_by_cluster, log_py in zip(y_terms, y_totals):
            # both terms already contain log P(c); remove one copy
            joint_by_cluster = x_by_cluster + y_by_cluster - log_prior
            # \sum_c P(x|c)P(y|c)P(c)
            log_pxy = logsumexp(joint_by_cluster)
            MI += math.exp(log_pxy) * (log_pxy - (log_px + log_py))

    # ignore MI < 0
    return 0.0 if MI <= 0.0 else MI