Example #1
def calculate_MI_bounded_discrete(X, Y, M_c, X_L, X_D):
    get_view_index = lambda which_column: X_L['column_partition']['assignments'][which_column]

    view_X = get_view_index(X)
    view_Y = get_view_index(Y)

    # independent
    if view_X != view_Y:
        return 0.0

    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    n_clusters = len(cluster_logps)

    # get X values
    x_values = M_c['column_metadata'][X]['code_to_value'].values()
    # get Y values
    y_values = M_c['column_metadata'][Y]['code_to_value'].values()

    # get component models for each cluster for columns X and Y
    component_models_X = [0]*n_clusters
    component_models_Y = [0]*n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    MI = 0.0

    for x in x_values:
        for y in y_values:
            # calculate marginal logs
            Pxy = numpy.zeros(n_clusters)   # P(x,y), Joint distribution
            Px = numpy.zeros(n_clusters)    # P(x)
            Py = numpy.zeros(n_clusters)    # P(y)

            # get logp of x and y in each cluster. add cluster logp's
            for j in range(n_clusters):

                Px[j] = component_models_X[j].calc_element_predictive_logp(x)
                Py[j] = component_models_Y[j].calc_element_predictive_logp(y)
                Pxy[j] = Px[j] + Py[j] + cluster_logps[j]   # \sum_c P(x|c)P(y|c)P(c), Joint distribution
                Px[j] += cluster_logps[j]                   # \sum_c P(x|c)P(c)
                Py[j] += cluster_logps[j]                   # \sum_c P(y|c)P(c) 

            # sum over clusters
            Px = logsumexp(Px)
            Py = logsumexp(Py)
            Pxy = logsumexp(Pxy)

            MI += numpy.exp(Pxy)*(Pxy - (Px + Py))

    # ignore MI < 0
    if MI <= 0.0:
        MI = 0.0
        
    return MI
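
A minimal self-contained sketch of the same log-space MI sum, using made-up
per-cluster categorical distributions in place of the crosscat component
models (none of the numbers below come from M_c or X_L):

import numpy
from scipy.special import logsumexp

cluster_logps = numpy.log([0.6, 0.4])        # log P(c)
logp_x_given_c = numpy.log([[0.9, 0.1],      # rows: clusters, cols: x values
                            [0.2, 0.8]])
logp_y_given_c = numpy.log([[0.7, 0.3],      # rows: clusters, cols: y values
                            [0.1, 0.9]])

MI = 0.0
for x in range(2):
    for y in range(2):
        # log P(x,y) = logsumexp_c [log P(x|c) + log P(y|c) + log P(c)]
        Pxy = logsumexp(logp_x_given_c[:, x] + logp_y_given_c[:, y]
                        + cluster_logps)
        Px = logsumexp(logp_x_given_c[:, x] + cluster_logps)
        Py = logsumexp(logp_y_given_c[:, y] + cluster_logps)
        MI += numpy.exp(Pxy) * (Pxy - (Px + Py))

print(MI)   # > 0: x and y are dependent through the shared cluster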
Example #2
def simple_predictive_probability_unobserved(M_c, X_L, X_D, Y, query_row,
                                             query_columns, elements):

    n_queries = len(query_columns)

    answer = numpy.zeros(n_queries)

    for n in range(n_queries):
        query_column = query_columns[n]
        x = elements[n]

        # get the view to which this column is assigned
        view_idx = X_L['column_partition']['assignments'][query_column]
        # get the logps for all the clusters (plus a new one) in this view
        cluster_logps = determine_cluster_logps(M_c, X_L, X_D, Y, query_row,
                                                view_idx)

        answers_n = numpy.zeros(len(cluster_logps))

        # cluster_logps should logsumexp to log(1)
        assert (numpy.abs(logsumexp(cluster_logps)) < .0000001)

        # enumerate over the clusters
        for cluster_idx in range(len(cluster_logps)):

            # get the cluster model for this cluster
            cluster_model = create_cluster_model_from_X_L(
                M_c, X_L, view_idx, cluster_idx)
            # get the specific cluster model for this column
            component_model = cluster_model[query_column]
            # construct draw constraints
            draw_constraints = get_draw_constraints(X_L, X_D, Y, query_row,
                                                    query_column)

            # compute the log PDF of x under this component
            p_x = component_model.calc_element_predictive_logp_constrained(
                x, draw_constraints)

            answers_n[cluster_idx] = p_x + cluster_logps[cluster_idx]

        answer[n] = logsumexp(answers_n)

    return answer
Example #3
def simple_predictive_probability_unobserved(
        M_c, X_L, X_D, Y, query_row, query_columns, elements):

    n_queries = len(query_columns)
    answer = numpy.zeros(n_queries)

    for n in range(n_queries):
        query_column = query_columns[n]
        x = elements[n]

        # Get the view to which this column is assigned.
        view_idx = X_L['column_partition']['assignments'][query_column]
        # Get the logps for all the clusters (plus a new one) in this view.
        cluster_logps = determine_cluster_logps(
            M_c, X_L, X_D, Y, query_row, view_idx)

        answers_n = numpy.zeros(len(cluster_logps))

        # `cluster_logps` should logsumexp to log(1).
        assert numpy.abs(logsumexp(cluster_logps)) < .0000001

        # Enumerate over the clusters.
        for cluster_idx in range(len(cluster_logps)):

            # Get the cluster model for this cluster.
            cluster_model = create_cluster_model_from_X_L(
                M_c, X_L, view_idx, cluster_idx)
            # Get the specific cluster model for this column.
            component_model = cluster_model[query_column]
            # Construct draw constraints.
            draw_constraints = get_draw_constraints(
                X_L, X_D, Y, query_row, query_column)

            # Compute the log PDF of x under this component.
            p_x = component_model.calc_element_predictive_logp_constrained(
                x, draw_constraints)

            answers_n[cluster_idx] = p_x + cluster_logps[cluster_idx]

        answer[n] = logsumexp(answers_n)

    return answer
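
The mixture step in Examples #2 and #3 is a logsumexp over clusters of
log P(x | cluster) + log P(cluster | constraints). A small numeric sketch with
hypothetical values (no crosscat state involved):

import numpy
from scipy.special import logsumexp

cluster_logps = numpy.log([0.5, 0.3, 0.2])        # log P(cluster | constraints)
logp_x_by_cluster = numpy.log([0.4, 0.9, 0.05])   # hypothetical log P(x | cluster)

# Same sanity check as in the function: the cluster logps logsumexp to log(1).
assert numpy.abs(logsumexp(cluster_logps)) < .0000001
logp_x = logsumexp(logp_x_by_cluster + cluster_logps)
print(numpy.exp(logp_x))    # 0.48 = 0.5*0.4 + 0.3*0.9 + 0.2*0.05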
Example #4
def determine_cluster_logps(M_c, X_L, X_D, Y, query_row, view_idx):
    view_state_i = X_L['view_state'][view_idx]
    cluster_crp_logps = determine_cluster_crp_logps(view_state_i)
    cluster_crp_logps = numpy.array(cluster_crp_logps)
    cluster_data_logps = determine_cluster_data_logps(M_c, X_L, X_D, Y,
                                                      query_row, view_idx)
    cluster_data_logps = numpy.array(cluster_data_logps)
    # We need to compute the vector of log probabilities log{P(Z=j|Y)} where `Z`
    # is the row cluster, `Y` are the constraints, and `j` iterates from 1 to
    # the number of clusters (plus 1 for a new cluster) in the row partition of
    # `view_idx`. Mathematically:
    # log{P(Z=j|Y)} = log{P(Z=j)P(Y|Z=j) / P(Y) }
    #               = log{P(Z=j)} + log{P(Y|Z=j)} - log{sum_k(P(Z=k)P(Y|Z=k))}
    #               = cluster_crp_logps + cluster_data_logps - BAZ
    # The final term BAZ is computed by:
    # log{sum_k(P(Z=k)P(Y|Z=k))}
    # = log{sum_k(exp(log{P(Z=k)} + log{P(Y|Z=k)}))}
    # = logsumexp(cluster_crp_logps + cluster_data_logps)
    cluster_logps = cluster_crp_logps + cluster_data_logps - \
        logsumexp(cluster_crp_logps + cluster_data_logps)

    return cluster_logps
Example #5
def determine_cluster_logps(M_c, X_L, X_D, Y, query_row, view_idx):
    view_state_i = X_L['view_state'][view_idx]
    cluster_crp_logps = determine_cluster_crp_logps(view_state_i)
    cluster_crp_logps = numpy.array(cluster_crp_logps)
    cluster_data_logps = determine_cluster_data_logps(
        M_c, X_L, X_D, Y, query_row, view_idx)
    cluster_data_logps = numpy.array(cluster_data_logps)
    # We need to compute the vector of log probabilities log{P(Z=j|Y)} where `Z`
    # is the row cluster, `Y` are the constraints, and `j` iterates from 1 to
    # the number of clusters (plus 1 for a new cluster) in the row partition of
    # `view_idx`. Mathematically:
    # log{P(Z=j|Y)} = log{P(Z=j)P(Y|Z=j) / P(Y) }
    #               = log{P(Z=j)} + log{P(Y|Z=j)} - log{sum_k(P(Z=k)P(Y|Z=k))}
    #               = cluster_crp_logps + cluster_data_logps - BAZ
    # The final term BAZ is computed by:
    # log{sum_k(P(Z=k)P(Y|Z=k))}
    # = log{sum_k(exp(log{P(Z=k)} + log{P(Y|Z=k)}))}
    # = logsumexp(cluster_crp_logps + cluster_data_logps)
    cluster_logps = cluster_crp_logps + cluster_data_logps - \
        logsumexp(cluster_crp_logps + cluster_data_logps)

    return cluster_logps
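
Because the derivation above subtracts logsumexp(cluster_crp_logps +
cluster_data_logps), the returned cluster_logps form a proper log posterior.
A quick check with hypothetical CRP and data scores:

import numpy
from scipy.special import logsumexp

cluster_crp_logps = numpy.log([0.5, 0.4, 0.1])        # log P(Z=k)
cluster_data_logps = numpy.array([-2.3, -0.7, -4.1])  # hypothetical log P(Y|Z=k)

cluster_logps = cluster_crp_logps + cluster_data_logps - \
    logsumexp(cluster_crp_logps + cluster_data_logps)
# The posterior normalizes: logsumexp(cluster_logps) == log(1) == 0.
assert numpy.abs(logsumexp(cluster_logps)) < 1e-12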
Example #8
def test_logsumexp():
    inf = float('inf')
    nan = float('nan')
    with pytest.raises(OverflowError):
        math.log(sum(map(math.exp, range(1000))))
    assert relerr(999.4586751453871, gu.logsumexp(range(1000))) < 1e-15
    assert gu.logsumexp([]) == -inf
    assert gu.logsumexp([-1000.]) == -1000.
    assert gu.logsumexp([-1000., -1000.]) == -1000. + math.log(2.)
    assert relerr(math.log(2.), gu.logsumexp([0., 0.])) < 1e-15
    assert gu.logsumexp([-inf, 1]) == 1
    assert gu.logsumexp([-inf, -inf]) == -inf
    assert gu.logsumexp([+inf, +inf]) == +inf
    assert math.isnan(gu.logsumexp([-inf, +inf]))
    assert math.isnan(gu.logsumexp([nan, inf]))
    assert math.isnan(gu.logsumexp([nan, -3]))
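
The assertions above pin down logsumexp's edge cases. A minimal pure-Python
sketch consistent with them (a sketch only, not the actual gu.logsumexp):

import math

def logsumexp_sketch(arr):
    arr = list(arr)
    # The empty sum is 0, whose log is -inf.
    if not arr:
        return -float('inf')
    # Any nan poisons the result; max() is order-dependent with nan, so
    # check explicitly before taking the max.
    if any(math.isnan(a) for a in arr):
        return float('nan')
    m = max(arr)
    if math.isinf(m):
        # All +inf or all -inf: the answer is m itself. Mixed infinities
        # would shift to inf - inf, so return nan, matching the tests.
        return m if m == min(arr) else float('nan')
    # Shift by the max so the largest term is exp(0) = 1: no overflow,
    # only harmless underflow in the smallest terms.
    return m + math.log(sum(math.exp(a - m) for a in arr))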
Example #10
def calculate_MI_bounded_discrete(X, Y, M_c, X_L, _X_D):
    get_view_index = lambda which_column: X_L['column_partition']['assignments'][which_column]

    view_X = get_view_index(X)
    view_Y = get_view_index(Y)

    # independent
    if view_X != view_Y:
        return 0.0

    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = numpy.array(su.determine_cluster_crp_logps(view_state))
    n_clusters = len(cluster_logps)

    # get X values
    x_values = M_c['column_metadata'][X]['code_to_value'].values()
    # get Y values
    y_values = M_c['column_metadata'][Y]['code_to_value'].values()

    # get component models for each cluster for columns X and Y
    component_models_X = [0]*n_clusters
    component_models_Y = [0]*n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    def marginal_predictive_logps_by_cluster(value, component_models):
        return numpy.array([
            component_models[j].calc_element_predictive_logp(value)
            + cluster_logps[j]
            for j in range(n_clusters)])

    x_marginal_predictive_logps_by_cluster = \
        [marginal_predictive_logps_by_cluster(x, component_models_X)
         for x in x_values]

    # \sum_c P(x|c)P(c)
    x_net_marginal_predictive_logps = \
        [logsumexp(ps) for ps in x_marginal_predictive_logps_by_cluster]

    y_marginal_predictive_logps_by_cluster = \
        [marginal_predictive_logps_by_cluster(y, component_models_Y)
         for y in y_values]

    # \sum_c P(y|c)P(c)
    y_net_marginal_predictive_logps = \
        [logsumexp(ps) for ps in y_marginal_predictive_logps_by_cluster]

    MI = 0.0

    for i, x in enumerate(x_values):
        x_marginals = x_marginal_predictive_logps_by_cluster[i]
        for j, y in enumerate(y_values):
            y_marginals = y_marginal_predictive_logps_by_cluster[j]
            # cluster prob is double-counted in sum of marginals
            joint_predictive_logp_by_cluster = \
                x_marginals + y_marginals - cluster_logps

            # \sum_c P(x|c)P(y|c)P(c), Joint distribution
            joint_predictive_logp = logsumexp(joint_predictive_logp_by_cluster)

            MI += math.exp(joint_predictive_logp) * \
                  (joint_predictive_logp - \
                   (x_net_marginal_predictive_logps[i] + \
                    y_net_marginal_predictive_logps[j]))

    # ignore MI < 0
    if MI <= 0.0:
        MI = 0.0

    return MI
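
The "- cluster_logps" correction in the joint term works because each
marginal already carries one copy of log P(c), so their sum double-counts it.
A short check of that identity on made-up numbers:

import math

logp_c, logp_x_c, logp_y_c = math.log(0.3), math.log(0.5), math.log(0.2)
lhs = (logp_x_c + logp_c) + (logp_y_c + logp_c) - logp_c
rhs = logp_x_c + logp_y_c + logp_c   # log P(x|c)P(y|c)P(c)
assert abs(lhs - rhs) < 1e-12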
Example #11
def estimate_MI_sample(X, Y, M_c, X_L, _X_D, get_next_seed, n_samples=1000):
    random_state = numpy.random.RandomState(get_next_seed())

    get_view_index = lambda which_column: X_L['column_partition']['assignments'][which_column]

    view_X = get_view_index(X)
    view_Y = get_view_index(Y)

    # independent
    if view_X != view_Y:
        return 0.0

    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    cluster_crps = numpy.exp(cluster_logps) # get exp'ed values for multinomial
    n_clusters = len(cluster_crps)

    # get component models for each cluster for columns X and Y
    component_models_X = [0]*n_clusters
    component_models_Y = [0]*n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    MI = numpy.zeros(n_samples)
    weights = numpy.zeros(n_samples)

    for i in range(n_samples):
        # draw a cluster
        cluster_idx = numpy.nonzero(random_state.multinomial(1, cluster_crps))[0][0]

        # get a sample from each cluster
        x = component_models_X[cluster_idx].get_draw(get_next_seed())
        y = component_models_Y[cluster_idx].get_draw(get_next_seed())

        # calculate marginal logs
        Pxy = numpy.zeros(n_clusters)   # P(x,y), Joint distribution
        Px = numpy.zeros(n_clusters)    # P(x)
        Py = numpy.zeros(n_clusters)    # P(y)

        # get logp of x and y in each cluster. add cluster logp's
        for j in range(n_clusters):

            Px[j] = component_models_X[j].calc_element_predictive_logp(x)
            Py[j] = component_models_Y[j].calc_element_predictive_logp(y)
            Pxy[j] = Px[j] + Py[j] + cluster_logps[j]   # \sum_c P(x|c)P(y|c)P(c), Joint distribution
            Px[j] += cluster_logps[j]                   # \sum_c P(x|c)P(c)
            Py[j] += cluster_logps[j]                   # \sum_c P(y|c)P(c)

        # sum over clusters
        Px = logsumexp(Px)
        Py = logsumexp(Py)
        Pxy = logsumexp(Pxy)

        # store the per-sample log ratio and its log weight
        MI[i] = Pxy - (Px + Py)
        weights[i] = Pxy

    # do weighted average with underflow protection
    Z = logsumexp(weights)
    weights = numpy.exp(weights-Z)
    MI_ret = numpy.sum(MI*weights)

    # ignore MI < 0
    if MI_ret <= 0.0:
        MI_ret = 0.0

    return MI_ret
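
The final averaging step exponentiates the weights only after subtracting
their logsumexp, so very negative joint log probabilities still yield usable
normalized weights. The same pattern in isolation, on hypothetical values:

import numpy
from scipy.special import logsumexp

MI = numpy.array([0.8, 1.2, 0.1])             # per-sample log ratios
weights = numpy.array([-900., -901., -905.])  # hypothetical log weights
# numpy.exp(weights) alone would underflow to all zeros.
w = numpy.exp(weights - logsumexp(weights))
assert numpy.abs(w.sum() - 1.0) < 1e-12
print(numpy.sum(MI * w))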
Example #12
def calculate_MI_bounded_discrete(X, Y, M_c, X_L, X_D):
    get_view_index = lambda which_column: X_L['column_partition'][
        'assignments'][which_column]

    view_X = get_view_index(X)
    view_Y = get_view_index(Y)

    # independent
    if view_X != view_Y:
        return 0.0

    # get cluster logps
    view_state = X_L['view_state'][view_X]
    cluster_logps = su.determine_cluster_crp_logps(view_state)
    n_clusters = len(cluster_logps)

    # get X values
    x_values = M_c['column_metadata'][X]['code_to_value'].values()
    # get Y values
    y_values = M_c['column_metadata'][Y]['code_to_value'].values()

    # get component models for each cluster for columns X and Y
    component_models_X = [0] * n_clusters
    component_models_Y = [0] * n_clusters
    for i in range(n_clusters):
        cluster_models = su.create_cluster_model_from_X_L(M_c, X_L, view_X, i)
        component_models_X[i] = cluster_models[X]
        component_models_Y[i] = cluster_models[Y]

    MI = 0.0

    for x in x_values:
        for y in y_values:
            # calculate marginal logs
            Pxy = numpy.zeros(n_clusters)  # P(x,y), Joint distribution
            Px = numpy.zeros(n_clusters)  # P(x)
            Py = numpy.zeros(n_clusters)  # P(y)

            # get logp of x and y in each cluster. add cluster logp's
            for j in range(n_clusters):

                Px[j] = component_models_X[j].calc_element_predictive_logp(x)
                Py[j] = component_models_Y[j].calc_element_predictive_logp(y)
                # \sum_c P(x|c)P(y|c)P(c), Joint distribution
                Pxy[j] = Px[j] + Py[j] + cluster_logps[j]
                Px[j] += cluster_logps[j]  # \sum_c P(x|c)P(c)
                Py[j] += cluster_logps[j]  # \sum_c P(y|c)P(c)

            # sum over clusters
            Px = logsumexp(Px)
            Py = logsumexp(Py)
            Pxy = logsumexp(Pxy)

            MI += numpy.exp(Pxy) * (Pxy - (Px + Py))

    # ignore MI < 0
    if MI <= 0.0:
        MI = 0.0

    return MI