Example 1
def joint_entropy_score_estimate(subset, X):
    features = list()
    for i in subset:
        features.append(map_continuous_names(np.squeeze(X[:, i])))
    X_categorical = np.vstack(features)

    return entropy_joint(X_categorical)
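
A minimal self-contained sketch of the same idea, assuming the selected columns are already discretized to integer codes (map_continuous_names is project-specific and not shown):

import numpy as np
from pyitlib import discrete_random_variable as drv

# five already-discretized feature columns, 1000 samples each
X = np.random.randint(0, 4, size=(1000, 5))
subset = [0, 2, 3]

# pyitlib expects variables on axis 0 and realisations on the last axis
X_categorical = np.vstack([X[:, i] for i in subset])
print(drv.entropy_joint(X_categorical))  # H(X0, X2, X3) in bits (default base 2)
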
Example 2
def compute_unconditional_entropy(
    windows: np.ndarray,
    col_id: int,
) -> float:
    """
    compute entropy of all words in a part
    """
    y = windows[:, col_id]
    uei = drv.entropy_joint(y).item()

    return uei
Example 3
def compute_y_entropy(
    target_windows: np.ndarray,
    col_id: int,
) -> float:
    """
    compute entropy of next-words
    """

    y = target_windows[:, col_id]  # neighbor
    yei = drv.entropy_joint(y).item()

    return yei
Example 4
def _jmi(feature_set, labels, score_list):
    ###
    # I(x,y;c) = H(x,c) - H(c) - [ H(x,c,y) - H(c,y) ] + I(y;c) #
    ####
    results = []
    col = list(feature_set)
    labels = np.reshape(labels, (1, -1))
    for i in col:
        candidate_f = feature_set[i]
        candidate_f = np.reshape(candidate_f.values, (1, -1))
        print(i + ' start at ' + str(datetime.datetime.now()))
        for j in range(int(i)+1, 400):
            sf = feature_set[str(j)]
            sf = np.reshape(sf.values, (1,-1))

            I_yc = score_list.iloc[j, 0]
            H_x_c = drv.entropy_conditional(candidate_f, labels)

            xcy = np.append([candidate_f, labels], [sf], axis=0)
            H_xcy = drv.entropy_joint(xcy)

            # stack labels and the selected feature as two variables (axis 0)
            cy = np.vstack((labels, sf))
            H_cy = drv.entropy_joint(cy)

            I_xy_c = float(H_x_c - (H_xcy - H_cy) + I_yc)
            results.append([[i, j], I_xy_c])

            if I_xy_c < 0:
                print(f'warning: negative estimate for pair ({i}, {j}): {I_xy_c}')

        # checkpoint intermediate results once per candidate feature
        pd.DataFrame(results).to_csv('jmi_2packets.csv')
        print('end at ' + str(datetime.datetime.now()))
    df = pd.DataFrame(results)
    df.to_csv('jmi_2packets.csv')
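
The inner loop combines entropy_conditional and entropy_joint through the chain rule: H(x|c) - [H(x,c,y) - H(c,y)] reduces to H(x|c) - H(x|c,y), the conditional mutual information I(x;y|c). A hedged, self-contained sketch of that building block on synthetic data (all names here are illustrative):

import numpy as np
from pyitlib import discrete_random_variable as drv

rng = np.random.default_rng(0)
x = rng.integers(0, 3, 5000)
y = rng.integers(0, 3, 5000)
c = (x + y) % 3  # class depends on both features

H_x_c = drv.entropy_conditional(x, c)            # H(x|c)
H_xcy = drv.entropy_joint(np.vstack((x, c, y)))  # H(x,c,y)
H_cy = drv.entropy_joint(np.vstack((c, y)))      # H(c,y)

# H(x|c) - [H(x,c,y) - H(c,y)] = H(x|c) - H(x|c,y) = I(x;y|c)
print(H_x_c - (H_xcy - H_cy))
print(drv.information_mutual_conditional(x, y, c))  # should agree
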
Example 5
def measure_vars1(mat: np.ndarray) -> Tuple[List[float], List[str]]:
    """measure info theory variables"""

    xis, yis = to_x_y(mat)

    mi = drv.information_mutual_normalised(xis, yis, norm_factor='XY')
    je = drv.entropy_joint(np.vstack((xis, yis)))
    xy = drv.entropy_conditional(xis, yis) / je  # H(X|Y) normalized by H(X,Y)
    yx = drv.entropy_conditional(yis, xis) / je  # H(Y|X) normalized by H(X,Y)

    props = [mi, xy, yx]
    names = ['I(X;Y)', 'H(X|Y)', 'H(Y|X)']

    return props, names
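
to_x_y is project-specific and not shown; a hedged sketch of the same three quantities on synthetic integer data:

import numpy as np
from pyitlib import discrete_random_variable as drv

rng = np.random.default_rng(0)
xis = rng.integers(0, 10, 2000)
yis = (xis + rng.integers(0, 2, 2000)) % 10  # yis depends strongly on xis

je = drv.entropy_joint(np.vstack((xis, yis)))
print(drv.information_mutual_normalised(xis, yis, norm_factor='XY'))  # normalized MI
print(drv.entropy_conditional(xis, yis) / je)                         # H(X|Y) / H(X,Y)
print(drv.entropy_conditional(yis, xis) / je)                         # H(Y|X) / H(X,Y)
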
Example 6
def compute_joint_entropy(
    target_windows: np.ndarray,
    col_id: int,
) -> float:
    """
    compute conditional entropy of target words, given distribution of words that follow them.
    
    """

    # joint entropy
    x = target_windows[:, -2]  # target word
    y = target_windows[:, col_id]  # neighbor
    x_y = np.vstack((x, y))
    jei = drv.entropy_joint(x_y).item()

    return jei
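
Examples 2, 3 and 6 come from the same codebase and differ only in which columns they feed to entropy_joint. A small sanity-check sketch of the chain rule these quantities obey, on synthetic data:

import numpy as np
from pyitlib import discrete_random_variable as drv

rng = np.random.default_rng(0)
x = rng.integers(0, 5, 3000)
y = rng.integers(0, 7, 3000)

h_xy = drv.entropy_joint(np.vstack((x, y))).item()  # H(x,y)
h_x = drv.entropy_joint(x).item()                   # single variable: plain H(x)
h_y_given_x = drv.entropy_conditional(y, x).item()  # H(y|x)
print(h_xy, h_x + h_y_given_x)  # chain rule: the two values should match
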
Example 7
    def mi_ensemble_bound(self, individual_predictions, this_y=None, ensemble_predictions=None):
        """
        Estimate the BER using the Mutual Information-Based Correlation in
        Tumer and Ghosh (2003).

        Parameters
        ----------
            individual_predictions: numpy array
                The dimensions of this array should be |M| by |E|, where
                |M| is the number of labeled data points and |E| is the number
                of individual classifiers. Each element should be a probability
                (not a 0/1 prediction).
        """
        if this_y is None:
            this_y = self.y
        if ensemble_predictions is None:
            avg_predictor = individual_predictions.mean(axis=1).round()
        else:
            avg_predictor = ensemble_predictions.round()
        individual_predictions = individual_predictions.round()  # convert probabilities to 0/1 predictions
        N = individual_predictions.shape[1]  # number of classifiers in ensemble
        labels = np.repeat(this_y.reshape(-1, 1), N, axis=1)
        accs = np.equal(individual_predictions, labels).mean(axis=0) # mean accuracy for each classifier
        mean_err = 1 - accs.mean() # mean err for all classifiers
        ensemble_err = 1 - (this_y == avg_predictor).mean() # mean err for ensemble classifier

        # calculate average mutual information between each individual classifier's
        # predictions and the ensemble predictor
        ami = drv.information_mutual(
            individual_predictions.T,
            avg_predictor.reshape(1, -1),
            base=np.e,
            cartesian_product=True
        ).mean()
        # TODO: should we measure total entropy by discretizing the classification
        # probabilities into more granular bins? Currently we just use the
        # 0 / 1 matrix
        # total entropy in the individual classifiers
        total_entropy = drv.entropy_joint(individual_predictions.T, base=np.e)
        # delta is the normalized ami
        delta = ami / total_entropy
        assert delta >= 0
        assert delta <= 1
        # formula from Tumer and Ghosh
        be = (N * ensemble_err - ((N - 1) * delta + 1) * mean_err ) / ((N - 1) * (1 - delta))
        return be
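
The key quantity above is delta, the average mutual information between each classifier and the ensemble vote, normalized by the classifiers' joint entropy. A hedged, standalone sketch of just that step on synthetic 0/1 predictions (data and names are illustrative):

import numpy as np
from pyitlib import discrete_random_variable as drv

rng = np.random.default_rng(0)
y = rng.integers(0, 2, 500)
# |M| x |E| matrix of 0/1 predictions from three noisy classifiers
individual_predictions = np.stack(
    [np.where(rng.random(500) < 0.8, y, 1 - y) for _ in range(3)], axis=1
)
avg_predictor = individual_predictions.mean(axis=1).round()

ami = drv.information_mutual(
    individual_predictions.T,
    avg_predictor.reshape(1, -1),
    base=np.e,
    cartesian_product=True,
).mean()
total_entropy = drv.entropy_joint(individual_predictions.T, base=np.e)
delta = ami / total_entropy  # normalized average MI, asserted above to lie in [0, 1]
print(delta)
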
Example 8
def calc_nmi_score(labels_true, labels_pred):
    """calculate normalized mutual information score
    
    Parameters
    ----------
    labels_true: labels from ground truth
    labels_pred: labels from clustering

    Return
    -------
    nmi: normalized mutual information score
    """
    H_true = drv.entropy(labels_true, base=2)
    H_pred = drv.entropy(labels_pred, base=2)
    H_joint = drv.entropy_joint([labels_true, labels_pred], base=2)
    mi = H_true + H_pred - H_joint
    nmi = mi / max(H_true, H_pred)
    return nmi
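
A quick usage sketch; scikit-learn's max-normalized NMI uses the same definition, so the two values should agree up to floating-point error:

import numpy as np
from sklearn.metrics import normalized_mutual_info_score

labels_true = np.array([0, 0, 1, 1, 2, 2, 2, 0])
labels_pred = np.array([1, 1, 0, 0, 2, 2, 0, 1])

print(calc_nmi_score(labels_true, labels_pred))
print(normalized_mutual_info_score(labels_true, labels_pred, average_method='max'))
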
Example 9
def main():
    """Run script"""
    parser = build_parser()
    args = parser.parse_args()

    adata = ad.read_h5ad(args.h5ad)
    logging.info(f"Read {args.h5ad} for adata of {adata.shape}")

    if args.discrete:
        # Use the discrete algorithm from pyitlib
        # https://pafoster.github.io/pyitlib/#discrete_random_variable.entropy_joint
        # https://github.com/pafoster/pyitlib/blob/master/pyitlib/discrete_random_variable.py#L3535
        # Successive realisations of a random variable are indexed by the last axis in the array; multiple random variables may be specified using preceding axes.
        # In other words, different variables are axis 0, samples are axis 1
        # This is contrary to the default ML format, which puts samples on axis 0 and variables on axis 1
        # Therefore we must transpose
        input_arr = utils.ensure_arr(adata.X).T
        h = drv.entropy_joint(input_arr, base=np.e)
        logging.info(f"Found discrete joint entropy of {h:.6f}")
    else:
        raise NotImplementedError
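
The axis convention spelled out in those comments is the usual stumbling block with pyitlib; a minimal sketch of the required transpose on synthetic data:

import numpy as np
from pyitlib import discrete_random_variable as drv

rng = np.random.default_rng(0)
data_ml = rng.integers(0, 3, size=(1000, 4))  # samples on axis 0, variables on axis 1

# pyitlib indexes realisations by the last axis, so transpose before calling
h = drv.entropy_joint(data_ml.T, base=np.e)
print(f"joint entropy of 4 variables: {h:.6f} nats")
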
Example 10
    # get outcomes - the words that occur in the last 2 slots of probe windows
    x_windows = get_windows(prep, corpus.x, col_id=-3)
    cx, ry, cx_ry = get_outcomes(prep, x_windows)

    # make co-occurrence matrix
    cf_mat = np.ones((corpus.num_y, corpus.num_x))
    for cxi, ryi in zip(cx, ry):
        cf_mat[corpus.y.index(ryi), corpus.x.index(cxi)] += 1

    # make co-occurrence plot
    if SHOW_HEATMAP:
        print(np.max(cf_mat))
        print(np.min(cf_mat))
        fig, ax = make_heatmap_fig(cf_mat)
        ce = drv.entropy_conditional(cx, ry).item()
        je = drv.entropy_joint(cx_ry).item()
        ye = drv.entropy_joint(ry).item()
        plt.title(
            f'Toy Corpus\nH(x-word|y-word)={ce:.4f}\nH(x-word,y-word)={je:.4f}\nH(y-word)={ye:.4f}'
        )
        plt.show()

    # collect singular values for plotting
    cf_mat_intact = scale(cf_mat, axis=1, with_std=False, with_mean=False)
    cf_mat_scaled = scale(cf_mat, axis=1, with_std=False,
                          with_mean=True)  # subtracting mean from rows
    s_intact = np.linalg.svd(cf_mat_intact, compute_uv=False)
    s_scaled = np.linalg.svd(cf_mat_scaled, compute_uv=False)
    s_list_intact.append(np.asarray(s_intact[:NUM_S_DIMS]))
    s_list_scaled.append(np.asarray(s_scaled[:NUM_S_DIMS]))
Example 11
def jemmig(factors, codes, continuous_factors=True, nb_bins=10):
    ''' JEMMIG metric from K. Do and T. Tran,
        “Theory and evaluation metrics for learning disentangled representations,”
        in ICLR, 2020.
    
    :param factors:                         dataset of factors
                                            each column is a factor and each line is a data point
    :param codes:                           latent codes associated to the dataset of factors
                                            each column is a latent code and each line is a data point
    :param continuous_factors:              True:   factors are described as continuous variables
                                            False:  factors are described as discrete variables
    :param nb_bins:                         number of bins to use for discretization
    '''
    # count the number of factors and latent codes
    nb_factors = factors.shape[1]
    nb_codes = codes.shape[1]

    # quantize factors if they are continuous
    if continuous_factors:
        factors = minmax_scale(factors)  # normalize in [0, 1] all columns
        factors = get_bin_index(factors,
                                nb_bins)  # quantize values and get indexes

    # quantize latent codes
    codes = minmax_scale(codes)  # normalize in [0, 1] all columns
    codes = get_bin_index(codes, nb_bins)  # quantize values and get indexes

    # compute mutual information matrix
    mi_matrix = np.zeros((nb_factors, nb_codes))
    for f in range(nb_factors):
        for c in range(nb_codes):
            mi_matrix[f, c] = get_mutual_information(factors[:, f],
                                                     codes[:, c],
                                                     normalize=False)

    # compute joint entropy matrix
    je_matrix = np.zeros((nb_factors, nb_codes))
    for f in range(nb_factors):
        for c in range(nb_codes):
            X = np.stack((factors[:, f], codes[:, c]), 0)
            je_matrix[f, c] = drv.entropy_joint(X)

    # compute the mean gap for all factors
    sum_gap = 0
    for f in range(nb_factors):
        mi_f = np.sort(mi_matrix[f, :])
        je_idx = np.argsort(mi_matrix[f, :])[-1]

        # Compute unnormalized JEMMIG
        jemmig_not_normalized = je_matrix[f, je_idx] - mi_f[-1] + mi_f[-2]

        # normalize by H(f) + log(#bins)
        jemmig_f = jemmig_not_normalized / (drv.entropy_joint(factors[:, f]) +
                                            np.log2(nb_bins))
        jemmig_f = 1 - jemmig_f
        sum_gap += jemmig_f

    # compute the mean gap
    jemmig_score = sum_gap / nb_factors

    return jemmig_score
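
get_bin_index and get_mutual_information are project helpers that are not part of this snippet. A hedged sketch of how the quantization and the joint-entropy matrix could be reproduced; get_bin_index_sketch is a hypothetical stand-in, not the authors' implementation:

import numpy as np
from sklearn.preprocessing import minmax_scale
from pyitlib import discrete_random_variable as drv

def get_bin_index_sketch(values, nb_bins):
    """Map columns already scaled to [0, 1] onto bin indexes 0..nb_bins-1."""
    edges = np.linspace(0.0, 1.0, nb_bins + 1)[1:-1]
    return np.digitize(values, edges)

rng = np.random.default_rng(0)
factors = minmax_scale(rng.random((500, 3)))
codes = minmax_scale(rng.random((500, 4)))
factors_d = get_bin_index_sketch(factors, 10)
codes_d = get_bin_index_sketch(codes, 10)

# pairwise joint-entropy matrix, as in the loop above
je_matrix = np.zeros((factors_d.shape[1], codes_d.shape[1]))
for f in range(factors_d.shape[1]):
    for c in range(codes_d.shape[1]):
        je_matrix[f, c] = drv.entropy_joint(np.stack((factors_d[:, f], codes_d[:, c]), 0))
print(je_matrix)
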
Example 12
from pyitlib import discrete_random_variable as drv
import numpy as np

N = 100_000
SHAPES = [(650, 2000), (700, 2000)]
SHAPES = [(650, 2000), (650, 2500)]

print('joint entropy')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        xy = np.vstack((xs, ys))
        res_i = drv.entropy_joint(xy)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')

print('conditional entropy x|y')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        res_i = drv.entropy_conditional(xs, ys)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')

print('conditional entropy y|x')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        res_i = drv.entropy_conditional(ys, xs)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')
Example 13
def D(data, col1, col2, **kwargs):
    """Dependence distance for two dataframe columns."""
    var_info = drv.information_variation(data[[col1, col2]].T, **kwargs)
    joint_ent = drv.entropy_joint(data[[col1, col2]].T, **kwargs)
    D = (var_info/joint_ent)[0, 1]
    return D
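
A small usage sketch with a synthetic DataFrame; the ratio is the variation of information normalized by the joint entropy, so it runs from 0 (columns determine each other) to 1 (independent columns):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
a = rng.integers(0, 4, 1000)
data = pd.DataFrame({'a': a, 'b': (a + rng.integers(0, 2, 1000)) % 4})

print(D(data, 'a', 'b'))
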
Example 14
def measure_dvs(
    params: Params,
    co_data: CoData,
) -> Dict[str, float]:
    """
    collect all DVs in a single condition.

    a condition is a specific configuration of IV realizations
    """

    res = {}

    co_mat_coo: sparse.coo_matrix = co_data.as_matrix(params.direction)
    co_mat_csr: sparse.csr_matrix = co_mat_coo.tocsr()

    # save for offline analysis
    path_to_pkl = configs.Dirs.co_data / f'co_data_age={params.age}' \
                                         f'_punct={params.punctuation}' \
                                         f'_contr={params.targets_control}' \
                                         f'_lemma={params.lemmas}.pkl'
    with path_to_pkl.open('wb') as f:
        pickle.dump(co_data, f)

    # type and token frequency
    res['x-tokens'] = co_mat_coo.sum().item() // 2 if params.direction == 'b' else co_mat_coo.sum().item()
    res['x-types'] = co_mat_coo.shape[0]
    res['y-types'] = co_mat_coo.shape[1]

    # normalize columns
    if params.normalize_cols:
        co_mat_csr = normalize(co_mat_csr, axis=1, copy=False)
        print(co_mat_csr.sum())

    # svd
    # don't use sparse svd: doesn't result in accurate reconstruction.
    # don't normalize before svd: otherwise relative differences between rows and columns are lost
    u, s, vt = np.linalg.svd(co_mat_csr.toarray(), compute_uv=True)
    assert np.max(s) == s[0]
    res['s1/sum(s)'] = s[0] / np.sum(s)
    res['frag'] = 1 - (s[0] / np.sum(s))

    # info theory analysis
    if params.direction == 'b':
        xs, ys, zs = co_data.get_x_y_z()
        xyz = np.vstack((xs, ys, zs))
        xyz_je = drv.entropy_joint(xyz)
        nii = drv.information_interaction(xyz).item() / xyz_je
    else:
        nii = np.nan  # need 3 rvs to compute interaction information
    xs, ys = co_data.get_x_y(params.direction)
    xy = np.vstack((xs, ys))
    xy_je = drv.entropy_joint(xy)

    # compute entropy on permuted data for de-biasing estimates
    bias_xy = np.mean([
        drv.entropy_conditional(np.random.permutation(xs),
                                np.random.permutation(ys),
                                base=2).item() for _ in range(10)
    ])
    bias_yx = np.mean([
        drv.entropy_conditional(np.random.permutation(ys),
                                np.random.permutation(xs),
                                base=2).item() for _ in range(10)
    ])
    print(f'bias_xy={bias_xy:.4f}')
    print(f'bias_yx={bias_yx:.4f}')

    xy_ce = drv.entropy_conditional(xs, ys).item()
    yx_ce = drv.entropy_conditional(ys, xs).item()
    res[' xy'] = xy_ce  # biased
    res[' yx'] = yx_ce
    res['dxy'] = bias_xy - xy_ce  # de-biased
    res['dyx'] = bias_yx - yx_ce
    res['nxy'] = xy_ce / xy_je  # biased + normalized
    res['nyx'] = yx_ce / xy_je
    # res['nii'] = nii
    # res['nmi'] = drv.information_mutual_normalised(xs, ys, norm_factor='XY').item()
    # res['ami'] = adjusted_mutual_info_score(xs, ys, average_method="arithmetic")
    # res[' je'] = xy_je

    # round
    for k, v in res.items():
        if isinstance(v, float):
            res[k] = round(v, 3)

    if configs.Fig.max_projection > 0:
        plot_reconstructions(co_mat_coo,
                             params,
                             max_dim=configs.Fig.max_projection)

    # which row or column is most active in projection on first singular dim?
    # note: if lemmas=True, row words may include experimental targets
    # because lemmas of control target plural nouns are singular nouns
    row_words, col_words = co_data.get_words_ordered_by_id(params.direction)
    if len(row_words) != co_mat_csr.shape[0]:
        raise RuntimeError(
            f'Number of row words ({len(row_words)}) != Number of rows ({co_mat_csr.shape[0]})'
        )
    if len(col_words) != co_mat_csr.shape[1]:
        raise RuntimeError(
            f'Number of column words ({len(col_words)}) != Number of columns ({co_mat_csr.shape[1]})'
        )
    projection1 = calc_projection(u, s, vt, 0)
    max_row_id = np.argmax(projection1.sum(axis=1))
    max_col_id = np.argmax(projection1.sum(axis=0))
    print(
        f'Word with largest sum={np.max(projection1.sum(axis=1))} in first projection row="{row_words[max_row_id]}"'
    )
    print(
        f'Word with largest sum={np.max(projection1.sum(axis=0))} in first projection col="{col_words[max_col_id]}"'
    )

    return res
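
The permutation step is the least obvious part of measure_dvs: conditional entropy is re-estimated on shuffled copies of the data, and the difference from the observed value is reported as the de-biased estimate. A hedged standalone sketch of that step:

import numpy as np
from pyitlib import discrete_random_variable as drv

rng = np.random.default_rng(0)
xs = rng.integers(0, 50, 10_000)
ys = (xs + rng.integers(0, 5, 10_000)) % 50

# conditional entropy on the observed pairing
xy_ce = drv.entropy_conditional(xs, ys, base=2).item()

# the same estimator on shuffled (independent) data, averaged over permutations
bias_xy = np.mean([
    drv.entropy_conditional(np.random.permutation(xs),
                            np.random.permutation(ys),
                            base=2).item()
    for _ in range(10)
])
print(xy_ce, bias_xy, bias_xy - xy_ce)  # the difference is what measure_dvs stores as 'dxy'
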
Example 15
def _jmim(selected_feature, feature_set, num_to_select, labels, score_list):
    ###
    # I(x,y;c) = H(x|c) - [ H(x,c,y) - H(c,y) ] + I(y;c) #
    ####
    start = datetime.datetime.now()
    col = list(feature_set)
    pool = []
    for i in col:
        candidate_f = feature_set[i]
        candidate_f = np.reshape(candidate_f.values, (1, -1))
        min_jmi = float('inf')
        min_feature = []
        index = 0
        I_xy_c = 0
        for sf_packge in selected_feature:
            # print('round start at ' + str(datetime.datetime.now()))
            sf = sf_packge[1]
            sf_idx = sf_packge[0]

            I_yc = score_list.iloc[sf_idx, 1]

            sf = np.reshape(sf, (1, -1))
            labels = np.reshape(labels, (1, -1))
            H_c = drv.entropy(labels)
            H_x_c = drv.entropy_conditional(candidate_f, labels)

            xcy = np.append([candidate_f, labels], [sf], axis=0)
            H_xcy = drv.entropy_joint(xcy)

            cy = np.append([labels], [sf], axis=0)
            H_cy = drv.entropy_joint(cy)
            H_y_c = drv.entropy_conditional(sf, labels)

            H_cy2 = H_y_c + H_c

            I_xy_c = H_x_c - (H_xcy - H_cy) + I_yc

            labels = np.reshape(labels, (-1, 1))
            if I_xy_c < min_jmi:
                min_jmi = I_xy_c
                min_feature = candidate_f
                index = int(i)
        if I_xy_c < 0:
            print(f'warning: negative estimate for feature {i}: {I_xy_c}')
        pool.append([index, min_feature, min_jmi])
        # print('round end at ' + str(datetime.datetime.now()))

    max_candidate_score = 0
    max_candidate_idx = 0
    max_candidate = []
    for candidate in pool:
        if float(candidate[2]) > max_candidate_score:
            max_candidate = candidate[1]
            max_candidate_idx = candidate[0]
            max_candidate_score = float(candidate[2])

    selected_feature.append(
        [max_candidate_idx, max_candidate, max_candidate_score])
    feature_set.drop(columns=[str(max_candidate_idx)], inplace=True)

    print(
        str(len(selected_feature)) + ' ' + str(max_candidate_idx) + ' ' +
        str(max_candidate_score) + ' at ' +
        str(datetime.datetime.now() - start))

    return selected_feature, feature_set