from typing import Dict, List, Tuple

import numpy as np
from pyitlib import discrete_random_variable as drv
from pyitlib.discrete_random_variable import entropy_joint


def joint_entropy_score_estimate(subset, X):
    """Estimate the joint entropy of a subset of (discretized) feature columns of X."""
    features = []
    for i in subset:
        # map_continuous_names is a project-local helper that discretizes a column
        features.append(map_continuous_names(np.squeeze(X[:, i])))
    X_categorical = np.vstack(features)
    return entropy_joint(X_categorical)
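# Usage sketch (hedged): map_continuous_names is project-specific and not shown here,
# so this demo substitutes a hypothetical equal-width binner with the same role,
# purely to exercise joint_entropy_score_estimate end to end.
def map_continuous_names(values, nb_bins=10):
    # hypothetical stand-in: bin a continuous 1-D array into integer bin indices
    edges = np.linspace(values.min(), values.max(), nb_bins + 1)[1:-1]
    return np.digitize(values, edges)


X_demo = np.random.rand(500, 5)  # 500 samples, 5 continuous features
print(joint_entropy_score_estimate([0, 2], X_demo))  # joint entropy of features 0 and 2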
def compute_unconditional_entropy(windows: np.ndarray,
                                  col_id: int,
                                  ) -> float:
    """Compute the entropy of all words in a part."""
    y = windows[:, col_id]
    uei = drv.entropy_joint(y).item()
    return uei
def compute_y_entropy(target_windows: np.ndarray,
                      col_id: int,
                      ) -> float:
    """Compute the entropy of next-words."""
    y = target_windows[:, col_id]  # neighbor
    yei = drv.entropy_joint(y).item()
    return yei
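# Sanity check (minimal sketch): for a single 1-D variable, drv.entropy_joint reduces
# to plain entropy, so the two helpers above are ordinary column entropies.
import numpy as np
from pyitlib import discrete_random_variable as drv

windows_demo = np.random.randint(0, 5, size=(1000, 4))
assert np.isclose(drv.entropy_joint(windows_demo[:, 0]).item(),
                  drv.entropy(windows_demo[:, 0]).item())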
import datetime

import pandas as pd


def _jmi(feature_set, labels, score_list):
    ###
    # Joint mutual information between a feature pair (x, y) and the class c:
    # I(x,y;c) = I(y;c) + I(x;c|y)
    #          = I(y;c) + H(x|y) - [ H(x,y,c) - H(y,c) ]
    ###
    results = []
    col = list(feature_set)
    labels = np.reshape(labels, (1, -1))
    for i in col:
        candidate_f = np.reshape(feature_set[i].values, (1, -1))
        print(i + ' start at ' + str(datetime.datetime.now()))
        for j in range(int(i) + 1, 400):  # feature columns are named '0'..'399'
            sf = np.reshape(feature_set[str(j)].values, (1, -1))
            I_yc = score_list.iloc[j, 0]  # precomputed I(y;c)
            H_x_y = drv.entropy_conditional(candidate_f, sf)  # H(x|y)
            xyc = np.vstack((candidate_f, sf, labels))
            H_xyc = drv.entropy_joint(xyc)  # H(x,y,c)
            yc = np.vstack((sf, labels))
            H_yc = drv.entropy_joint(yc)  # H(y,c)
            I_xy_c = float(H_x_y - (H_xyc - H_yc) + I_yc)
            results.append([[i, j], I_xy_c])
        # checkpoint after each candidate feature
        pd.DataFrame(results).to_csv('jmi_2packets.csv')
    print('end at ' + str(datetime.datetime.now()))
    pd.DataFrame(results).to_csv('jmi_2packets.csv')
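# Numeric check (minimal sketch, not from the original code base) that the JMI identity
# used above holds: encode the pair (x, y) as a single variable to get I(x,y;c)
# directly, then compare with I(y;c) + H(x|y) - [H(x,y,c) - H(y,c)] from pyitlib
# primitives.
import numpy as np
from pyitlib import discrete_random_variable as drv

rng = np.random.default_rng(0)
x = rng.integers(0, 4, 1000)
y = rng.integers(0, 4, 1000)
c = (x + y) % 4  # a class that depends on both features

lhs = drv.information_mutual(x * 4 + y, c)  # I(x,y;c) via pair encoding
rhs = (drv.information_mutual(y, c)
       + drv.entropy_conditional(x, y)
       - (drv.entropy_joint(np.vstack((x, y, c))) - drv.entropy_joint(np.vstack((y, c)))))
assert np.isclose(lhs, rhs)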
def measure_vars1(mat: np.ndarray,
                  ) -> Tuple[List[float], List[str]]:
    """Measure info-theory variables."""
    xis, yis = to_x_y(mat)
    mi = drv.information_mutual_normalised(xis, yis, norm_factor='XY')
    je = drv.entropy_joint(np.vstack((xis, yis)))
    xy = drv.entropy_conditional(xis, yis) / je  # H(X|Y) / H(X,Y)
    yx = drv.entropy_conditional(yis, xis) / je  # H(Y|X) / H(X,Y)
    props = [mi, xy, yx]
    names = ['I(X;Y)', 'H(X|Y)', 'H(Y|X)']
    return props, names
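# Identity check (minimal sketch, assuming norm_factor='XY' divides I(X;Y) by the
# joint entropy): the three quantities returned by measure_vars1 then partition
# H(X,Y), so they should sum to 1 on any data.
import numpy as np
from pyitlib import discrete_random_variable as drv

xs = np.random.randint(0, 8, 2000)
ys = np.random.randint(0, 8, 2000)
je = drv.entropy_joint(np.vstack((xs, ys)))
total = (drv.information_mutual_normalised(xs, ys, norm_factor='XY')
         + drv.entropy_conditional(xs, ys) / je
         + drv.entropy_conditional(ys, xs) / je)
assert np.isclose(total, 1.0)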
def compute_joint_entropy(target_windows: np.ndarray,
                          col_id: int,
                          ) -> float:
    """Compute the joint entropy of target words and the words that follow them."""
    x = target_windows[:, -2]      # target word
    y = target_windows[:, col_id]  # neighbor
    x_y = np.vstack((x, y))
    jei = drv.entropy_joint(x_y).item()
    return jei
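# Toy usage (hedged): a small windows matrix where, as assumed above, the target word
# sits in column -2; column 0 serves as the neighbor column.
import numpy as np

target_windows_demo = np.array([[0, 1, 2],
                                [0, 1, 3],
                                [1, 0, 2],
                                [1, 0, 3]])
print(compute_joint_entropy(target_windows_demo, col_id=0))  # H(target, neighbor) in bits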
def mi_ensemble_bound(self, individual_predictions, this_y=None, ensemble_predictions=None):
    """
    Estimate the BER using the Mutual Information-Based Correlation in
    Tumer and Ghosh (2003).

    Parameters
    ----------
    individual_predictions: numpy array
        The dimensions of this array should be |M| by |E|, where |M| is the
        number of labeled data points and |E| is the number of individual
        classifiers. Each element should be a probability (not a 0/1
        prediction).
    """
    if this_y is None:
        this_y = self.y
    if ensemble_predictions is None:
        avg_predictor = individual_predictions.mean(axis=1).round()
    else:
        avg_predictor = ensemble_predictions.round()
    individual_predictions = individual_predictions.round()  # deal with 0/1 predictions

    N = individual_predictions.shape[1]  # number of classifiers in ensemble
    labels = np.repeat(this_y.reshape(-1, 1), N, axis=1)
    accs = np.equal(individual_predictions, labels).mean(axis=0)  # mean accuracy for each classifier
    mean_err = 1 - accs.mean()  # mean err for all classifiers
    ensemble_err = 1 - (this_y == avg_predictor).mean()  # mean err for ensemble classifier

    # calculate average mutual information between each individual classifier's
    # predictions and the ensemble predictor
    ami = drv.information_mutual(
        individual_predictions.T,
        avg_predictor.reshape(1, -1),
        base=np.e,
        cartesian_product=True
    ).mean()

    # TODO: should we measure total entropy by discretizing the classification
    # probabilities into more granular bins? Currently we just use the
    # 0/1 matrix.

    # total entropy in the individual classifiers
    total_entropy = drv.entropy_joint(individual_predictions.T, base=np.e)

    # delta is the normalized ami
    delta = ami / total_entropy
    assert 0 <= delta <= 1

    # formula from Tumer and Ghosh
    be = (N * ensemble_err - ((N - 1) * delta + 1) * mean_err) / ((N - 1) * (1 - delta))
    return be
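# Usage sketch (hedged): because `self` is only consulted when this_y is None, the
# method can be exercised unbound with self=None on synthetic predictions.
import numpy as np

rng = np.random.default_rng(1)
y_true = rng.integers(0, 2, 200).astype(float)
# three noisy probabilistic classifiers centered on the true labels
preds = np.clip(y_true[:, None] + rng.normal(0, 0.4, size=(200, 3)), 0, 1)
print(mi_ensemble_bound(None, preds, this_y=y_true))  # BER estimate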
def calc_nmi_score(labels_true, labels_pred):
    """Calculate normalized mutual information score.

    Parameters
    ----------
    labels_true: labels from ground truth
    labels_pred: labels from clustering

    Return
    ------
    nmi: normalized mutual information score
    """
    H_true = drv.entropy(labels_true, base=2)
    H_pred = drv.entropy(labels_pred, base=2)
    H_joint = drv.entropy_joint([labels_true, labels_pred], base=2)
    mi = H_true + H_pred - H_joint
    nmi = mi / max(H_true, H_pred)
    return nmi
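# Quick demo (minimal sketch): label assignments that induce the same partition give
# NMI = 1, even when the cluster ids themselves are permuted.
import numpy as np

truth = np.array([0, 0, 1, 1, 2, 2])
pred = np.array([2, 2, 0, 0, 1, 1])  # same partition, different ids
assert np.isclose(calc_nmi_score(truth, pred), 1.0)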
import logging

import anndata as ad


def main():
    """Run script."""
    # build_parser and utils are project-local helpers
    parser = build_parser()
    args = parser.parse_args()

    adata = ad.read_h5ad(args.h5ad)
    logging.info(f"Read {args.h5ad} for adata of {adata.shape}")

    if args.discrete:
        # Use the discrete algorithm from pyitlib
        # https://pafoster.github.io/pyitlib/#discrete_random_variable.entropy_joint
        # https://github.com/pafoster/pyitlib/blob/master/pyitlib/discrete_random_variable.py#L3535
        # Successive realisations of a random variable are indexed by the last
        # axis in the array; multiple random variables may be specified using
        # preceding axes. In other words, different variables are axis 0 and
        # samples are axis 1. This is contrary to the default ML format, which
        # puts samples on axis 0 and variables on axis 1. Therefore we must transpose.
        input_arr = utils.ensure_arr(adata.X).T
        h = drv.entropy_joint(input_arr, base=np.e)
        logging.info(f"Found discrete joint entropy of {h:.6f}")
    else:
        raise NotImplementedError
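# Axis-convention demo (minimal sketch): pyitlib expects variables on axis 0 and
# realisations on axis 1, so an ML-style (samples x variables) matrix must be
# transposed before calling entropy_joint.
import numpy as np
from pyitlib import discrete_random_variable as drv

samples_by_vars = np.random.randint(0, 3, size=(1000, 2))  # ML layout
h_joint = drv.entropy_joint(samples_by_vars.T)  # joint entropy of the 2 variables
print(h_joint)  # close to log2(9) ~ 3.17 bits for independent uniform variables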
# get outcomes - the words that occur in the last 2 slots of probe windows
x_windows = get_windows(prep, corpus.x, col_id=-3)
cx, ry, cx_ry = get_outcomes(prep, x_windows)

# make co-occurrence matrix
cf_mat = np.ones((corpus.num_y, corpus.num_x))
for cxi, ryi in zip(cx, ry):
    cf_mat[corpus.y.index(ryi), corpus.x.index(cxi)] += 1

# make co-occurrence plot
if SHOW_HEATMAP:
    print(np.max(cf_mat))
    print(np.min(cf_mat))
    fig, ax = make_heatmap_fig(cf_mat)
    ce = drv.entropy_conditional(cx, ry).item()
    je = drv.entropy_joint(cx_ry).item()
    ye = drv.entropy_joint(ry).item()
    plt.title(f'Toy Corpus'
              f'\nH(x-word|y-word)={ce:.4f}'
              f'\nH(x-word,y-word)={je:.4f}'
              f'\nH(y-word)={ye:.4f}')
    plt.show()

# collect singular values for plotting
cf_mat_intact = scale(cf_mat, axis=1, with_std=False, with_mean=False)
cf_mat_scaled = scale(cf_mat, axis=1, with_std=False, with_mean=True)  # subtracting mean from rows
s_intact = np.linalg.svd(cf_mat_intact, compute_uv=False)
s_scaled = np.linalg.svd(cf_mat_scaled, compute_uv=False)
s_list_intact.append(np.asarray(s_intact[:NUM_S_DIMS]))
s_list_scaled.append(np.asarray(s_scaled[:NUM_S_DIMS]))
from sklearn.preprocessing import minmax_scale


def jemmig(factors, codes, continuous_factors=True, nb_bins=10):
    ''' JEMMIG metric from K. Do and T. Tran,
        "Theory and evaluation metrics for learning disentangled representations,"
        in ICLR, 2020.

    :param factors:             dataset of factors;
                                each column is a factor and each line is a data point
    :param codes:               latent codes associated to the dataset of factors;
                                each column is a latent code and each line is a data point
    :param continuous_factors:  True:   factors are described as continuous variables
                                False:  factors are described as discrete variables
    :param nb_bins:             number of bins to use for discretization
    '''
    # count the number of factors and latent codes
    nb_factors = factors.shape[1]
    nb_codes = codes.shape[1]

    # quantize factors if they are continuous
    if continuous_factors:
        factors = minmax_scale(factors)  # normalize all columns into [0, 1]
        factors = get_bin_index(factors, nb_bins)  # quantize values and get indexes

    # quantize latent codes
    codes = minmax_scale(codes)  # normalize all columns into [0, 1]
    codes = get_bin_index(codes, nb_bins)  # quantize values and get indexes

    # compute mutual information matrix
    mi_matrix = np.zeros((nb_factors, nb_codes))
    for f in range(nb_factors):
        for c in range(nb_codes):
            mi_matrix[f, c] = get_mutual_information(factors[:, f], codes[:, c], normalize=False)

    # compute joint entropy matrix
    je_matrix = np.zeros((nb_factors, nb_codes))
    for f in range(nb_factors):
        for c in range(nb_codes):
            X = np.stack((factors[:, f], codes[:, c]), 0)
            je_matrix[f, c] = drv.entropy_joint(X)

    # compute the mean gap for all factors
    sum_gap = 0
    for f in range(nb_factors):
        mi_f = np.sort(mi_matrix[f, :])
        je_idx = np.argsort(mi_matrix[f, :])[-1]

        # compute unnormalized JEMMIG
        jemmig_not_normalized = je_matrix[f, je_idx] - mi_f[-1] + mi_f[-2]

        # normalize by H(f) + log2(nb_bins)
        jemmig_f = jemmig_not_normalized / (drv.entropy_joint(factors[:, f]) + np.log2(nb_bins))
        jemmig_f = 1 - jemmig_f
        sum_gap += jemmig_f

    # compute the mean gap
    jemmig_score = sum_gap / nb_factors

    return jemmig_score
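# The two helpers used above are project-local; these minimal sketches show plausible
# implementations under stated assumptions (uniform bins on [0, 1], MI in bits via
# pyitlib), and are not the authors' exact code.
import numpy as np
from pyitlib import discrete_random_variable as drv


def get_bin_index(x, nb_bins):
    # assign each value in [0, 1] to one of nb_bins equal-width bins (hypothetical)
    edges = np.linspace(0.0, 1.0, nb_bins + 1)[1:-1]
    return np.digitize(x, edges)


def get_mutual_information(x, y, normalize=False):
    # plain (or joint-entropy-normalized) MI between two discrete 1-D arrays (hypothetical)
    if normalize:
        return drv.information_mutual_normalised(x, y, norm_factor='XY')
    return drv.information_mutual(x, y)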
from pyitlib import discrete_random_variable as drv
import numpy as np

N = 100_000
SHAPES = [(650, 2000), (650, 2500)]

print('joint entropy')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        xy = np.vstack((xs, ys))
        res_i = drv.entropy_joint(xy)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')

print('conditional entropy x|y')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        res_i = drv.entropy_conditional(xs, ys)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')

print('conditional entropy y|x')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        res_i = drv.entropy_conditional(ys, xs)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')
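# Reference point (minimal sketch): for independent uniform variables the true joint
# entropy is log2(num_x * num_y); comparing it with the averages printed above makes
# the downward bias of the plug-in estimator visible when N is not >> num_x * num_y.
import numpy as np

for num_x, num_y in [(650, 2000), (650, 2500)]:
    print(f'true H(x,y) = {np.log2(num_x * num_y):.4f} bits')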
def D(data, col1, col2, **kwargs):
    """Dependence distance for two dataframe columns."""
    var_info = drv.information_variation(data[[col1, col2]].T, **kwargs)
    joint_ent = drv.entropy_joint(data[[col1, col2]].T, **kwargs)
    return (var_info / joint_ent)[0, 1]
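# Usage sketch (hedged): dependence distance is VI(X,Y)/H(X,Y) = 1 - I(X;Y)/H(X,Y),
# so identical columns give D = 0 and independent columns approach D = 1.
import numpy as np
import pandas as pd

df_demo = pd.DataFrame({'a': np.random.randint(0, 4, 5000)})
df_demo['b'] = df_demo['a']                    # perfectly dependent
df_demo['c'] = np.random.randint(0, 4, 5000)  # ~independent of 'a'
print(D(df_demo, 'a', 'b'))  # ~0.0
print(D(df_demo, 'a', 'c'))  # ~1.0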
def measure_dvs(params: Params,
                co_data: CoData,
                ) -> Dict[str, float]:
    """
    Collect all DVs in a single condition.

    A condition is a specific configuration of IV realizations.
    """
    res = {}

    co_mat_coo: sparse.coo_matrix = co_data.as_matrix(params.direction)
    co_mat_csr: sparse.csr_matrix = co_mat_coo.tocsr()

    # save for offline analysis
    path_to_pkl = configs.Dirs.co_data / f'co_data_age={params.age}' \
                                         f'_punct={params.punctuation}' \
                                         f'_contr={params.targets_control}' \
                                         f'_lemma={params.lemmas}.pkl'
    with path_to_pkl.open('wb') as f:
        pickle.dump(co_data, f)

    # type and token frequency
    res['x-tokens'] = co_mat_coo.sum().item() // 2 if params.direction == 'b' else co_mat_coo.sum().item()
    res['x-types'] = co_mat_coo.shape[0]
    res['y-types'] = co_mat_coo.shape[1]

    # normalize columns
    if params.normalize_cols:
        co_mat_csr = normalize(co_mat_csr, axis=1, copy=False)
        print(co_mat_csr.sum())

    # svd
    # don't use sparse svd: doesn't result in accurate reconstruction.
    # don't normalize before svd: otherwise relative differences between rows and columns are lost
    u, s, vt = np.linalg.svd(co_mat_csr.toarray(), compute_uv=True)
    assert np.max(s) == s[0]
    res['s1/sum(s)'] = s[0] / np.sum(s)
    res['frag'] = 1 - (s[0] / np.sum(s))

    # info theory analysis
    if params.direction == 'b':
        xs, ys, zs = co_data.get_x_y_z()
        xyz = np.vstack((xs, ys, zs))
        xyz_je = drv.entropy_joint(xyz)
        nii = drv.information_interaction(xyz).item() / xyz_je
    else:
        nii = np.nan  # need 3 rvs to compute interaction information
    xs, ys = co_data.get_x_y(params.direction)
    xy = np.vstack((xs, ys))
    xy_je = drv.entropy_joint(xy)

    # compute entropy on permuted data for de-biasing estimates
    bias_xy = np.mean([drv.entropy_conditional(np.random.permutation(xs),
                                               np.random.permutation(ys),
                                               base=2).item()
                       for _ in range(10)])
    bias_yx = np.mean([drv.entropy_conditional(np.random.permutation(ys),
                                               np.random.permutation(xs),
                                               base=2).item()
                       for _ in range(10)])
    print(f'bias_xy={bias_xy:.4f}')
    print(f'bias_yx={bias_yx:.4f}')

    xy_ce = drv.entropy_conditional(xs, ys).item()
    yx_ce = drv.entropy_conditional(ys, xs).item()
    res[' xy'] = xy_ce            # biased
    res[' yx'] = yx_ce
    res['dxy'] = bias_xy - xy_ce  # de-biased
    res['dyx'] = bias_yx - yx_ce
    res['nxy'] = xy_ce / xy_je    # biased + normalized
    res['nyx'] = yx_ce / xy_je
    # res['nii'] = nii
    # res['nmi'] = drv.information_mutual_normalised(xs, ys, norm_factor='XY').item()
    # res['ami'] = adjusted_mutual_info_score(xs, ys, average_method="arithmetic")
    # res[' je'] = xy_je

    # round
    for k, v in res.items():
        if isinstance(v, float):
            res[k] = round(v, 3)

    if configs.Fig.max_projection > 0:
        plot_reconstructions(co_mat_coo, params, max_dim=configs.Fig.max_projection)

    # which row or column is most active in projection on first singular dim?
    # note: if lemmas=True, row words may include experimental targets
    # because lemmas of control target plural nouns are singular nouns
    row_words, col_words = co_data.get_words_ordered_by_id(params.direction)
    if len(row_words) != co_mat_csr.shape[0]:
        raise RuntimeError(f'Number of row words ({len(row_words)}) != Number of rows ({co_mat_csr.shape[0]})')
    if len(col_words) != co_mat_csr.shape[1]:
        raise RuntimeError(f'Number of column words ({len(col_words)}) != Number of columns ({co_mat_csr.shape[1]})')
    projection1 = calc_projection(u, s, vt, 0)
    max_row_id = np.argmax(projection1.sum(axis=1))
    max_col_id = np.argmax(projection1.sum(axis=0))
    print(f'Word with largest sum={np.max(projection1.sum(axis=1))} in first projection row="{row_words[max_row_id]}"')
    print(f'Word with largest sum={np.max(projection1.sum(axis=0))} in first projection col="{col_words[max_col_id]}"')

    return res
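# calc_projection is project-local; a minimal sketch under the natural assumption that
# "projection on singular dim k" means the rank-1 SVD component s[k] * u[:, k] vt[k, :]:
import numpy as np


def calc_projection(u, s, vt, dim):
    # rank-1 reconstruction of the matrix from singular dimension `dim` (hypothetical)
    return s[dim] * np.outer(u[:, dim], vt[dim, :])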
def _jmim(selected_feature, feature_set, num_to_select, labels, score_list):
    ###
    # Joint mutual information between a feature pair (x, y) and the class c:
    # I(x,y;c) = I(y;c) + H(x|y) - [ H(x,y,c) - H(y,c) ]
    ###
    start = datetime.datetime.now()
    col = list(feature_set)
    labels_row = np.reshape(labels, (1, -1))
    pool = []
    for i in col:
        candidate_f = np.reshape(feature_set[i].values, (1, -1))
        min_jmi = float('inf')
        min_feature = []
        index = 0
        for sf_package in selected_feature:
            sf = np.reshape(sf_package[1], (1, -1))
            sf_idx = sf_package[0]
            I_yc = score_list.iloc[sf_idx, 1]  # precomputed I(y;c)
            H_x_y = drv.entropy_conditional(candidate_f, sf)  # H(x|y)
            xyc = np.vstack((candidate_f, sf, labels_row))
            H_xyc = drv.entropy_joint(xyc)  # H(x,y,c)
            yc = np.vstack((sf, labels_row))
            H_yc = drv.entropy_joint(yc)  # H(y,c)
            I_xy_c = H_x_y - (H_xyc - H_yc) + I_yc
            # JMIM keeps, for each candidate x, the minimum JMI over selected features y
            if I_xy_c < min_jmi:
                min_jmi = I_xy_c
                min_feature = candidate_f
                index = int(i)
        pool.append([index, min_feature, min_jmi])

    # select the candidate whose minimum JMI is largest (max-min criterion)
    max_candidate_score = 0
    max_candidate_idx = 0
    max_candidate = []
    for candidate in pool:
        if float(candidate[2]) > max_candidate_score:
            max_candidate = candidate[1]
            max_candidate_idx = candidate[0]
            max_candidate_score = float(candidate[2])
    selected_feature.append([max_candidate_idx, max_candidate, max_candidate_score])
    feature_set.drop(columns=[str(max_candidate_idx)], inplace=True)
    print(str(len(selected_feature)) + ' ' + str(max_candidate_idx) + ' '
          + str(max_candidate_score) + ' at ' + str(datetime.datetime.now() - start))
    return selected_feature, feature_set