def collect_data(ws: np.ndarray) -> Tuple[float, float]:

    ###############
    # val1: use all windows; this provides a control/reference from which to measure difference to val2
    # (should be 0.0 when using toy corpus)
    ###############

    # val1
    x1 = ws[:, -2]  # all words
    y1 = ws[:, -2 + DISTANCE]  # neighbors
    val1i = drv.entropy_conditional(x1, y1).item() / drv.entropy(x1).item()

    ###############
    # val2: use only target windows
    # in theory, this should be invariant to the number of target types
    ###############

    # target windows
    row_ids = np.isin(ws[:, -2], target_ids)
    target_windows = ws[row_ids]

    # val2
    x2 = target_windows[:, -2]  # targets
    y2 = target_windows[:, -2 + DISTANCE]  # neighbors
    val2i = drv.entropy_conditional(x2, y2).item() / drv.entropy(x2).item()

    print(f'{len(ws):>12,} | val1={val1i:.3f} val2={val2i:.2f}')

    return val1i, val2i
def measure_vars1(mat: np.ndarray,
                  ) -> Tuple[List[float], List[str]]:
    """measure info theory variables"""
    xis, yis = to_x_y(mat)
    mi = drv.information_mutual_normalised(xis, yis, norm_factor='XY')
    xys = np.vstack((xis, yis))
    je = drv.entropy_joint(xys)
    xy = drv.entropy_conditional(xis, yis) / je  # H(X|Y), normalized by joint entropy
    yx = drv.entropy_conditional(yis, xis) / je  # H(Y|X), normalized by joint entropy

    props = [mi, xy, yx]
    names = ['I(X;Y)', 'H(X|Y)', 'H(Y|X)']

    return props, names
def make_candidate(m: np.ndarray,
                   ) -> Candidate:
    # convert matrix to RVs
    xs, ys = to_pyitlib_format(m)

    u, s, v = randomized_svd(m, n_components=m.shape[1])

    return Candidate(
        matrix=m,
        df=pd.DataFrame(data=to_columnar(m)),  # matrix in df format
        hxy=drv.entropy_conditional(xs, ys).round(2),
        hyx=drv.entropy_conditional(ys, xs).round(2),
        ami=adjusted_mutual_info_score(xs, ys, average_method="arithmetic").round(2),
        s1p=s[0] / s.sum(),
    )
def Hcond(self):
    """
    The conditional entropy matrix (i, j) = H(Xi | Xj).
    A pandas DataFrame or a 2D numpy array depending on dataset type
    """
    if self._Hcond is None:
        # Old attempt to do it ourselves
        # (0) init H
        # nb_vars = len(df.columns)
        # H = np.empty((nb_vars, nb_vars), dtype=float)
        # (1) for each column compute the counts per value
        # (2) for each (i, j) pair compute the counts

        # Using pyitlib to compute H (hopefully efficiently)
        # Unfortunately this does not work with numpy arrays, convert to pandas  TODO report
        # note: we convert to string type to avoid a bug with ints.  TODO...
        self._Hcond = drv.entropy_conditional(self.dataset_df.T.astype(str))

        # add the row/column headers
        if not self.is_nparray:
            self._Hcond = pd.DataFrame(self._Hcond,
                                       index=self.varnames,
                                       columns=self.varnames)

        # basic sanity check: entries should all be non-negative
        assert np.all(self._Hcond >= 0)
    return self._Hcond
def conditional_entropy_in_dct(list_36_gray_img, list_64_gray_img):
    from scipy.fftpack import dct
    from pyitlib import discrete_random_variable as drv

    # Variables
    list_36_dct = []
    list_36_conditional_entropy = []
    chain_list_36_dct = []
    chain_list_36_origin = []

    # Convert each block to DCT-II
    for i in range(len(list_36_gray_img)):
        list_36_dct.append(dct(list_36_gray_img[i]))

    # Flatten each DCT block and each original block into a 1-D chain
    for i in range(len(list_36_dct)):
        chain_list_36_dct.append(np.concatenate(list_36_dct[i]).ravel())

    for i in range(len(list_36_gray_img)):
        chain_list_36_origin.append(np.concatenate(list_36_gray_img[i]).ravel())

    # Apply conditional entropy to each block/slice
    for i in range(len(chain_list_36_dct)):
        list_36_conditional_entropy.append(
            drv.entropy_conditional(chain_list_36_dct[i],
                                    chain_list_36_origin[i],
                                    base=np.exp(2)))

    return list_36_conditional_entropy
def calculate_weights(self, discretized_data: pd.DataFrame):
    """
    Provide calculation of link strength according to mutual information
    between a node and the values of its parent(s).
    """
    import bamt.utils.GraphUtils as gru

    if not all([i in ['disc', 'disc_num']
                for i in gru.nodes_types(discretized_data).values()]):
        logger_network.error(
            "calculate_weights() method deals only with discrete data. Continuous data: " +
            f"{[col for col, col_type in gru.nodes_types(discretized_data).items() if col_type not in ['disc', 'disc_num']]}"
        )
    if not self.edges:
        logger_network.error(
            "Bayesian Network hasn't been fitted yet. Please add edges with the add_edges() method")
    if not self.nodes:
        logger_network.error(
            "Bayesian Network hasn't been fitted yet. Please add nodes with the add_nodes() method")

    weights = dict()

    for node in self.nodes:
        parents = node.cont_parents + node.disc_parents
        if not parents:
            continue
        y = discretized_data[node.name].values
        if len(parents) == 1:
            x = discretized_data[parents[0]].values
            LS_true = drv.information_mutual(X=y, Y=x)
            entropy = drv.entropy(X=y)
            weight = LS_true / entropy
            weights[(parents[0], node.name)] = weight
        else:
            for parent_node in parents:
                x = discretized_data[parent_node].values
                other_parents = [tmp for tmp in parents if tmp != parent_node]
                z = list()
                for other_parent in other_parents:
                    z.append(list(discretized_data[other_parent].values))
                LS_true = np.average(
                    drv.information_mutual_conditional(X=y, Y=x, Z=z,
                                                       cartesian_product=True))
                entropy = np.average(
                    drv.entropy_conditional(X=y, Y=z,
                                            cartesian_product=True)) + 1e-8
                weight = LS_true / entropy
                weights[(parent_node, node.name)] = weight

    self.weights = weights
def theils_u(self, x, y):
    # Theil's U (uncertainty coefficient): U(x|y) = (H(x) - H(x|y)) / H(x)
    s_xy = drv.entropy_conditional(x, y)
    # note: drv.entropy expects realizations (not a probability vector),
    # so the entropy of x is computed directly from its observed values
    s_x = drv.entropy(x)
    if s_x == 0:
        return 1
    else:
        return (s_x - s_xy) / s_x
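# Minimal sanity check of the uncertainty coefficient above (a sketch; the toy
# arrays are illustrative and the surrounding class is assumed, so the ratio is inlined):
import numpy as np
from pyitlib import discrete_random_variable as drv

x = np.array([0, 0, 1, 1, 2, 2])
y_determines_x = np.array([5, 5, 6, 6, 7, 7])   # y -> x, so U(x|y) should be 1
y_uninformative = np.array([0, 1, 0, 1, 0, 1])  # tells us nothing about x, so U(x|y) should be 0

for y in (y_determines_x, y_uninformative):
    h_x = drv.entropy(x)
    h_x_given_y = drv.entropy_conditional(x, y)
    print(float((h_x - h_x_given_y) / h_x))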
def compute_norm_cond_entropy_corr(data_df, attrs_from, attrs_to):
    """
    Computes the correlations between attributes by calculating
    the normalized conditional entropy between them. The conditional
    entropy is asymmetric, therefore we need pairwise computation.

    The computed correlations are stored in a dictionary in the format:
    {
      attr_a: { cond_attr_i: corr_strength_a_i,
                cond_attr_j: corr_strength_a_j, ... },
      attr_b: { cond_attr_i: corr_strength_b_i, ... }
    }

    :return a dictionary of correlations
    """
    corr = {}
    # Compute pair-wise conditional entropy.
    for x in attrs_from:
        corr[x] = {}
        for y in attrs_to:
            # Set correlation to 1 for same attributes.
            if x == y:
                corr[x][y] = 1.0
                continue
            xy_df = data_df[[x, y]]
            xy_df = xy_df.loc[~(xy_df[x] == NULL_REPR) & ~(xy_df[y] == NULL_REPR)]
            x_vals = xy_df[x]
            x_domain_size = x_vals.nunique()
            # Set correlation to 0.0 if x has only one possible value (its entropy is 0).
            if x_domain_size == 1 or len(xy_df) == 0:
                corr[x][y] = 0.0
                continue
            # Compute the conditional entropy H(x|y) = H(x,y) - H(y).
            # H(x,y) denotes H(x U y).
            # If H(x|y) = 0, then y determines x, i.e., y -> x.
            # Use the domain size of x as a log base for normalization.
            y_vals = xy_df[y]
            x_y_entropy = drv.entropy_conditional(x_vals, y_vals,
                                                  base=x_domain_size).item()
            # The conditional entropy is 0 for strongly correlated attributes and 1 for
            # completely independent attributes. We reverse this to reflect the correlation.
            corr[x][y] = 1.0 - x_y_entropy
    return corr
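# Quick check of the domain-size normalization used above (toy columns, purely
# illustrative): with the log base set to x's domain size, H(x|y) falls in [0, 1].
import numpy as np
from pyitlib import discrete_random_variable as drv

x_vals = np.array([0, 1, 2, 0, 1, 2])
y_vals = np.array([7, 7, 7, 8, 8, 8])  # independent of x
h = drv.entropy_conditional(x_vals, y_vals, base=3).item()  # base = x's domain size (3)
print(1.0 - h)  # correlation strength; ~0.0 because y reveals nothing about x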
def NTE_Measure(x, y, maxlen):
    # normalized transfer entropy: (TE(x->y) - mean shuffled-source TE) / H(y_t | y_{t-1})
    p = 0
    n = len(y)
    settings = {'history_target': maxlen,
                'tau_sources': 2}
    net = JidtGaussianTE(settings)
    for _ in range(30):
        idx = np.random.permutation(1750)  # note: assumes len(x) == 1750
        x_shuffle = x[idx]
        p = p + net.estimate(x_shuffle, y)
    p = p / 30
    texy = net.estimate(x, y)
    info = drv.entropy_conditional(y[1:n], y[0:n - 1], estimator="MINIMAX")
    return (texy - p) / info
def _jmi(feature_set, labels, score_list):
    ###
    # I(x,y;c) = H(x,c) - H(c) - [ H(x,c,y) - H(c,y) ] + I(y;c)
    ###
    results = []
    col = list(feature_set)
    labels = np.reshape(labels, (1, -1))
    for i in col:
        candidate_f = feature_set[i]
        candidate_f = np.reshape(candidate_f.values, (1, -1))
        print(i + ' start at ' + str(datetime.datetime.now()))
        for j in range(int(i) + 1, 400):
            sf = feature_set[str(j)]
            sf = np.reshape(sf.values, (1, -1))
            I_yc = score_list.iloc[j, 0]

            H_x_c = drv.entropy_conditional(candidate_f, labels)

            xcy = np.append([candidate_f, labels], [sf], axis=0)
            H_xcy = drv.entropy_joint(xcy)

            # cy = np.append([labels], [sf], axis=0)
            cy = np.concatenate((labels, sf))
            cy = np.reshape(cy, (-1, 2))
            H_cy = drv.entropy_joint(cy)

            I_xy_c = float(H_x_c - (H_xcy - H_cy) + I_yc)
            results.append([[i, j], I_xy_c])
            if I_xy_c < 0:
                print()

        df = pd.DataFrame(results)
        df.to_csv('jmi_2packets.csv')
        print('end at ' + str(datetime.datetime.now()))

    df = pd.DataFrame(results)
    df.to_csv('jmi_2packets.csv')
num_entropic_observations = N // fraction
num_remaining_observations = N - num_entropic_observations
xs = [np.random.choice(vocab_x, size=N, p=p_x) for _ in range(2)]
ys = [
    np.hstack((
        np.random.choice(vocab_y[:x_tick],
                         size=num_entropic_observations,
                         p=None),  # no probabilities because these contexts are entropic
        np.random.choice(vocab_y[x_tick:],
                         size=num_remaining_observations,
                         p=p_y[x_tick:] / p_y[x_tick:].sum()),
    ))
    for _ in range(2)
]
raw = np.mean([drv.entropy_conditional(x, y) for x, y in zip(xs, ys)])  # raw is never biased
fraction2xys_age1[fraction].append(raw)

# no entropic contexts - and larger shape (simulates age group 2)
if RESPECT_SHAPE_DIFF:
    num_rows_age2 = NUM_ROWS + 30
    num_cols_age2 = NUM_ROWS + 600
else:
    num_rows_age2 = NUM_ROWS + 0
    num_cols_age2 = NUM_ROWS + 0
xs = [np.random.choice(vocab_x, size=N, p=p_x) for _ in range(2)]
ys = [np.random.choice(vocab_y, size=N, p=p_y) for _ in range(2)]
# get outcomes - the words that occur in the last 2 slots of probe windows
x_windows = get_windows(prep, corpus.x, col_id=-3)
cx, ry, cx_ry = get_outcomes(prep, x_windows)

# make co-occurrence matrix
cf_mat = np.ones((corpus.num_y, corpus.num_x))
for cxi, ryi in zip(cx, ry):
    cf_mat[corpus.y.index(ryi), corpus.x.index(cxi)] += 1

# make co-occurrence plot
if SHOW_HEATMAP:
    print(np.max(cf_mat))
    print(np.min(cf_mat))
    fig, ax = make_heatmap_fig(cf_mat)
    ce = drv.entropy_conditional(cx, ry).item()
    je = drv.entropy_joint(cx_ry).item()
    ye = drv.entropy_joint(ry).item()
    plt.title(f'Toy Corpus\nH(x-word|y-word)={ce:.4f}\nH(x-word,y-word)={je:.4f}\nH(y-word)={ye:.4f}')
    plt.show()

# collect singular values for plotting
cf_mat_intact = scale(cf_mat, axis=1, with_std=False, with_mean=False)
cf_mat_scaled = scale(cf_mat, axis=1, with_std=False, with_mean=True)  # subtracting mean from rows
s_intact = np.linalg.svd(cf_mat_intact, compute_uv=False)
s_scaled = np.linalg.svd(cf_mat_scaled, compute_uv=False)
s_list_intact.append(np.asarray(s_intact[:NUM_S_DIMS]))
s_list_scaled.append(np.asarray(s_scaled[:NUM_S_DIMS]))
def measure_dvs(
        params: Params,
        co_data: CoData,
) -> Dict[str, float]:
    """
    collect all DVs in a single condition.

    a condition is a specific configuration of IV realizations
    """

    res = {}

    co_mat_coo: sparse.coo_matrix = co_data.as_matrix(params.direction)
    co_mat_csr: sparse.csr_matrix = co_mat_coo.tocsr()

    # save for offline analysis
    path_to_pkl = configs.Dirs.co_data / f'co_data_age={params.age}' \
                                         f'_punct={params.punctuation}' \
                                         f'_contr={params.targets_control}' \
                                         f'_lemma={params.lemmas}.pkl'
    with path_to_pkl.open('wb') as f:
        pickle.dump(co_data, f)

    # type and token frequency
    res['x-tokens'] = co_mat_coo.sum().item() // 2 if params.direction == 'b' else co_mat_coo.sum().item()
    res['x-types'] = co_mat_coo.shape[0]
    res['y-types'] = co_mat_coo.shape[1]

    # normalize columns
    if params.normalize_cols:
        co_mat_csr = normalize(co_mat_csr, axis=1, copy=False)
        print(co_mat_csr.sum())

    # svd
    # don't use sparse svd: doesn't result in accurate reconstruction.
    # don't normalize before svd: otherwise relative differences between rows and columns are lost
    u, s, vt = np.linalg.svd(co_mat_csr.toarray(), compute_uv=True)
    assert np.max(s) == s[0]
    res['s1/sum(s)'] = s[0] / np.sum(s)
    res['frag'] = 1 - (s[0] / np.sum(s))

    # info theory analysis
    if params.direction == 'b':
        xs, ys, zs = co_data.get_x_y_z()
        xyz = np.vstack((xs, ys, zs))
        xyz_je = drv.entropy_joint(xyz)
        nii = drv.information_interaction(xyz).item() / xyz_je
    else:
        nii = np.nan  # need 3 rvs to compute interaction information
    xs, ys = co_data.get_x_y(params.direction)
    xy = np.vstack((xs, ys))
    xy_je = drv.entropy_joint(xy)

    # compute entropy on permuted data for de-biasing estimates
    bias_xy = np.mean([drv.entropy_conditional(np.random.permutation(xs),
                                               np.random.permutation(ys),
                                               base=2).item()
                       for _ in range(10)])
    bias_yx = np.mean([drv.entropy_conditional(np.random.permutation(ys),
                                               np.random.permutation(xs),
                                               base=2).item()
                       for _ in range(10)])
    print(f'bias_xy={bias_xy:.4f}')
    print(f'bias_yx={bias_yx:.4f}')

    xy_ce = drv.entropy_conditional(xs, ys).item()
    yx_ce = drv.entropy_conditional(ys, xs).item()

    res[' xy'] = xy_ce  # biased
    res[' yx'] = yx_ce
    res['dxy'] = bias_xy - xy_ce  # de-biased
    res['dyx'] = bias_yx - yx_ce
    res['nxy'] = xy_ce / xy_je  # biased + normalized
    res['nyx'] = yx_ce / xy_je
    # res['nii'] = nii
    # res['nmi'] = drv.information_mutual_normalised(xs, ys, norm_factor='XY').item()
    # res['ami'] = adjusted_mutual_info_score(xs, ys, average_method="arithmetic")
    # res[' je'] = xy_je

    # round
    for k, v in res.items():
        if isinstance(v, float):
            res[k] = round(v, 3)

    if configs.Fig.max_projection > 0:
        plot_reconstructions(co_mat_coo, params, max_dim=configs.Fig.max_projection)

    # which row or column is most active in projection on first singular dim?
    # note: if lemmas=True, row words may include experimental targets
    # because lemmas of control target plural nouns are singular nouns
    row_words, col_words = co_data.get_words_ordered_by_id(params.direction)
    if len(row_words) != co_mat_csr.shape[0]:
        raise RuntimeError(f'Number of row words ({len(row_words)}) != '
                           f'Number of rows ({co_mat_csr.shape[0]})')
    if len(col_words) != co_mat_csr.shape[1]:
        raise RuntimeError(f'Number of column words ({len(col_words)}) != '
                           f'Number of columns ({co_mat_csr.shape[1]})')
    projection1 = calc_projection(u, s, vt, 0)
    max_row_id = np.argmax(projection1.sum(axis=1))
    max_col_id = np.argmax(projection1.sum(axis=0))
    print(f'Word with largest sum={np.max(projection1.sum(axis=1))} '
          f'in first projection row="{row_words[max_row_id]}"')
    print(f'Word with largest sum={np.max(projection1.sum(axis=0))} '
          f'in first projection col="{col_words[max_col_id]}"')

    return res
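# Standalone sketch of the permutation baseline used in measure_dvs above, on
# synthetic data (the toy arrays and the dependence between xs and ys are
# illustrative, not from the source): shuffling both variables destroys their
# association, so the shuffled conditional entropy approximates the value
# expected under independence at the same sample size and alphabet.
import numpy as np
from pyitlib import discrete_random_variable as drv

rng = np.random.default_rng(0)
xs = rng.integers(0, 50, size=1_000)
ys = (xs + rng.integers(0, 5, size=1_000)) % 50  # ys partly determined by xs

observed = drv.entropy_conditional(xs, ys, base=2).item()
baseline = np.mean([drv.entropy_conditional(rng.permutation(xs),
                                            rng.permutation(ys),
                                            base=2).item()
                    for _ in range(10)])
print(f'H(X|Y) observed={observed:.3f}  permutation baseline={baseline:.3f}  '
      f'de-biased difference={baseline - observed:.3f}')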
print('joint entropy')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        xy = np.vstack((xs, ys))
        res_i = drv.entropy_joint(xy)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')

print('conditional entropy x|y')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        res_i = drv.entropy_conditional(xs, ys)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')

print('conditional entropy y|x')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        res_i = drv.entropy_conditional(ys, xs)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')
def info_gain(feature_vals: np.ndarray, y_vals: np.ndarray) -> float:
    # information gain IG(y; x) = H(y) - H(y|x)
    h_y = drv.entropy(y_vals)
    h_y_given_x = drv.entropy_conditional(y_vals, feature_vals)
    return h_y - h_y_given_x
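# Toy usage sketch for info_gain above (values are illustrative): a feature that
# fully determines the labels yields a gain of H(labels) = 1 bit.
import numpy as np

feature = np.array([0, 0, 1, 1])
labels = np.array([1, 1, 0, 0])  # perfectly predicted by the feature
print(info_gain(feature, labels))  # ~1.0: H(labels) - H(labels|feature) = 1 - 0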
def get_entropy(dataset, sensitive_attr, top_n=5):
    sensitive_index = dataset.feature_names.index(sensitive_attr)
    res = []

    # Marginal entropies
    res.append(drv.entropy(dataset.features[:, sensitive_index]))
    res.append(drv.entropy(dataset.labels[:, 0]))
    entropy_feats = []
    for i in range(0, dataset.features.shape[1]):
        if i == sensitive_index:
            continue
        entropy_feats.append(drv.entropy(dataset.features[:, i]))
    entropy_feats.sort(reverse=True)
    res += entropy_feats[:top_n]
    res += entropy_feats[-top_n:]

    # Conditional entropies
    res.append(drv.entropy_conditional(dataset.features[:, sensitive_index],
                                       dataset.labels[:, 0]))
    res.append(drv.entropy_conditional(dataset.labels[:, 0],
                                       dataset.features[:, sensitive_index]))

    cond_entropy_A = []
    cond_entropy_B = []
    for i in range(0, dataset.features.shape[1]):
        if i == sensitive_index:
            continue
        cond_entropy_A.append(drv.entropy_conditional(dataset.features[:, sensitive_index],
                                                      dataset.features[:, i]))
        cond_entropy_B.append(drv.entropy_conditional(dataset.features[:, i],
                                                      dataset.features[:, sensitive_index]))
    cond_entropy_A.sort(reverse=True)
    cond_entropy_B.sort(reverse=True)
    res += cond_entropy_A[:top_n]
    res += cond_entropy_A[-top_n:]
    res += cond_entropy_B[:top_n]
    res += cond_entropy_B[-top_n:]

    cond_entropy_A = []
    cond_entropy_B = []
    for i in range(0, dataset.features.shape[1]):
        if i == sensitive_index:
            continue
        cond_entropy_A.append(drv.entropy_conditional(dataset.labels[:, 0],
                                                      dataset.features[:, i]))
        cond_entropy_B.append(drv.entropy_conditional(dataset.features[:, i],
                                                      dataset.labels[:, 0]))
    cond_entropy_A.sort(reverse=True)
    cond_entropy_B.sort(reverse=True)
    res += cond_entropy_A[:top_n]
    res += cond_entropy_A[-top_n:]
    res += cond_entropy_B[:top_n]
    res += cond_entropy_B[-top_n:]

    for i in range(0, len(res)):
        res[i] = float(res[i])
    return res
def _jmim(selected_feature, feature_set, num_to_select, labels, score_list):
    ###
    # I(x,y;c) = H(x|c) - [ H(x,c,y) - H(c,y) ] + I(y;c)
    ###
    start = datetime.datetime.now()
    col = list(feature_set)
    pool = []
    for i in col:
        candidate_f = feature_set[i]
        candidate_f = np.reshape(candidate_f.values, (1, -1))
        min_jmi = 1000000000
        min_feature = []
        index = 0
        I_xy_c = 0
        for sf_packge in selected_feature:
            # print('round start at ' + str(datetime.datetime.now()))
            sf = sf_packge[1]
            sf_idx = sf_packge[0]
            I_yc = score_list.iloc[sf_idx, 1]
            sf = np.reshape(sf, (1, -1))
            labels = np.reshape(labels, (1, -1))

            H_c = drv.entropy(labels)
            H_x_c = drv.entropy_conditional(candidate_f, labels)

            xcy = np.append([candidate_f, labels], [sf], axis=0)
            H_xcy = drv.entropy_joint(xcy)

            cy = np.append([labels], [sf], axis=0)
            H_cy = drv.entropy_joint(cy)
            H_y_c = drv.entropy_conditional(sf, labels)
            H_cy2 = H_y_c + H_c

            I_xy_c = H_x_c - (H_xcy - H_cy) + I_yc
            labels = np.reshape(labels, (-1, 1))

            if I_xy_c < min_jmi:
                min_jmi = I_xy_c
                min_feature = candidate_f
                index = int(i)
            # print(I_xy_c)
            if I_xy_c < 0:
                print()

        pool.append([index, min_feature, min_jmi])
        # print('round end at ' + str(datetime.datetime.now()))

    max_candidate_score = 0
    max_candidate_idx = 0
    max_candidate = []
    for candidate in pool:
        if float(candidate[2]) > max_candidate_score:
            max_candidate = candidate[1]
            max_candidate_idx = candidate[0]
            max_candidate_score = float(candidate[2])

    selected_feature.append([max_candidate_idx, max_candidate, max_candidate_score])
    feature_set.drop(columns=[str(max_candidate_idx)], inplace=True)
    print(str(len(selected_feature)) + ' ' + str(max_candidate_idx) + ' ' +
          str(max_candidate_score) + ' at ' + str(datetime.datetime.now() - start))
    return selected_feature, feature_set