Example #1
def collect_data(ws: np.ndarray) -> Tuple[float, float]:

    ###############
    # val1: use all windows; this provides a control/reference against which to measure the difference to val2
    # (the difference should be 0.0 when using the toy corpus)
    ###############

    # val1
    x1 = ws[:, -2]  # all words
    y1 = ws[:, -2 + DISTANCE]  # neighbors
    val1i = drv.entropy_conditional(x1, y1).item() / drv.entropy(x1).item()

    ###############
    # val2: use only target windows
    # in theory, this should be invariant to the number of target types
    ###############

    # target windows
    row_ids = np.isin(ws[:, -2], target_ids)
    target_windows = ws[row_ids]

    # val2
    x2 = target_windows[:, -2]  # target
    y2 = target_windows[:, -2 + DISTANCE]  # neighbors
    val2i = drv.entropy_conditional(x2, y2).item() / drv.entropy(x2).item()

    print(f'{len(ws):>12,} | val1={val1i:.3f} val2={val2i:.3f}')

    return val1i, val2i
Example #2
def measure_vars1(mat: np.ndarray) -> Tuple[List[float], List[str]]:
    """measure info theory variables"""

    xis, yis = to_x_y(mat)

    mi = drv.information_mutual_normalised(xis, yis, norm_factor='XY')
    xy = np.vstack((xis, yis))
    je = drv.entropy_joint(xy)
    hxy = drv.entropy_conditional(xis, yis) / je  # H(X|Y) normalised by the joint entropy
    hyx = drv.entropy_conditional(yis, xis) / je  # H(Y|X) normalised by the joint entropy

    props = [mi, hxy, hyx]
    names = ['I(X;Y)', 'H(X|Y)', 'H(Y|X)']

    return props, names
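The three quantities returned above are tied together by the chain rule H(X,Y) = I(X;Y) + H(X|Y) + H(Y|X), which is why dividing the two conditional entropies by the joint entropy puts them on a common [0, 1] scale. A minimal sketch (toy data, standard pyitlib calls, not part of the original function) that checks the identity:

import numpy as np
from pyitlib import discrete_random_variable as drv

# toy discrete observations
rng = np.random.default_rng(0)
xis = rng.integers(0, 4, size=1000)
yis = rng.integers(0, 3, size=1000)

je = drv.entropy_joint(np.vstack((xis, yis)))    # H(X,Y)
mi = drv.information_mutual(xis, yis)            # I(X;Y)
hxy = drv.entropy_conditional(xis, yis)          # H(X|Y)
hyx = drv.entropy_conditional(yis, xis)          # H(Y|X)

# chain rule: H(X,Y) = I(X;Y) + H(X|Y) + H(Y|X)
assert np.isclose(je, mi + hxy + hyx)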
Example #3
def make_candidate(m: np.ndarray) -> Candidate:

    # convert matrix to RVs
    xs, ys = to_pyitlib_format(m)

    u, s, v = randomized_svd(m, n_components=m.shape[1])

    return Candidate(
        matrix=m,
        df=pd.DataFrame(data=to_columnar(m)),  # matrix in df format
        hxy=drv.entropy_conditional(xs, ys).round(2),
        hyx=drv.entropy_conditional(ys, xs).round(2),
        ami=adjusted_mutual_info_score(xs, ys,
                                       average_method="arithmetic").round(2),
        s1p=s[0] / s.sum(),
    )
Example #4
    def Hcond(self):
        """
        The conditional entropy matrix (i, j) = H(Xi | Xj).
        A pandas DataFrame or a 2D numpy array, depending on the dataset type.
        """
        if self._Hcond is None:
            # Old attempt to do it ourselves
            # (0) init H
            # nb_vars = len(df.columns)
            # H = np.empty((nb_vars, nb_vars), dtype=float)
            # (1) for each column compute the counts per value
            # (2) for each (i, j) pair compute the counts

            # Using pyitlib to compute H (hopefully efficiently).
            # Unfortunately this does not work with numpy arrays, so we convert to pandas. TODO report
            # note: we convert to string type to avoid a bug with ints. TODO...
            self._Hcond = drv.entropy_conditional(
                self.dataset_df.T.astype(str))

            # add the row/column headers
            if not self.is_nparray:
                self._Hcond = pd.DataFrame(self._Hcond,
                                           index=self.varnames,
                                           columns=self.varnames)

            # basic sanity check: should all be positive
            assert np.all(self._Hcond >= 0)
        return self._Hcond
Example #5
def conditional_entropy_in_dct(list_36_gray_img, list_64_gray_img):
    from scipy.fftpack import dct
    from pyitlib import discrete_random_variable as drv

    # Variables
    list_36_dct = []
    list_36_conditional_entropy = []
    chain_list_36_dct = []
    chain_list_36_origin = []

    # Convert each block to DCT-II
    for i in range(len(list_36_gray_img)):
        list_36_dct.append(dct(list_36_gray_img[i]))

    for i in range(len(list_36_dct)):
        chain_list_36_dct.append(np.concatenate(list_36_dct[i]).ravel())

    for i in range(len(list_36_gray_img)):
        chain_list_36_origin.append(
            np.concatenate(list_36_gray_img[i]).ravel())

    # Apply conditional entropy to each block/slice
    for i in range(len(chain_list_36_dct)):
        list_36_conditional_entropy.append(
            drv.entropy_conditional(chain_list_36_dct[i],
                                    chain_list_36_origin[i],
                                    base=np.exp(2)))

    return list_36_conditional_entropy
Example #6
    def calculate_weights(self, discretized_data: pd.DataFrame):
        """
        Calculate link strengths as the mutual information between each node and its parent(s), normalised by the node's entropy.
        """
        import bamt.utils.GraphUtils as gru
        if not all([
                i in ['disc', 'disc_num']
                for i in gru.nodes_types(discretized_data).values()
        ]):
            logger_network.error(
                f"calculate_weghts() method deals only with discrete data. Continuous data: "
                +
                f"{[col for col, type in gru.nodes_types(discretized_data).items() if type not in ['disc', 'disc_num']]}"
            )
        if not self.edges:
            logger_network.error(
                "Bayesian Network hasn't been fitted yet. Please add edges with the add_edges() method"
            )
        if not self.nodes:
            logger_network.error(
                "Bayesian Network hasn't been fitted yet. Please add nodes with the add_nodes() method"
            )
        weights = dict()

        for node in self.nodes:
            parents = node.cont_parents + node.disc_parents
            if not parents:  # the concatenation is a (possibly empty) list, never None
                continue
            y = discretized_data[node.name].values
            if len(parents) == 1:
                x = discretized_data[parents[0]].values
                LS_true = drv.information_mutual(X=y, Y=x)
                entropy = drv.entropy(X=y)
                weight = LS_true / entropy
                weights[(parents[0], node.name)] = weight
            else:
                for parent_node in parents:
                    x = discretized_data[parent_node].values
                    other_parents = [
                        tmp for tmp in parents if tmp != parent_node
                    ]
                    z = list()
                    for other_parent in other_parents:
                        z.append(list(discretized_data[other_parent].values))
                    LS_true = np.average(
                        drv.information_mutual_conditional(
                            X=y, Y=x, Z=z, cartesian_product=True))
                    entropy = np.average(
                        drv.entropy_conditional(
                            X=y, Y=z, cartesian_product=True)) + 1e-8
                    weight = LS_true / entropy
                    weights[(parent_node, node.name)] = weight
        self.weights = weights
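To see the weight formula in isolation, here is a minimal sketch of the single-parent case (weight = I(node; parent) / H(node)) on hand-made toy arrays; the arrays and names below are assumptions for illustration, not bamt objects:

import numpy as np
from pyitlib import discrete_random_variable as drv

node_vals = np.array([0, 0, 1, 1, 2, 2, 0, 1])    # discretized node values (toy)
parent_vals = np.array([0, 0, 1, 1, 1, 1, 0, 0])  # discretized parent values (toy)

# single-parent case: link strength = I(node; parent) / H(node), in [0, 1]
weight = drv.information_mutual(node_vals, parent_vals) / drv.entropy(node_vals)
print(weight)  # 1.0 would mean the parent fully determines the node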
Example #7
    def theils_u(self, x, y):
        s_xy = drv.entropy_conditional(x, y)  # H(X|Y)
        s_x = drv.entropy(x)  # H(X), estimated directly from the realizations of x

        if s_x == 0:
            return 1

        else:
            return (s_x - s_xy) / s_x
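Theil's U is the normalised reduction in uncertainty U(X|Y) = (H(X) - H(X|Y)) / H(X). A quick standalone check with toy arrays (not tied to the class above):

import numpy as np
from pyitlib import discrete_random_variable as drv

x = np.array([0, 0, 1, 1, 2, 2])
y = np.array([0, 0, 1, 1, 1, 1])  # y narrows x down but does not determine it

h_x = drv.entropy(x)                         # H(X)
h_x_given_y = drv.entropy_conditional(x, y)  # H(X|Y)
print((h_x - h_x_given_y) / h_x)             # ~0.58, between 0 (independent) and 1 (y determines x)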
Example #8
def compute_norm_cond_entropy_corr(data_df, attrs_from, attrs_to):
    """
    Computes the correlations between attributes by calculating
    the normalized conditional entropy between them. The conditional
    entropy is asymmetric, therefore we need pairwise computation.

    The computed correlations are stored in a dictionary in the format:
    {
      attr_a: { cond_attr_i: corr_strength_a_i,
                cond_attr_j: corr_strength_a_j, ... },
      attr_b: { cond_attr_i: corr_strength_b_i, ...}
    }

    :return a dictionary of correlations
    """
    corr = {}
    # Compute pair-wise conditional entropy.
    for x in attrs_from:
        corr[x] = {}
        for y in attrs_to:
            # Set correlation to 1 for same attributes.
            if x == y:
                corr[x][y] = 1.0
                continue

            xy_df = data_df[[x, y]]
            xy_df = xy_df.loc[~(xy_df[x] == NULL_REPR)
                              & ~(xy_df[y] == NULL_REPR)]
            x_vals = xy_df[x]
            x_domain_size = x_vals.nunique()

            # Set correlation to 0.0 if x has only one possible value (zero entropy) or there is no data.
            if x_domain_size == 1 or len(xy_df) == 0:
                corr[x][y] = 0.0
                continue

            # Compute the conditional entropy H(x|y) = H(x,y) - H(y).
            # H(x,y) denotes H(x U y).
            # If H(x|y) = 0, then y determines x, i.e., y -> x.
            # Use the domain size of x as a log base for normalization.
            y_vals = xy_df[y]

            x_y_entropy = drv.entropy_conditional(x_vals,
                                                  y_vals,
                                                  base=x_domain_size).item()

            # The conditional entropy is 0 for strongly correlated attributes and 1 for
            # completely independent attributes. We reverse this to reflect the correlation.
            corr[x][y] = 1.0 - x_y_entropy
    return corr
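A small usage sketch of the same idea outside the function (toy DataFrame, column names made up; the NULL_REPR filtering step is omitted because the toy table has no nulls): when y functionally determines x, H(x|y) is 0 and the reported correlation is 1.0.

import pandas as pd
from pyitlib import discrete_random_variable as drv

df = pd.DataFrame({
    'zip':  ['10001', '10001', '94105', '94105'],
    'city': ['NYC', 'NYC', 'SF', 'SF'],
})

# H(city | zip), normalised by using the domain size of city as the log base
ce = drv.entropy_conditional(df['city'], df['zip'], base=df['city'].nunique()).item()
print(1.0 - ce)  # 1.0: zip determines city in this toy table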
Example #9
def NTE_Measure(x, y, maxlen):
    p = 0
    n = len(y)
    settings = {}
    settings['history_target'] = maxlen
    settings['tau_sources'] = 2
    net = JidtGaussianTE(settings)
    for i in range(30):
        idx = np.random.permutation(len(x))  # shuffle x to build a surrogate baseline
        x_shuffle = x[idx]
        p = p + net.estimate(x_shuffle, y)
    p = p / 30
    texy = net.estimate(x, y)
    info = drv.entropy_conditional(y[1:n], y[0:n - 1], estimator="MINIMAX")
    return (texy - p) / info
Example #10
def _jmi(feature_set, labels, score_list):
    ###
    # I(x,y;c) = H(x,c) - H(c) - [ H(x,c,y) - H(c,y) ] + I(y;c) #
    ###
    results = []
    col = list(feature_set)
    labels = np.reshape(labels, (1, -1))
    for i in col:
        candidate_f = feature_set[i]
        candidate_f = np.reshape(candidate_f.values, (1, -1))
        print(i + ' start at ' + str(datetime.datetime.now()))
        for j in range(int(i)+1, 400):
            sf = feature_set[str(j)]
            sf = np.reshape(sf.values, (1,-1))

            I_yc = score_list.iloc[j, 0]
            H_x_c = drv.entropy_conditional(candidate_f, labels)

            xcy = np.append([candidate_f, labels], [sf], axis=0)
            H_xcy = drv.entropy_joint(xcy)

            # stack as two rows: one variable per row, observations along columns
            cy = np.vstack((labels, sf))
            H_cy = drv.entropy_joint(cy)

            I_xy_c = float(H_x_c - (H_xcy - H_cy) + I_yc)
            if I_xy_c < 0:
                print(f'warning: negative score for pair ({i}, {j}): {I_xy_c:.4f}')
            results.append([[i, j], I_xy_c])

        # checkpoint results after each candidate feature
        df = pd.DataFrame(results)
        df.to_csv('jmi_2packets.csv')
        print('end at ' + str(datetime.datetime.now()))
    df = pd.DataFrame(results)
    df.to_csv('jmi_2packets.csv')
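The comment above defines the target quantity as I(x,y;c). As a point of reference, this joint mutual information can also be obtained directly from three entropy terms, since I(X,Y;C) = H(X,Y) + H(C) - H(X,Y,C); a minimal sketch on toy arrays (not a drop-in replacement for the feature-selection loop):

import numpy as np
from pyitlib import discrete_random_variable as drv

x = np.random.randint(0, 3, 500)
y = np.random.randint(0, 3, 500)
c = np.random.randint(0, 2, 500)

h_xy = drv.entropy_joint(np.vstack((x, y)))      # H(X,Y)
h_c = drv.entropy(c)                             # H(C)
h_xyc = drv.entropy_joint(np.vstack((x, y, c)))  # H(X,Y,C)

print(h_xy + h_c - h_xyc)  # I(X,Y ; C)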
Example #11
        num_entropic_observations = N // fraction
        num_remaining_observations = N - num_entropic_observations

        xs = [np.random.choice(vocab_x, size=N, p=p_x) for _ in range(2)]
        ys = [
            np.hstack((
                np.random.choice(
                    vocab_y[:x_tick], size=num_entropic_observations, p=None
                ),  # no probabilities because these contexts are entropic
                np.random.choice(vocab_y[x_tick:],
                                 size=num_remaining_observations,
                                 p=p_y[x_tick:] / p_y[x_tick:].sum())))
            for _ in range(2)
        ]

        raw = np.mean([drv.entropy_conditional(x, y)
                       for x, y in zip(xs, ys)])  # raw is never biased

        fraction2xys_age1[fraction].append(raw)

# no entropic contexts - and larger shape (simulates age group 2)

if RESPECT_SHAPE_DIFF:
    num_rows_age2 = NUM_ROWS + 30
    num_cols_age2 = NUM_ROWS + 600
else:
    num_rows_age2 = NUM_ROWS + 0
    num_cols_age2 = NUM_ROWS + 0

xs = [np.random.choice(vocab_x, size=N, p=p_x) for _ in range(2)]
ys = [np.random.choice(vocab_y, size=N, p=p_y) for _ in range(2)]
Example #12
    # get outcomes - the words that occur in the last 2 slots of probe windows
    x_windows = get_windows(prep, corpus.x, col_id=-3)
    cx, ry, cx_ry = get_outcomes(prep, x_windows)

    # make co-occurrence matrix
    cf_mat = np.ones((corpus.num_y, corpus.num_x))
    for cxi, ryi in zip(cx, ry):
        cf_mat[corpus.y.index(ryi), corpus.x.index(cxi)] += 1

    # make co-occurrence plot
    if SHOW_HEATMAP:
        print(np.max(cf_mat))
        print(np.min(cf_mat))
        fig, ax = make_heatmap_fig(cf_mat)
        ce = drv.entropy_conditional(cx, ry).item()
        je = drv.entropy_joint(cx_ry).item()
        ye = drv.entropy_joint(ry).item()
        plt.title(
            f'Toy Corpus\nH(x-word|y-word)={ce:.4f}\nH(x-word,y-word)={je:.4f}\nH(y-word)={ye:.4f}'
        )
        plt.show()

    # collect singular values for plotting
    cf_mat_intact = scale(cf_mat, axis=1, with_std=False, with_mean=False)
    cf_mat_scaled = scale(cf_mat, axis=1, with_std=False,
                          with_mean=True)  # subtracting mean from rows
    s_intact = np.linalg.svd(cf_mat_intact, compute_uv=False)
    s_scaled = np.linalg.svd(cf_mat_scaled, compute_uv=False)
    s_list_intact.append(np.asarray(s_intact[:NUM_S_DIMS]))
    s_list_scaled.append(np.asarray(s_scaled[:NUM_S_DIMS]))
Example #13
def measure_dvs(
    params: Params,
    co_data: CoData,
) -> Dict[str, float]:
    """
    collect all DVs in a single condition.

    a condition is a specific configuration of IV realizations
    """

    res = {}

    co_mat_coo: sparse.coo_matrix = co_data.as_matrix(params.direction)
    co_mat_csr: sparse.csr_matrix = co_mat_coo.tocsr()

    # save for offline analysis
    path_to_pkl = configs.Dirs.co_data / f'co_data_age={params.age}' \
                                         f'_punct={params.punctuation}' \
                                         f'_contr={params.targets_control}' \
                                         f'_lemma={params.lemmas}.pkl'
    with path_to_pkl.open('wb') as f:
        pickle.dump(co_data, f)

    # type and token frequency
    res['x-tokens'] = (co_mat_coo.sum().item() // 2
                       if params.direction == 'b'
                       else co_mat_coo.sum().item())
    res['x-types'] = co_mat_coo.shape[0]
    res['y-types'] = co_mat_coo.shape[1]

    # normalize columns
    if params.normalize_cols:
        co_mat_csr = normalize(co_mat_csr, axis=1, copy=False)
        print(co_mat_csr.sum())

    # svd
    # don't use sparse svd: doesn't result in accurate reconstruction.
    # don't normalize before svd: otherwise relative differences between rows and columns are lost
    u, s, vt = np.linalg.svd(co_mat_csr.toarray(), compute_uv=True)
    assert np.max(s) == s[0]
    res['s1/sum(s)'] = s[0] / np.sum(s)
    res['frag'] = 1 - (s[0] / np.sum(s))

    # info theory analysis
    if params.direction == 'b':
        xs, ys, zs = co_data.get_x_y_z()
        xyz = np.vstack((xs, ys, zs))
        xyz_je = drv.entropy_joint(xyz)
        nii = drv.information_interaction(xyz).item() / xyz_je
    else:
        nii = np.nan  # need 3 rvs to compute interaction information
    xs, ys = co_data.get_x_y(params.direction)
    xy = np.vstack((xs, ys))
    xy_je = drv.entropy_joint(xy)

    # compute entropy on permuted data for de-biasing estimates
    bias_xy = np.mean([
        drv.entropy_conditional(np.random.permutation(xs),
                                np.random.permutation(ys),
                                base=2).item() for _ in range(10)
    ])
    bias_yx = np.mean([
        drv.entropy_conditional(np.random.permutation(ys),
                                np.random.permutation(xs),
                                base=2).item() for _ in range(10)
    ])
    print(f'bias_xy={bias_xy:.4f}')
    print(f'bias_yx={bias_yx:.4f}')

    xy_ce = drv.entropy_conditional(xs, ys).item()
    yx_ce = drv.entropy_conditional(ys, xs).item()
    res[' xy'] = xy_ce  # biased
    res[' yx'] = yx_ce
    res['dxy'] = bias_xy - xy_ce  # de-biased
    res['dyx'] = bias_yx - yx_ce
    res['nxy'] = xy_ce / xy_je  # biased + normalized
    res['nyx'] = yx_ce / xy_je
    # res['nii'] = nii
    # res['nmi'] = drv.information_mutual_normalised(xs, ys, norm_factor='XY').item()
    # res['ami'] = adjusted_mutual_info_score(xs, ys, average_method="arithmetic")
    # res[' je'] = xy_je

    # round
    for k, v in res.items():
        if isinstance(v, float):
            res[k] = round(v, 3)

    if configs.Fig.max_projection > 0:
        plot_reconstructions(co_mat_coo,
                             params,
                             max_dim=configs.Fig.max_projection)

    # which row or column is most active in projection on first singular dim?
    # note: if lemmas=True, row words may include experimental targets
    # because lemmas of control target plural nouns are singular nouns
    row_words, col_words = co_data.get_words_ordered_by_id(params.direction)
    if len(row_words) != co_mat_csr.shape[0]:
        raise RuntimeError(
            f'Number of row words ({len(row_words)}) != Number of rows ({co_mat_csr.shape[0]})'
        )
    if len(col_words) != co_mat_csr.shape[1]:
        raise RuntimeError(
            f'Number of column words ({len(col_words)}) != Number of columns ({co_mat_csr.shape[1]})'
        )
    projection1 = calc_projection(u, s, vt, 0)
    max_row_id = np.argmax(projection1.sum(axis=1))
    max_col_id = np.argmax(projection1.sum(axis=0))
    print(
        f'Word with largest sum={np.max(projection1.sum(axis=1))} in first projection row="{row_words[max_row_id]}"'
    )
    print(
        f'Word with largest sum={np.max(projection1.sum(axis=0))} in first projection col="{col_words[max_col_id]}"'
    )

    return res
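The permutation loop in measure_dvs builds a shuffle baseline: after permuting xs and ys independently the two variables are (near) independent, so the conditional entropy estimated on the shuffled data serves as a reference from which the observed H(X|Y) is subtracted. A minimal sketch of that step in isolation, on toy data with a strong dependency:

import numpy as np
from pyitlib import discrete_random_variable as drv

xs = np.random.randint(0, 10, 1000)
ys = xs % 3  # ys is a deterministic function of xs, so H(X|Y) is well below H(X)

observed = drv.entropy_conditional(xs, ys, base=2).item()
baseline = np.mean([
    drv.entropy_conditional(np.random.permutation(xs),
                            np.random.permutation(ys),
                            base=2).item()
    for _ in range(10)
])
print(baseline - observed)  # large positive gap: dependence well beyond the shuffle baseline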
Example #14
print('joint entropy')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        xy = np.vstack((xs, ys))
        res_i = drv.entropy_joint(xy)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')

print('conditional entropy x|y')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        res_i = drv.entropy_conditional(xs, ys)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')

print('conditional entropy y|x')
for num_x, num_y in SHAPES:
    res = []
    for _ in range(50):
        xs = np.random.randint(0, num_x, N)
        ys = np.random.randint(0, num_y, N)
        res_i = drv.entropy_conditional(ys, xs)
        res.append(res_i)
    print(f'{np.mean(res):.4f}')
Example #15
def info_gain(feature_vals: np.ndarray, y_vals: np.ndarray) -> float:
    h_y = drv.entropy(y_vals)
    h_y_given_x = drv.entropy_conditional(y_vals, feature_vals)
    return h_y - h_y_given_x
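A quick worked check of the information-gain formula H(Y) - H(Y|X): with pyitlib's default base of 2, a balanced binary label that is perfectly predicted by the feature yields a gain of exactly 1 bit (H(Y) = 1, H(Y|X) = 0). Toy arrays only:

import numpy as np
from pyitlib import discrete_random_variable as drv

feature = np.array([0, 0, 1, 1])
labels = np.array([0, 0, 1, 1])  # perfectly predicted by the feature

print(drv.entropy(labels) - drv.entropy_conditional(labels, feature))  # 1.0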
Example #16
def get_entropy(dataset, sensitive_attr, top_n=5):

    sensitive_index = dataset.feature_names.index(sensitive_attr)

    res = []

    #Independent entropy
    res.append(drv.entropy(dataset.features[:, sensitive_index]))
    res.append(drv.entropy(dataset.labels[:, 0]))
    entropy_feats = []
    for i in range(0, dataset.features.shape[1]):
        if i == sensitive_index:
            continue
        entropy_feats.append(drv.entropy(dataset.features[:, i]))
    entropy_feats.sort(reverse=True)
    res += entropy_feats[:top_n]
    res += entropy_feats[-top_n:]
    #Independent entropy

    # Conditional entropy
    res.append(
        drv.entropy_conditional(dataset.features[:, sensitive_index],
                                dataset.labels[:, 0]))
    res.append(
        drv.entropy_conditional(dataset.labels[:, 0],
                                dataset.features[:, sensitive_index]))

    cross_entropy_A = []
    cross_entropy_B = []
    for i in range(0, dataset.features.shape[1]):
        if i == sensitive_index:
            continue
        cross_entropy_A.append(
            drv.entropy_conditional(dataset.features[:, sensitive_index],
                                    dataset.features[:, i]))
        cross_entropy_B.append(
            drv.entropy_conditional(dataset.features[:, i],
                                    dataset.features[:, sensitive_index]))
    cross_entropy_A.sort(reverse=True)
    cross_entropy_B.sort(reverse=True)
    res += cross_entropy_A[:top_n]
    res += cross_entropy_A[-top_n:]
    res += cross_entropy_B[:top_n]
    res += cross_entropy_B[-top_n:]

    cross_entropy_A = []
    cross_entropy_B = []
    for i in range(0, dataset.features.shape[1]):
        if i == sensitive_index:
            continue
        cross_entropy_A.append(
            drv.entropy_conditional(dataset.labels[:, 0], dataset.features[:,
                                                                           i]))
        cross_entropy_B.append(
            drv.entropy_conditional(dataset.features[:, i], dataset.labels[:,
                                                                           0]))
    cross_entropy_A.sort(reverse=True)
    cross_entropy_B.sort(reverse=True)
    res += cross_entropy_A[:top_n]
    res += cross_entropy_A[-top_n:]
    res += cross_entropy_B[:top_n]
    res += cross_entropy_B[-top_n:]
    # Conditional entropy

    for i in range(0, len(res)):
        res[i] = float(res[i])

    return res
Example #17
def _jmim(selected_feature, feature_set, num_to_select, labels, score_list):
    ###
    # I(x,y;c) = H(x|c) - [ H(x,c,y) - H(c,y) ] + I(y;c) #
    ####
    start = datetime.datetime.now()
    col = list(feature_set)
    pool = []
    for i in col:
        candidate_f = feature_set[i]
        candidate_f = np.reshape(candidate_f.values, (1, -1))
        min_jmi = float('inf')  # smallest joint MI of this candidate with any already-selected feature
        min_feature = []
        index = 0
        I_xy_c = 0
        for sf_packge in selected_feature:
            # print('round start at ' + str(datetime.datetime.now()))
            sf = sf_packge[1]
            sf_idx = sf_packge[0]

            I_yc = score_list.iloc[sf_idx, 1]

            sf = np.reshape(sf, (1, -1))
            labels = np.reshape(labels, (1, -1))
            H_c = drv.entropy(labels)
            H_x_c = drv.entropy_conditional(candidate_f, labels)

            xcy = np.append([candidate_f, labels], [sf], axis=0)
            H_xcy = drv.entropy_joint(xcy)

            cy = np.append([labels], [sf], axis=0)
            H_cy = drv.entropy_joint(cy)
            H_y_c = drv.entropy_conditional(sf, labels)

            H_cy2 = H_y_c + H_c

            I_xy_c = H_x_c - (H_xcy - H_cy) + I_yc

            labels = np.reshape(labels, (-1, 1))
            if I_xy_c < min_jmi:
                min_jmi = I_xy_c
                min_feature = candidate_f
                index = int(i)
        # print(I_xy_c)
        if I_xy_c < 0:
            print(f'warning: negative score for feature {i}: {I_xy_c}')
        pool.append([index, min_feature, min_jmi])
        # print('round end at ' + str(datetime.datetime.now()))

    max_candidate_score = 0
    max_candidate_idx = 0
    max_candidate = []
    for candidate in pool:
        if float(candidate[2]) > max_candidate_score:
            max_candidate = candidate[1]
            max_candidate_idx = candidate[0]
            max_candidate_score = float(candidate[2])

    selected_feature.append(
        [max_candidate_idx, max_candidate, max_candidate_score])
    feature_set.drop(columns=[str(max_candidate_idx)], inplace=True)

    print(
        str(len(selected_feature)) + ' ' + str(max_candidate_idx) + ' ' +
        str(max_candidate_score) + ' at ' +
        str(datetime.datetime.now() - start))

    return selected_feature, feature_set