def test_mutual_info_classif_mixed():
    """Check MI ranking and n_neighbors sensitivity on mixed features.

    The target is discrete. Columns 0 and 1 of ``X`` are continuous and
    column 2 is binarised after the target has been derived from it, so it
    is passed to the estimator as a discrete feature.
    """
    rng = check_random_state(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    # The target must be computed from the raw column 2, i.e. before that
    # column is thresholded in place on the next line.
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    def estimate(k):
        # Same data, same seed; only the neighbour count varies.
        return mutual_info_classif(
            X, y, discrete_features=[2], n_neighbors=k, random_state=0
        )

    baseline = estimate(3)
    assert_array_equal(np.argsort(-baseline), [2, 0, 1])

    for k in (5, 7, 9):
        current = estimate(k)
        # Both continuous columns are expected to score higher once more
        # neighbours are used.
        for column in (0, 1):
            assert current[column] > baseline[column]
        # The discrete column must be unaffected by n_neighbors: its MI is
        # required to be exactly the same.
        assert current[2] == baseline[2]
def test_mutual_info_classif_discrete():
    """Check the MI ranking of three all-discrete features.

    The expected order is column 0 first (most informative), then
    column 2, with column 1 last (only weakly informative).
    """
    features = np.array(
        [
            [0, 0, 0],
            [1, 1, 0],
            [2, 0, 1],
            [2, 0, 1],
            [2, 0, 1],
        ]
    )
    labels = np.array([0, 1, 2, 2, 1])

    scores = mutual_info_classif(features, labels, discrete_features=True)

    # Negate so that argsort yields indices from highest to lowest MI.
    ranking = np.argsort(-scores)
    assert_array_equal(ranking, np.array([0, 2, 1]))
def test_mutual_info_classif_mixed():
    """Check the MI ranking on two continuous and one discrete feature.

    The target is discrete. Columns 0 and 1 of ``X`` are continuous;
    column 2 is binarised after the target has been derived from it and is
    passed to the estimator as a discrete feature.
    """
    # Use a private generator instead of np.random.seed(0): it produces the
    # same sample stream but does not reseed the process-wide NumPy RNG,
    # so this test cannot influence (or be influenced by) other tests.
    rng = np.random.RandomState(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    # The target must be computed from the raw column 2, i.e. before that
    # column is thresholded in place on the next line.
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(X, y, discrete_features=[2], random_state=0)
    # Expected order from most to least informative: columns 2, 0, 1.
    assert_array_equal(np.argsort(-mi), [2, 0, 1])
def test_mutual_info_classif_mixed():
    """Check the MI ranking on two continuous and one discrete feature.

    The target is discrete. Columns 0 and 1 of ``X`` are continuous;
    column 2 is binarised after the target has been derived from it and is
    passed to the estimator as a discrete feature.
    """
    # Use a private generator instead of np.random.seed(0): it produces the
    # same sample stream but does not reseed the process-wide NumPy RNG,
    # so this test cannot influence (or be influenced by) other tests.
    rng = np.random.RandomState(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    # The target must be computed from the raw column 2, i.e. before that
    # column is thresholded in place on the next line.
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(X, y, discrete_features=[2], random_state=0)
    # Expected order from most to least informative: columns 2, 0, 1.
    assert_array_equal(np.argsort(-mi), [2, 0, 1])
def test_mutual_info_classif_discrete():
    """Check the MI ranking of three all-discrete features.

    The expected order is column 0 first (most informative), then
    column 2, with column 1 last (only weakly informative).
    """
    rows = [
        [0, 0, 0],
        [1, 1, 0],
        [2, 0, 1],
        [2, 0, 1],
        [2, 0, 1],
    ]
    X = np.array(rows)
    y = np.array([0, 1, 2, 2, 1])

    mi = mutual_info_classif(X, y, discrete_features=True)

    # Negate so that argsort yields indices from highest to lowest MI.
    expected_order = np.array([0, 2, 1])
    assert_array_equal(np.argsort(-mi), expected_order)
def test_mutual_info_classif_mixed():
    """Check MI ranking and n_neighbors sensitivity on mixed features.

    The target is discrete. Columns 0 and 1 of ``X`` are continuous;
    column 2 is binarised after the target has been derived from it and is
    passed to the estimator as a discrete feature.
    """
    rng = check_random_state(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    # The target must be computed from the raw column 2, i.e. before that
    # column is thresholded in place on the next line.
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(
        X, y, discrete_features=[2], n_neighbors=3, random_state=0
    )
    assert_array_equal(np.argsort(-mi), [2, 0, 1])

    for n_neighbors in [5, 7, 9]:
        mi_nn = mutual_info_classif(
            X, y, discrete_features=[2], n_neighbors=n_neighbors, random_state=0
        )
        # Plain asserts replace the nose-style assert_greater/assert_equal
        # helpers; pytest reports the compared values on failure.
        # Check that the continuous features have a higher MI with a
        # greater n_neighbors.
        assert mi_nn[0] > mi[0]
        assert mi_nn[1] > mi[1]
        # n_neighbors should not have any effect on the discrete feature:
        # its MI must be the same.
        assert mi_nn[2] == mi[2]
def get_mutual_information(inputs, targets, token2idx, stop_words = None, mask_token = None):
    """Return a per-token mutual-information score against ``targets``.

    ``inputs`` is converted to a sparse count matrix with one column per
    entry of ``token2idx``, scored with ``mutual_info_classif``, and the
    scores of the mask token and of any known stop words are forced to
    zero. A constant ``1e-9`` is then added to every score.

    Parameters
    ----------
    inputs :
        Object with a ``.shape`` attribute accepted by
        ``convert_X_to_ijv_format``; ``inputs.shape[0]`` is used as the
        number of rows. (Presumably a documents-by-tokens array; confirm
        against the helper.)
    targets :
        Class labels passed straight to ``mutual_info_classif``.
    token2idx : dict
        Maps a token to its column index; its length fixes the number of
        columns of the count matrix.
    stop_words : iterable of tokens, optional
        Tokens whose score is zeroed. Tokens absent from ``token2idx`` are
        ignored. ``None`` (the default) means no stop words.
    mask_token : token, optional
        Token whose score is zeroed. ``None`` (the default) means there is
        no mask token. A non-``None`` token missing from ``token2idx``
        raises ``KeyError``.

    Returns
    -------
    The score array returned by ``mutual_info_classif``, modified in place
    as described above.
    """
    # Build the sparse (row, col, value) triplets and assemble a CSC matrix.
    data, row, col = convert_X_to_ijv_format(inputs)
    counts = csc_matrix((data, (row, col)), shape = (inputs.shape[0], len(token2idx)))

    # NOTE: a TF-IDF reweighting of ``counts`` (l2 norm, smoothed idf,
    # sublinear tf) used to be sketched here but was disabled; the raw
    # counts are scored directly.
    mi = mutual_info_classif(counts, targets)

    # The defaults used to crash (``token2idx[None]`` / iterating ``None``);
    # treat ``None`` as "nothing to suppress" instead.
    if mask_token is not None:
        mi[token2idx[mask_token]] = 0.0
    for stop_word in (stop_words if stop_words is not None else ()):
        if stop_word in token2idx:
            mi[token2idx[stop_word]] = 0.0

    print('Maximum mutual information:', np.max(mi))
    print('Minimum mutual information:', np.min(mi))

    # Small constant offset applied to every score, including the zeroed
    # ones (presumably so downstream consumers never see an exact zero;
    # confirm with callers).
    mi += 1e-9
    return mi