Example #1
def estimate_mutual_info():
    read_path = os.path.join(base_path, "data")
    filelist = os.listdir(read_path)
    for file in filelist:
        read_file = os.path.join(read_path, file)
        if os.path.isdir(read_file):
            continue
        dataset = np.loadtxt(read_file)
        target = dataset[:, -1]
        # feature scaling and normalization
        scaler = preprocessing.StandardScaler(copy=False)
        scaler.fit_transform(dataset[:, :-1])
        normalizer = preprocessing.Normalizer(norm='l2', copy=False)
        normalizer.fit_transform(dataset[:, :-1])
        # mutual_info of local features
        data_local = dataset[:, 10:-1]
        mi = mutual_info_classif(data_local,
                                 target,
                                 discrete_features='auto',
                                 copy=True,
                                 n_neighbors=3)
        write_path = os.path.join(base_path, "mutual_info_local")
        write_file = os.path.join(write_path, file)
        np.savetxt(os.path.splitext(write_file)[0] + '_m.txt', mi)
        # mutual_info of global features
        data_global = dataset[:, 0:10]
        mi = mutual_info_classif(data_global,
                                 target,
                                 discrete_features='auto',
                                 copy=True,
                                 n_neighbors=3)
        write_path = os.path.join(base_path, "mutual_info_global")
        write_file = os.path.join(write_path, file)
        np.savetxt(os.path.splitext(write_file)[0] + '_m.txt', mi)
def test_mutual_info_classif_mixed():
    # Here the target is discrete and there are two continuous and one
    # discrete feature. Features 0 and 2 drive the target directly, while
    # feature 1 is only informative through its correlation with feature 0.
    rng = check_random_state(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(X,
                             y,
                             discrete_features=[2],
                             n_neighbors=3,
                             random_state=0)
    assert_array_equal(np.argsort(-mi), [2, 0, 1])
    for n_neighbors in [5, 7, 9]:
        mi_nn = mutual_info_classif(X,
                                    y,
                                    discrete_features=[2],
                                    n_neighbors=n_neighbors,
                                    random_state=0)
        # Check that the continuous features have a higher MI with greater
        # n_neighbors
        assert mi_nn[0] > mi[0]
        assert mi_nn[1] > mi[1]
        # The n_neighbors should not have any effect on the discrete value
        # The MI should be the same
        assert mi_nn[2] == mi[2]
Example #3
def MI(x, y, random_state=None):
    """Get mutual information (MI)  between two pandas series.

    x,y: any numeric or categorical vectors.

    Return the mutual information between x and y.

    """
    # pandas's category encodes nan as -1 using integer coding
    is_categorical = lambda x: x.dtype.name == 'category'
    if is_categorical(x) and is_categorical(y):
        return mutual_info_classif(x.cat.codes.values.reshape(-1, 1),
                                   y.cat.codes,
                                   discrete_features=True,
                                   random_state=random_state)[0]
    elif is_categorical(x) and not is_categorical(y):
        return mutual_info_regression(x.cat.codes.values.reshape(-1, 1),
                                      y,
                                      discrete_features=True,
                                      random_state=random_state)[0]
    elif not is_categorical(x) and is_categorical(y):
        return mutual_info_classif(x.values.reshape(-1, 1),
                                   y.cat.codes,
                                   discrete_features=False,
                                   random_state=random_state)[0]
    else:
        # both x and y are numeric
        return mutual_info_regression(x.values.reshape(-1, 1),
                                      y,
                                      discrete_features=False,
                                      random_state=random_state)[0]
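For reference, a minimal usage sketch of the MI helper above (assuming pandas is available and mutual_info_classif / mutual_info_regression are imported from sklearn.feature_selection; the toy columns are made up purely for illustration):

import pandas as pd

toy = pd.DataFrame({
    "age": [23, 45, 31, 52, 40, 28],
    "group": pd.Series(["a", "b", "a", "b", "b", "a"], dtype="category"),
    "income": [30.0, 58.5, 41.2, 62.3, 55.1, 33.7],
})

# numeric x, categorical y -> routed to mutual_info_classif
print(MI(toy["age"], toy["group"], random_state=0))
# numeric x, numeric y -> routed to mutual_info_regression
print(MI(toy["age"], toy["income"], random_state=0))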
    def _calculate_mi(self, df, labels, discrete_feature_mask, seed):
        """Calls the sk-learn implementation of MI and stores results in dict.

        Args:
          df: A pd.DataFrame containing feature values where each column corresponds
            to a feature and each row corresponds to an example.
          labels: A list where the ith entry is the label for the ith example.
          discrete_feature_mask: A boolean list where the ith element is true iff
            the ith feature column in the input df is a categorical feature.
          seed: An int value to seed the RNG used in MI computation.

        Returns:
          Dict[FeatureName, Dict[str, float]] where the keys are feature names and
          each value is a dict with keys MUTUAL_INFORMATION_KEY and
          ADJUSTED_MUTUAL_INFORMATION_KEY mapping to the MI and AMI for that
          feature.
        """
        result = {}
        if self._label_feature_is_categorical:
            mi_per_feature = mutual_info_classif(
                df.values,
                labels,
                discrete_features=discrete_feature_mask,
                copy=True,
                random_state=seed)

            np.random.shuffle(labels)

            shuffled_mi_per_feature = mutual_info_classif(
                df.values,
                labels,
                discrete_features=discrete_feature_mask,
                copy=False,
                random_state=seed)
        else:
            mi_per_feature = mutual_info_regression(
                df.values,
                labels,
                discrete_features=discrete_feature_mask,
                copy=True,
                random_state=seed)

            np.random.shuffle(labels)

            shuffled_mi_per_feature = mutual_info_regression(
                df.values,
                labels,
                discrete_features=discrete_feature_mask,
                copy=False,
                random_state=seed)

        for i, (mi, shuffled_mi) in enumerate(
                zip(mi_per_feature, shuffled_mi_per_feature)):
            result[df.columns[i]] = {
                MUTUAL_INFORMATION_KEY: mi,
                ADJUSTED_MUTUAL_INFORMATION_KEY: mi - shuffled_mi
            }
        return result
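The adjusted MI above is simply the MI minus the MI obtained against shuffled labels, which estimates the chance-level score. A standalone sketch of that idea on synthetic data (the variable names here are illustrative only):

import numpy as np
from sklearn.feature_selection import mutual_info_classif

rng = np.random.RandomState(0)
X = rng.rand(500, 3)
y = (X[:, 0] > 0.5).astype(int)

mi = mutual_info_classif(X, y, random_state=0)
mi_shuffled = mutual_info_classif(X, rng.permutation(y), random_state=0)

adjusted_mi = mi - mi_shuffled  # subtract the chance-level MI per feature
print(adjusted_mi)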
def ComputeMIBtwVars(X, Y, rand):
    if X.shape[1] > 40:
        x1 = X[:, 0:7]
        x2 = X[:, 7:]
        s1 = mutual_info_classif(x1, Y, discrete_features=False, random_state=rand)
        s2 = mutual_info_classif(x2, Y, discrete_features=True, random_state=rand)
        return np.append(s1, s2)
    else:
        return mutual_info_classif(X, Y, discrete_features=False, random_state=rand)
Example #6
def featureselection(x_positives, x_negatives, y_positives, y_negatives):
    cv1 = CountVectorizer(stop_words='english',
                          min_df=2,
                          analyzer='word',
                          token_pattern=r'[a-zA-Z][a-zA-Z][a-zA-Z]*')
    x_pos = cv1.fit_transform(x_positives)

    cv2 = CountVectorizer(stop_words='english',
                          min_df=2,
                          analyzer='word',
                          token_pattern=r'\w\w+')
    x_neg = cv2.fit_transform(x_negatives)

    pos_features = dict(
        zip(cv1.get_feature_names(),
            mutual_info_classif(x_pos, y_positives, discrete_features=True)))
    neg_features = dict(
        zip(cv2.get_feature_names(),
            mutual_info_classif(x_neg, y_negatives, discrete_features=True)))

    cv = CountVectorizer(stop_words='english',
                         min_df=2,
                         analyzer='word',
                         token_pattern=r'[a-zA-Z][a-zA-Z][a-zA-Z]*')
    X = cv.fit_transform(x_positives + x_negatives)
    Y = y_positives + y_negatives
    feats = dict(
        zip(cv.get_feature_names(),
            mutual_info_classif(X, Y, discrete_features=True)))

    best = sorted(pos_features, key=pos_features.get, reverse=True)[:1000]
    worst = sorted(neg_features, key=neg_features.get, reverse=True)[:1000]

    bbest = sorted(feats, key=feats.get, reverse=True)[:1000]

    print('#' * 50)
    print('Best good features')
    print(bbest)
    print('#' * 50)
    # print('Best bad features')
    # print(worst)
    # print('#'*20)

    best_cv = CountVectorizer()
    best_cv.fit([*best, *worst])
    # best_cv.fit(best)
    x = best_cv.transform(x_positives + x_negatives).toarray()

    print('end of feature selection')
    print('#' * 20, )
    return x, best_cv
Example #7
def feature_importance_classification(features, target, n_neighbors=3, random_state=None):

    cont = features.select_dtypes(include=[np.floating])
    disc = features.select_dtypes(include=[np.integer, bool])

    cont_imp = pd.DataFrame(index=cont.columns)
    disc_imp = pd.DataFrame(index=disc.columns)

    # Continuous features
    if cont_imp.index.size > 0:

        # F-test
        f_test = feature_selection.f_classif(cont, target)
        cont_imp['f_statistic'] = f_test[0]
        cont_imp['f_p_value'] = f_test[1]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(cont, target, discrete_features=False,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        cont_imp['mutual_information'] = mut_inf

    # Discrete features
    if disc_imp.index.size > 0:

        # Chi²-test
        chi2_tests = defaultdict(dict)

        for feature in disc.columns:
            crosstab = pd.crosstab(disc[feature], target)
            statistic, p_value, _, _ = stats.chi2_contingency(crosstab)
            chi2_tests[feature]['chi2_statistic'] = statistic
            chi2_tests[feature]['chi2_p_value'] = p_value

        chi2_tests_df = pd.DataFrame.from_dict(chi2_tests, orient='index')
        disc_imp['chi2_statistic'] = chi2_tests_df['chi2_statistic']
        disc_imp['chi2_p_value'] = chi2_tests_df['chi2_p_value']

        # Cramér's V (corrected)
        disc_imp['cramers_v'] = [
            cramers_v_corrected_stat(pd.crosstab(feature, target).values)
            for _, feature in disc.items()
        ]

        # Mutual information
        mut_inf = feature_selection.mutual_info_classif(disc, target, discrete_features=True,
                                                        n_neighbors=n_neighbors,
                                                        random_state=random_state)
        disc_imp['mutual_information'] = mut_inf

    return cont_imp, disc_imp
    def information_gain(self):
        res = dict(
            zip(self.x_train.columns.values,
                mutual_info_classif(self.x_train, self.y_train)))
        print(res)
        objects = self.x_train.columns.values
        y_pos = np.arange(len(objects))
        performance = mutual_info_classif(self.x_train, self.y_train)

        # plt.figure(figsize=(80, 10))
        plt.bar(y_pos, performance, align='center', alpha=0.5)
        plt.xticks(y_pos, objects)
        plt.ylabel('Information Gain')
        plt.title('Programming language usage')
        plt.show()
Example #9
def main():
    pos, neg = getpaths()
    pos = [*map(lambda file: BeautifulSoup(open(file, encoding='utf-8'), 'lxml'), pos)]
    neg = [*map(lambda file: BeautifulSoup(open(file, encoding='utf-8'), 'lxml'), neg)]

    pos_docs = []
    for page in pos:
        _as = page.find_all(f)
        temp = ' '.join([*map(getrelevantinfo, _as)])
        pos_docs.append(temp)

    neg_docs = []
    for page in neg:
        _as = page.find_all(f)
        temp = ' '.join([*map(getrelevantinfo, _as)])
        neg_docs.append(temp)

    X = [*pos_docs, *neg_docs]
    Y = [1 for _ in range(100)] + [0 for _ in range(100)]

    cv = CountVectorizer(stop_words='english', min_df=2, analyzer='word', token_pattern=r'[a-zA-Z][a-zA-Z][a-zA-Z]*')
    _X = cv.fit_transform(X)

    selfeat = dict(zip(cv.get_feature_names(), mutual_info_classif(_X, Y, discrete_features=True)))
    print(sorted(selfeat, key=selfeat.get, reverse=True))
def get_scores_by_adding_selected_features(x_train,
                                           y_train,
                                           x_test,
                                           y_test,
                                           order_features,
                                           data_set_name,
                                           model_name=None,
                                           method='shap',
                                           max_features=25,
                                           plot=True):
    plot_name = f'{plots_folder}/{method}/{data_set_name}/scores_by_adding_selected_features_on_model_{model_name}.png'
    title_name = f'{method} data={data_set_name} model={model_name}'
    predictions, features_in_group = _add_selected_features_iteratively_train_and_predict(
        x_train, y_train, x_test, order_features, model_name, max_features)
    # scores = pd.DataFrame([log_loss(y_test, p) for p in predictions], columns=['log_los'])
    scores = pd.DataFrame()
    scores['accuracy'] = pd.Series([
        np.round(accuracy_score(y_test, p.idxmax(axis=1)), 3)
        for p in predictions
    ]).cummax()
    scores['mutual_info'] = pd.Series([
        np.round(
            mutual_info_classif(x_train[features],
                                y_train,
                                n_neighbors=len(features),
                                random_state=seed).sum(), 3)
        for features in features_in_group
    ]).cummax()
    scores.index.name = 'number of features'

    if plot:
        fig = px.line(scores, title=title_name)
        fig.update_traces(mode="markers+lines")
        fig.write_image(plot_name)
    return predictions, features_in_group
Example #11
def mutual_info(data):
    # fill missing values (forward fill)
    df = data.fillna(method='ffill')
    # convert the datetime columns to numeric (days since 2008-12-31)
    df['hospitalizeddate'] = ((df['hospitalizeddate'] - datetime.datetime(2008, 12, 31)) / np.timedelta64(1, 'D')).astype(int)
    df['leavedate'] = ((df['leavedate'] - datetime.datetime(2008, 12, 31)) / np.timedelta64(1, 'D')).astype(int)
    df_1 = df.copy()
    # encode string columns as integers
    str_cols = df_1.columns[df_1.dtypes == 'object']
    for col in str_cols:
        df_1[col] = LabelEncoder().fit_transform(df_1[col])
    # compute mutual information
    y = df_1['label']
    x = df_1.drop('label', axis=1)
    info = mutual_info_classif(x, y, copy=True, random_state=5)
    mi_map = {}
    name = x.columns
    for na, mi in zip(name, info):
        mi_map[na] = mi

    lista = []
    for key, value in mi_map.items():
        # print([key, value])
        if value < 0.05:
            lista.append(key)
    print(lista)
    # drop the variables that carry essentially no information about y
    df_2 = df.drop(lista, axis=1)
    return df_2
def get_features(raw_data, raw_ids, debug, run):
    """
    Estimate the information gain of a dataset: compute the mutual information
    of each feature with the labels, then use the average as the threshold for
    eliminating features.
    """

    # Create a classifier for the data
    m_info = mutual_info_classif(raw_data, raw_ids)

    # Get the average of the mutual information of each column
    avg = np.mean(m_info)

    # Set aside columns with more info than avg
    return_columns = []
    index = 0
    for feature in m_info:
        if feature >= avg:
            return_columns.append(index)
        index += 1

    debug += "information gain with avg: " + str(avg) + "\n"
    debug += "INFORMATION GAIN: Suggesting: " + str(
        len(return_columns)) + " columns out of " + str(len(
            raw_data.columns)) + "\n"
    return return_columns, debug
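A hypothetical call of get_features() above on toy data (assuming numpy, pandas, and mutual_info_classif are imported as the function expects; the DataFrame, labels, and the unused run argument are made up for illustration):

import numpy as np
import pandas as pd

toy_data = pd.DataFrame(np.random.rand(200, 5), columns=list("abcde"))
toy_labels = np.random.randint(0, 2, size=200)

kept_columns, debug_log = get_features(toy_data, toy_labels, debug="", run=None)
print(debug_log)
print("column indices kept:", kept_columns)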
Example #13
def mutual_info_categorical(dataframe):
    n_feature = len(dataframe.columns) - 1
    mi = mutual_info_classif(
        dataframe[dataframe.columns[:n_feature]],
        np.array(dataframe[dataframe.columns[n_feature:n_feature +
                                             1]]).reshape(-1))
    return mi
Example #14
def FS_IG(X_train, Y_train, X_test):
    """
    Feature selection using FS_IG

    Args:
        X (numpy array): aCGH data
        Y (numpy array): diagnosis data

    Returns:
        X_fil: filtered dataframe
    """

    # gets the gains vector
    gain_vec = mutual_info_classif(X_train, Y_train, discrete_features=True)

    # gets the indices of columns that can be deleted from the dataset
    delete_ind = gain_vec.argsort()[::-1][N_FEATURES:]
    for i in range(len(gain_vec)):
        if i not in delete_ind:
            CHOSEN_FEAUTURES.append(i)
    # deletes the features that can be deleted
    X_train_fil = np.delete(X_train, delete_ind, 1)
    X_test_fil = np.delete(X_test, delete_ind, 1)

    return X_train_fil, X_test_fil
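A hedged usage sketch for FS_IG() above. N_FEATURES and CHOSEN_FEAUTURES are module-level names in the original snippet; they are defined here only to make the example self-contained, and the data is random:

import numpy as np
from sklearn.feature_selection import mutual_info_classif

N_FEATURES = 10
CHOSEN_FEAUTURES = []

X_tr = np.random.randint(0, 2, size=(100, 200))
Y_tr = np.random.randint(0, 2, size=100)
X_te = np.random.randint(0, 2, size=(40, 200))

X_tr_fil, X_te_fil = FS_IG(X_tr, Y_tr, X_te)
print(X_tr_fil.shape, X_te_fil.shape, len(CHOSEN_FEAUTURES))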
Example #15
 def mutual_info_selection(self, X, y):
     # Wrapping sklearn mutual info clf enabling parameter config.
     return mutual_info_classif(X,
                                y,
                                discrete_features=False,
                                n_neighbors=self.num_neighbors,
                                random_state=self.random_state)
Example #16
 def mutual_info_select(self, F, y, threshold):
     mi = list(enumerate(mutual_info_classif(F, y)))
     f_best = []
     for (ind, rank) in mi:
         if rank > threshold:
             f_best.append(ind)
     return f_best
Example #17
 def compute_scoring_func(self, func):
     if func == 'variance':
         features = self.instances.features.get_values()
         annotations = self.instances.annotations.get_labels()
         return features.var(axis=0), None
     features = self.annotated_instances.features.get_values()
     annotations = self.annotated_instances.annotations.get_labels()
     if func == 'f_classif':
         return f_classif(features, annotations)
     elif func == 'mutual_info_classif':
         features_types = self.instances.features.info.types
         discrete_indexes = [
             i for i, t in enumerate(features_types)
             if t == FeatureType.binary
         ]
         if not discrete_indexes:
             discrete_indexes = False
         return (mutual_info_classif(features,
                                     annotations,
                                     discrete_features=discrete_indexes),
                 None)
     elif func == 'chi2':
         return chi2(features, annotations)
     else:
         assert (False)
Example #18
def GaIn(data):
    new_data = ""
    n_cols = data.shape[1]
    taille = n_cols // 4
    X_train = data.iloc[:, 0:n_cols - 1]
    Y_train = data.iloc[:, n_cols - 1]
    Y_train_ohe = pd.DataFrame(OneHotEncoder().fit_transform(
        Y_train.values.reshape(-1, 1)).toarray())
    GI = np.zeros((n_cols - 1, ), dtype='f')
    # average the mutual information over the one-hot encoded label columns
    for c in Y_train_ohe:
        GI += mutual_info_classif(X_train,
                                  Y_train_ohe[c]) / Y_train_ohe.shape[1]

    resultat = pd.DataFrame(GI, index=X_train.columns)
    resultat = resultat.sort_values(by=0, ascending=False)
    print("\n Feature ranking by information gain:")
    print(resultat)
    nbre_ss_ens = input("Enter the number of features to select: ")
    if IsInt(nbre_ss_ens):
        taille = int(nbre_ss_ens)
    new_data = pd.concat([X_train[resultat.index[0:taille]], Y_train],
                         axis=1,
                         sort=False)
    new_data.to_csv("./workspace/datax/datafeaGI.csv", index=False)
    print(new_data.head())
    return new_data
Example #19
    def compute_scoring_func(self, func):
        if func == 'variance':
            features = self.instances.features.get_values()
            annotations = self.instances.annotations.get_labels()
            if isinstance(features, spmatrix):
                variance = mean_variance_axis(features, axis=0)[1]
            else:
                variance = features.var(axis=0)
            return variance, None

        features = self.annotated_instances.features.get_values()
        annotations = self.annotated_instances.annotations.get_supervision(
            self.multiclass)
        if func == 'f_classif':
            return f_classif(features, annotations)
        elif func == 'mutual_info_classif':
            if isinstance(features, spmatrix):
                discrete_indexes = True
            else:
                features_types = self.instances.features.info.types
                discrete_indexes = [
                    i for i, t in enumerate(features_types)
                    if t == FeatureType.binary
                ]
                if not discrete_indexes:
                    discrete_indexes = False
            return (mutual_info_classif(features,
                                        annotations,
                                        discrete_features=discrete_indexes),
                    None)
        elif func == 'chi2':
            return chi2(features, annotations)
        else:
            assert (False)
def getMutualInfo(area1, area2, area3, area4, monkey):
    feature1 = getAllFeaturesVector(area1)
    feature2 = getAllFeaturesVector(area2)
    feature3 = getAllFeaturesVector(area3)
    feature4 = getAllFeaturesVector(area4)
    feature_matrix = np.concatenate((feature1, feature2, feature3, feature4))

    area1_y = [0] * len(area1)
    area2_y = [1] * len(area2)
    area3_y = [2] * len(area3)
    area4_y = [3] * len(area4)
    areas = area1_y + area2_y + area3_y + area4_y

    mi = mutual_info_classif(feature_matrix, areas, discrete_features=False)
    index1 = np.argmax(mi)
    old = mi[index1]
    mi[index1] = -1
    index2 = np.argmax(mi)

    mi[index1] = old

    plt.scatter(range(len(mi)), mi)
    plt.xlabel("feature index")
    plt.ylabel("mutual info")
    plt.title(monkey)
    plt.show()

    print(index1)
    print(index2)
Example #21
def get_feature_importances(X, y, importance_method='rf'):
    # X = X.drop(columns=additional_cols_to_drop)

    # selector = f_classif(X, y)
    # selector.fit(X, y)
    # scores = -np.log10(f_classif(X, y)[0])
    # return scores / max(scores)
    # X = X.drop(columns=additional_cols_to_drop)
    if importance_method == 'lda':
        lda = LinearDiscriminantAnalysis()
        lda.fit(X, y)
        scores = lda.coef_[0]
        importances = scores
    elif importance_method == 'rf':
        # return scores
        clf = ExtraTreesClassifier(n_estimators=500)
        clf.fit(X, y)
        importances = clf.feature_importances_
    elif importance_method == 'mutual_info':
        importances = feature_selection.mutual_info_classif(X, y, discrete_features=True)
    # Standardize importances
    importances = (importances-importances.min()) / (importances.max()-importances.min())
    # importances = StandardScaler().fit_transform([importances])
    # return pd.DataFrame(list(zip(X.columns, importances)), columns=['name', 'importance'])\
    importances = pd.DataFrame({'importance': pd.Series(importances, index=X.columns)})

    importances = importances.sort_values('importance', ascending=False)
    importances.importance_method = importance_method
    return importances
Example #22
    def _fit_language(self, X_unmapped: Sequence[str], X: Sequence[str],
                      Y: np.ndarray):
        cv = CountVectorizer(
            max_df=0.95,
            min_df=2,
            lowercase=False,
            ngram_range=(1, self.hyperparams.max_ngram),
            max_features=(self.hyperparams.max_vocab * 18),
            token_pattern='[a-zA-Z0-9$&+,:;=?@_/~#\\[\\]|<>.^*()%!-]+')

        X_vec = cv.fit_transform(trivial_generator(X))

        local_vocab = set()
        for feat in Y.columns:
            res = zip(
                cv.get_feature_names(),
                mutual_info_classif(X_vec, Y[feat], discrete_features=True))
            local_vocab.update(res)
        self.vocab = {
            i[0]
            for i in sorted(local_vocab, key=lambda i: i[1], reverse=True)
            [:self.hyperparams.max_vocab]
        }

        self._analyzer = cv.build_analyzer()
def get_features(raw_data, raw_ids):

    """
    Calculate the information gain of a dataset. This function takes three parameters:
    1. data = The dataset for whose feature the IG should be calculated
    2. split_attribute_name = the name of the feature for which the information gain should be calculated
    3. target_name = the name of the target feature. The default for this example is "class"
    """
    df = pd.DataFrame(raw_data)
    df["person"] = raw_ids

    return_columns = []
    cv = CountVectorizer(max_df=1, min_df=1,
                         max_features=72, stop_words='english')
    for column in df:
        if column != "person":
            X = df[column].astype(str)
            Y = df["person"].astype(str)
            X_vec = cv.fit_transform(X)
            ig = mutual_info_classif(X_vec, Y, discrete_features=True)
            total_ig = sum(ig)
            if total_ig > .5 and column != "person":
                return_columns.append(column)

    return return_columns
def filter_methods_classification(X, y, feat_names, rotation=False):

    angle = 0
    if rotation:
        angle = 90

    # do calculations
    f_test, _ = f_classif(X, y)
    f_test /= np.max(f_test)

    mi = mutual_info_classif(X, y)
    mi /= np.max(mi)

    # do some plotting
    plt.figure(figsize=(20, 4))

    plt.subplot(1, 2, 1)
    plt.bar(range(X.shape[1]), f_test, align="center")
    plt.xticks(range(X.shape[1]), feat_names, rotation=angle)
    plt.xlabel('features')
    plt.ylabel('Ranking')
    plt.title('$F-test$ score')

    plt.subplot(1, 2, 2)
    plt.bar(range(X.shape[1]), mi, align="center")
    plt.xticks(range(X.shape[1]), feat_names, rotation=angle)
    plt.xlabel('features')
    plt.ylabel('Ranking')
    plt.title('Mutual information score')

    plt.show()
def calculate_domain_mutual_info_scores(X_domain_matrix, y_labels):
	domain_mutual_info_scores = mutual_info_classif(X_domain_matrix, y_labels,random_state=8)
	domain_mutual_info_scores = dict(zip(X_domain_matrix.columns, domain_mutual_info_scores))
	sorted_domain_mutual_info_scores = sorted(domain_mutual_info_scores.items(), key=operator.itemgetter(1))
	sorted_domain_mutual_info_scores = pd.DataFrame(sorted_domain_mutual_info_scores)
	sorted_domain_mutual_info_scores.columns = ['domainID', 'MI-score']
	return(sorted_domain_mutual_info_scores)
def mutual_info(X, y):
    print('mutual information')
    features_names = X.columns.values.tolist()
    mi_score = mutual_info_classif(X, y)
    wscores = zip(features_names, mi_score)
    wmi = sorted(wscores, key=lambda x: x[1], reverse=True)
    print(wmi[:14])
def mutual_info(matrix, window=50):
    n = np.shape(matrix)[1]
    MImat = np.zeros((n, n))
    periods = []
    for i in range(0, int(np.floor(np.shape(matrix)[0] / window) - 1)):
        periods.append([i * window, (i + 1) * window])

    for i in range(0, n):
        for j in range(0, n):
            MI = []
            for p in periods:
                # print("i={}, j={},p={}".format(i,j,p))
                if i != j:
                    info = mutual_info_classif(matrix[p[0]:p[1], i].reshape(
                        -1, 1),
                                               matrix[p[0]:p[1], j],
                                               n_neighbors=5,
                                               discrete_features=True) * 1000
                    MI.append(info)
                else:
                    MI.append(0)

                # print("{},{}: {}".format(i,j,MI))
            MImat[i, j] = (np.quantile(MI, .75))
    return (MImat)
    def rmi(self,
            X_train,
            X_test,
            y_train,
            feat_names,
            min_rmi=None,
            retain_ratio=None,
            **kwargs):
        top_n = int(retain_ratio * len(feat_names)) if retain_ratio is not None else None
        if y_train.dtype != int:
            le = LabelEncoder()
            y_train = le.fit_transform(y_train).astype(int)

        entropy = self.discrete_entropy(y_train)
        rmi = np.array([
            i / entropy for i in list(
                feature_selection.mutual_info_classif(
                    X_train, y_train, random_state=0))
        ])

        sorted_rmi_i = np.argsort(rmi)[::-1]
        rmi = rmi[sorted_rmi_i]
        feat_names_sorted = np.array(feat_names)[sorted_rmi_i].tolist()

        indexes_to_retain = np.argwhere(
            rmi >= min_rmi).flatten() if min_rmi is not None else range(top_n)

        which_features = [feat_names_sorted[i] for i in indexes_to_retain]
        df = X_train.copy()

        print("### RMI ###")
        for i, feat in enumerate(feat_names_sorted):
            print(feat, rmi[i])

        return df[which_features]
Example #29
    def fit(self, X_unmapped, X, Y, max_vocab=18000,
            max_features_to_test=180000, window=8, dims=32, max_ngram=5):
        cv = CountVectorizer(
            max_df=0.95, min_df=2, lowercase=False, ngram_range=(1, max_ngram),
            max_features=max_features_to_test,
            token_pattern='[a-zA-Z0-9$&+,:;=?@_/~#\\[\\]|<>.^*()%!-]+')

        X_vec = cv.fit_transform(self._smiles_to_trivial_lang(X))

        local_vocab = set()
        for feat in Y.columns:
            res = zip(cv.get_feature_names(),
                      mutual_info_classif(
                          X_vec, Y[feat], discrete_features=True)
                      )
            local_vocab.update(res)
        self.vocab = {i[0] for i in sorted(
            local_vocab, key=lambda i: i[1], reverse=True)[:max_vocab]}

        self._analyzer = cv.build_analyzer()

        generator = self._make_iterator(X_unmapped, training=True)

        document_model = Doc2Vec(
            vector_size=dims, workers=cpu_count(), window=window)
        document_model.build_vocab(generator)
        document_model.train(
            generator, total_examples=len(X_unmapped), epochs=36)

        self.document_model = document_model
Example #30
def select_feature(x_train, x_test, y_train):
    """
	This function reduces the number of features from the existing 
	g.t 10,000 to something manageable.
	Based on experience with feature selection in homework 1, we do
	not expect the selection to result in improved performance. But
	we expect a reduction in run-time.

	No feature Run Time 
	GPA : 320.58s
	Grit : 280.71
	Hardship : 288.05
	layoff : 37.22

	Note : Code taken as is from homework 1 submission
	"""
    # feature selction-mutual info
    MIC = []
    # Mutual info criteria
    MIC = feature_selection.mutual_info_classif(x_train, y_train)
    # get most descriptive features (here called good features)
    good_features = []
    for k in range(len(MIC)):
        if MIC[k] > 0.1:  # Criteria for deciding that feature should be included
            good_features.append(k)
    # Adapt the training and testing matrices to good features
    x_train = x_train[:, good_features]
    x_test = x_test[:, good_features]
    print(len(good_features))
    return x_train, x_test
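An illustrative call of select_feature() above on synthetic arrays (assuming feature_selection has been imported from sklearn as the function expects; the data is random and purely for demonstration):

import numpy as np

x_tr = np.random.rand(300, 50)
y_tr = (x_tr[:, 0] + x_tr[:, 1] > 1.0).astype(int)
x_te = np.random.rand(100, 50)

x_tr_sel, x_te_sel = select_feature(x_tr, x_te, y_tr)
print(x_tr_sel.shape, x_te_sel.shape)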
Example #31
 def feature_selection(self,mode='F'):
     
     print('Feature Selection...')
     print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
     
     X=self.train.copy()
     y=self.train_label['label'].values.copy()
     
     test=self.test.copy()
     
     if mode.upper()=='M':
         mi=mutual_info_classif(X.values,y)
     elif mode.upper()=='F':
         F,pval=f_classif(X.values,y)
     elif mode.upper()=='C':
         chi,pval=chi2(X.values,y)
     
     features=self.train.columns.copy()
     
     fs_features=features.copy().tolist()
     
     if mode.upper()=='M':
         fs_V=mi.copy().tolist()
     elif mode.upper()=='F':
         fs_V=F.copy().tolist()
     elif mode.upper()=='C':
         fs_V=chi.copy().tolist()
     
     if mode.upper()=='M':
         selector=SelectPercentile(mutual_info_classif,percentile=80)
     elif mode.upper()=='F':
         selector=SelectPercentile(f_classif,percentile=80)
     elif mode.upper()=='C':
         selector=SelectPercentile(chi2,percentile=80)
         
     X_new=selector.fit_transform(X,y)
     
     selected=selector.get_support()
     
     for i in range(len(features)):
         if selected[i]==False:
             t=features[i]
             fs_features.remove(t)
             
     fs_V=np.array(fs_V)
     fs_features=np.array(fs_features)
     
     self.train=pd.DataFrame(X_new,columns=fs_features.tolist())
     self.test=test[fs_features]
     
     self.fs_features=fs_features
     
     feas=pd.DataFrame()
     feas['feature']=fs_features
     
     print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
     
     return X_new,feas
def select_sized_features(feature_size,fearure_vecotr,feature_indecies,y,feature_selection_measure):
    if feature_selection_measure == SelectionMeasure.chi_2:
        feature_values,p_value = chi2(fearure_vecotr,y)
    elif feature_selection_measure == SelectionMeasure.f:
        feature_values,p_value = f_classif(fearure_vecotr,y)
    else:
    # elif feature_selection_measure == SelectionMeasure.mutual_info:
        feature_values = mutual_info_classif(fearure_vecotr,y)

    feature_value_id_map = {}
    for i in range(len(feature_values)):
        feature_value_id_map[ feature_indecies[i] ] = feature_values[i]

    sorted_features = sorted(feature_value_id_map.items(),key=lambda x:x[1],reverse=True)
    selected_features = []
    for i in range(feature_size):
        if i >= len(sorted_features):
            continue
        selected_features.append( sorted_features[i][0] )

    return selected_features
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("feature_data_dir")
    parser.add_argument('--method','-m',type=int,default=1,choices=range(5),
        help=
        """choose methods from:
                0:linear_svc
                1:logistic regression
                2:naive bayes
                3:decision  tree
                4:random forest
        """)
    parser.add_argument('--feature_selection_measure','-fm',type=int,default=0,choices=list(map(int, SelectionMeasure)),
        help=
        """choose feature selection measure from:
                0:chi2
                1:f_classif
                2:mutual_info_classif
        """)
    parser.add_argument("--use_stanford_type","-us",action='store_true',
        help = 
        """When specified, the type information of Stanford NER
        is used as features
        """
    )
    parser.add_argument("--use_category","-uc",action='store_true',
        help = 
        """When specified, the category information from wikidata
        is used as features
        """
    )
    parser.add_argument("--no_words","-nw",action='store_true',
        help = 
        """When specified, no word features will be used
        """
    )
    parser.add_argument("--set_feature_size","-sf",type=int)
    args=parser.parse_args()

    feature_data = load_data_set(args.feature_data_dir)
    labels = []
    for single_data in feature_data:
        labels.append(single_data["judgement"])
    # chi2_values, pval = chi2(word_vector,label)
    # # print "There are %d chi values" %(len(chi2_values))
    # feature_value_id_map = {}
    # for i in range(len(chi2_values)):
    #     feature_value_id_map[ i] = chi2_values[i]

    # sorted_features = sorted(feature_value_id_map.items(),key=lambda x:x[1],reverse=True)
    # print "There are %d sorted_features" %(len(sorted_features))
    # print sorted_features
    if args.set_feature_size is not None:
        feature_size_vector = [args.set_feature_size]
    else: 
        feature_size_vector = [i*100 for i in range(1,40)]

    best_f1 = -1
    best_size = 0
    recall_atm = 0
    precision_atm = 0

    args.feature_selection_measure = SelectionMeasure(args.feature_selection_measure)
   
    for feature_size in feature_size_vector:
        print "For size %d" %(feature_size)

        clf = get_classifier(args.method)
        f1_vector = []
        skf = StratifiedKFold(n_splits=5, shuffle=True)
        for train, test in skf.split(feature_data, labels):
            test_X = []
            test_y = []
            #select word features
            sub_feature_data = []
            for i in train:
                sub_feature_data.append(feature_data[i])
                

            train_y,sub_word_indecies, sub_categories,sub_word_vector = prepare_data(sub_feature_data)

            
            if args.feature_selection_measure == SelectionMeasure.chi_2:

                feature_values, pval = chi2(sub_word_vector,train_y)
            elif args.feature_selection_measure == SelectionMeasure.f:
                feature_values, pval = f_classif(sub_word_vector,train_y)
            else:
                feature_values = mutual_info_classif(sub_word_vector,train_y)
            feature_value_id_map = {}
            for i in range(len(feature_values)):
                feature_value_id_map[ sub_word_indecies[i] ] = feature_values[i]

            sorted_features = sorted(feature_value_id_map.items(),key=lambda x:x[1],reverse=True)
            chosen_words = []
            for i in range(feature_size):
                if i >= len(sorted_features):
                    continue
                chosen_words.append( sorted_features[i][0] )

            X_new = []


            # add category and type features if needed
            for k in train:
                single_x = []
                single_data = feature_data[k]
                if not args.no_words:
                    for w in chosen_words:
                        if w in single_data["word_features"]:
                            single_x.append(single_data["word_features"][w])
                        else:
                            single_x.append(0)
                if args.use_category:
                    if single_data["category"]:
                        for c in sub_categories:
                            if c in single_data["category"]:
                                single_x.append(1)
                            else:
                                single_x.append(0)
                    else:
                        single_x += [0]*len(sub_categories)

                if args.use_stanford_type:
                    if "ORGANIZATION" in single_data["type"]:
                        single_x.append(1)
                    else:
                        single_x.append(0)
                    if "LOCATION" in single_data["type"]:
                        single_x.append(1)
                    else:
                        single_x.append(0)
                X_new.append(single_x)

            for k in test:
                single_x = []
                single_data = feature_data[k]
                if not args.no_words:
                    for w in chosen_words:
                        if w in single_data["word_features"]:
                            single_x.append(single_data["word_features"][w])
                        else:
                            single_x.append(0)
                if args.use_category:
                    if single_data["category"]:
                        for c in sub_categories:
                            if c in single_data["category"]:
                                single_x.append(1)
                            else:
                                single_x.append(0)
                    else:
                        single_x += [0]*len(sub_categories)

                if args.use_stanford_type:
                    if "ORGANIZATION" in single_data["type"]:
                        single_x.append(1)
                    else:
                        single_x.append(0)
                    if "LOCATION" in single_data["type"]:
                        single_x.append(1)
                    else:
                        single_x.append(0)
                test_X.append(single_x)
                test_y.append(labels[k])
            
            clf.fit(X_new,train_y)
            predicted_y = clf.predict(test_X)
            f1_vector.append(f1_score(test_y,predicted_y))




        average_f1 = sum(f1_vector)/(1.0*len(f1_vector))
        print "Average: %f" %(average_f1)
        if average_f1 > best_f1:
            best_f1 = average_f1
            best_size = feature_size
            
        print "-"*20

    print "best f1 is %f achieved by size %d" %(best_f1,best_size)
As our features in this problem are discrete, I will use Scikit-learn's mutual_info_classif function with the discrete_features=True flag:
"""

import numpy as np
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

datafile = '/model_data/training_data_newfeatures.csv'
data = pd.read_csv(datafile, delimiter='\t', dtype=str)
l_features = list(data.columns)

discrete_dataset = np.loadtxt(datafile, dtype=str, delimiter='\t')
X = discrete_dataset[:, :-1]
y = discrete_dataset[:, -1]

l_importance = mutual_info_classif(X, y, discrete_features=True)

resdict = {}
for i, res in enumerate(l_importance):
	
	# exclude data which is 1:1 with the job feature (as these will not be useful)
	if l_features[i] in ['... list of features ...']:
		continue
	resdict[l_features[i]] = res
print('MI:')
for elt in sorted(resdict.items(), key=lambda x: x[1], reverse=True):
    print(elt)

print('\n')

resdict2 = {}