Example #1
0
def featureSelection(reduced_features,
                     labels,
                     clnd_features,
                     percentile,
                     n_components,
                     results=False):
    """
        Rank features with a univariate ANOVA F-test (f_classif) and fit a PCA
        on the same data.

        Parameters:
            reduced_features = Unique feature names in python list after dropping
            non-numeric features. The first entry is skipped, so
            reduced_features[1:] must line up with the columns of clnd_features.
            labels = ground truth labels for the data points.
            clnd_features = data point features in numpy array format corresponding
            to the labels.
            percentile = percent of features to keep, handed to SelectPercentile
            unchanged (scikit-learn expects a value on the 0-100 scale).
            n_components = the n_components for the pca.
            results = False returns python list of selected features. If True
            returns the metrics of the feature selectors (F-statistic, and p-values
            from f_classif) and the pca explained variance ratios.

        Output:
            results falsy: list of the retained feature names with 'poi'
            inserted at position 0.
            results truthy: tuple (f_stat, p_vals, expl_var) where f_stat is a
            list of (name, F) pairs sorted by descending F, p_vals is a list of
            (name, p) pairs sorted by ascending p, and expl_var is the PCA
            explained_variance_ratio_ array.
    """
    from sklearn.feature_selection import SelectPercentile, f_classif
    from sklearn.decomposition import PCA
    from itertools import compress

    feature_names = reduced_features[1:]

    # Only the fitted state is needed, so fit() replaces fit_transform() and
    # the transformed matrices are never built.
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(clnd_features, labels)

    pca = PCA(n_components=n_components)
    pca.fit(clnd_features)

    if results:
        # Run the F-test once and reuse both outputs.
        f_scores, p_values = f_classif(clnd_features, labels)

        f_stat = sorted(zip(feature_names, f_scores),
                        key=lambda pair: pair[1], reverse=True)
        p_vals = sorted(zip(feature_names, p_values),
                        key=lambda pair: pair[1])

        return f_stat, p_vals, pca.explained_variance_ratio_

    # Boolean mask of the retained features -> names to be used for training.
    features_list = list(compress(feature_names, selector.get_support()))

    # add back in the 'poi' to the first position in the final features list
    features_list.insert(0, 'poi')

    return features_list
Example #2
0
def test_f_classif_constant_feature():
    """f_classif must emit a UserWarning when one feature column is constant."""
    features, target = make_classification(n_samples=10, n_features=5)
    # Overwrite the first column so it carries a single value everywhere.
    features[:, 0] = 2.0
    with pytest.warns(UserWarning):
        f_classif(features, target)
def test_f_classif():
    """
    Test whether the F test yields meaningful results
    on a simple simulated classification problem, for dense and sparse input.
    """
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    F, pv = f_classif(X, y)
    F_sparse, pv_sparse = f_classif(sparse.csr_matrix(X), y)
    # Plain asserts instead of the nose-style assert_true helper, matching the
    # other copy of this test in the file.
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    # With shuffle=False the first five columns are the informative/redundant ones.
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.0e-4).all()
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)
def test_f_classif():
    """Check the ANOVA F-test on simulated classification data (dense and sparse)."""
    dataset_params = dict(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    features, target = make_classification(**dataset_params)

    f_dense, p_dense = f_classif(features, target)
    f_sparse, p_sparse = f_classif(sparse.csr_matrix(features), target)

    # Scores are strictly positive and p-values lie strictly inside (0, 1).
    assert (f_dense > 0).all()
    assert (p_dense > 0).all()
    assert (p_dense < 1).all()
    # First five columns must be significant, the rest must not be extreme.
    assert (p_dense[:5] < 0.05).all()
    assert (p_dense[5:] > 1.e-4).all()
    # Sparse input has to reproduce the dense results.
    assert_array_almost_equal(f_sparse, f_dense)
    assert_array_almost_equal(p_sparse, p_dense)
Example #5
0
def SignificanceMatrix(data):
    """
    Build a symmetric column-by-column table of pairwise p-values.

    For every pair of columns the rows with a missing value in either column
    are dropped, then a test is chosen from the column types reported by
    check_type:
      * both "continuous"        -> f_regression p-value, rounded to 3 decimals
      * same type, not continuous -> chisq_independence (returned as-is)
      * mixed types              -> f_classif with the "continuous" column as
                                    the feature and the other column as target,
                                    rounded to 3 decimals
    The diagonal is set to 1.

    Parameters:
        data: table exposing .columns and .dtypes and supporting column-list
        indexing (presumably a pandas DataFrame; verify against caller).

    Returns:
        DataFrame indexed and labelled by the columns of data; any cell still
        missing is replaced by the string "NAN".
    """
    col = data.columns
    colTypes = [ check_type(x) for x in data.dtypes ]
    relationMatrix = pd.DataFrame(index=col,columns=col)

    # Only the upper triangle (j >= i) is computed; results are mirrored below.
    for i in range(len(col)):
        for j in range(i, len(col)):
            if i==j:
                pval = 1
                relationMatrix.loc[col[i],col[j]] = pval
            else:
                tempdata = data[[col[i],col[j]]]
                tempdata = tempdata.dropna(axis=0)   # TODO: add a warning when rows with missing data are removed
                col1 = tempdata[col[i]]
                # NOTE(review): col2 is flattened with ravel() while col1 is
                # kept as a column object - confirm both forms are accepted
                # by the installed pandas/scikit-learn versions.
                col2 = tempdata[col[j]].ravel()
                # print tempdata.dtypes
                # print colTypes[i],colTypes[j]
                if colTypes[i] == colTypes[j]:
                    if colTypes[i] == "continuous":
                        # print "both cont"
                        pval = np.round(feature_selection.f_regression(pd.DataFrame(col1),col2)[1][0],3)
                    else:
                        pval = chisq_independence(tempdata[col[i]],tempdata[col[j]])                        
                else:
                    # Mixed types: the "continuous" column is always the feature.
                    if colTypes[i] == "continuous":
                        pval = np.round(feature_selection.f_classif(pd.DataFrame(col1),col2)[1][0],3)
                    else:
                        pval = np.round(feature_selection.f_classif(pd.DataFrame(col2),col1)[1][0],3)
                # Write the value to both (i, j) and (j, i) to keep the table symmetric.
                relationMatrix.loc[col[i],col[j]] = pval
                relationMatrix.loc[col[j],col[i]] = pval

    return relationMatrix.fillna("NAN")
Example #6
0
def compute_fscore(data_set_df, user_info_df, label='gender', min_not_nan=-1):
    """
    Score every row of the filtered data set with a one-feature ANOVA F-test.

    Parameters:
        data_set_df, user_info_df, label: forwarded unchanged to
            pc.get_filtered_x_y, which returns the table to score and the
            target vector (presumably one row per feature and one column per
            user; verify against that helper).
        min_not_nan: if negative, missing values in a row are replaced by the
            row mean before testing. Otherwise missing values are dropped and
            rows with fewer than min_not_nan remaining values get NaN.

    Returns:
        DataFrame with a single 'importance' column indexed like the filtered
        table, sorted by descending F-score with NaN last. Infinite scores are
        stored as NaN. Rows whose test raises ValueError keep their initial
        value of 0.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    feature_fs = DataFrame(np.zeros(len(df_filtered)), index=df_filtered.index, columns=['importance'])
    done = 0
    for index, values in df_filtered.iterrows():
        try:
            if min_not_nan < 0:
                # Go through .values before adding the axis: indexing a Series
                # with [:, np.newaxis] is rejected by current pandas, and that
                # error would be swallowed by the handler below.
                column = values.fillna(values.mean()).values[:, np.newaxis]
                score = f_classif(column, y_v)[0][0]
            else:
                nan_removed = values.dropna()
                if len(nan_removed) < min_not_nan:
                    score = np.nan
                else:
                    column = nan_removed.values[:, np.newaxis]
                    score = f_classif(column, y_v[nan_removed.index.astype(int)])[0][0]
            feature_fs.loc[index] = score if np.isfinite(score) else np.nan
        except ValueError:
            # Best effort: skip rows the F-test cannot handle.
            # NOTE(review): skipped rows keep importance 0 rather than NaN - confirm intended.
            continue
        if done > 0 and done % 10000 == 0:
            # Single-argument call form works on both Python 2 and Python 3.
            print("\t\t\t%s features are done" % done)
        done += 1
    feature_fs.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_fs
Example #7
0
def perform_ANOVA(blind=False):
    """
    Print a CSV-style table of ANOVA F-values and p-values, one line per feature.

    The module-level `res` data set is tested unless `blind` is set, in which
    case `b_res` is used. Attribute names are always read from
    `res.attributes`, offset by 3.
    """
    dataset = res if blind == False else b_res
    fval, pval = f_classif(dataset.data, dataset.target)

    print('Attribute,f-value,p-value')
    for i in range(len(pval)):
        row = [res.attributes[i + 3], str(fval[i]), str(pval[i])]
        print(','.join(row))
def FStat(vals, labels):
    """
    Return the ANOVA F-statistic of `vals` against `labels`.

    A one-dimensional input is treated as a single feature and a scalar score
    is returned; any other input is handed to f_classif as-is and the whole
    score array is returned.
    """
    samples = np.array(vals)
    single_feature = samples.ndim == 1
    if single_feature:
        samples = samples.reshape(-1, 1)

    f_scores, _ = fs.f_classif(samples, labels)
    return f_scores[0] if single_feature else f_scores
Example #9
0
def main():
    """
    Command-line entry point: rank "neuron" feature columns by ANOVA F-score.

    Parses the command-line options, builds a data frame with make_dataframe,
    keeps the columns whose name matches the regex 'neur', and prints the
    top --nb_top_features columns by F-score, first for each genre taken
    one-vs-rest and then for the genre column as a whole.
    """
    # parse params:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_path',
        default='./Skipthoughts-2018_01_13-13_11_19-13.726/model.pt',
        type=str)
    parser.add_argument(
        '--books_path',
        default='/Users/mike/GitRepos/potter/data/other/books_txt_full',
        type=str)
    parser.add_argument('--tokenize', action='store_true')
    parser.add_argument(
        '--dict_path',
        default='./Skipthoughts-2018_01_13-13_11_19-13.726/model.dict.pt',
        type=str)
    parser.add_argument('--gpu', action='store_true')

    parser.add_argument('--max_sent_len', default=25, type=int)
    parser.add_argument('--min_sent_len', default=5, type=int)
    parser.add_argument('--sents_per_book', default=100, type=int)
    parser.add_argument('--books_per_genre', default=20, type=int)

    parser.add_argument('--nb_top_features', default=5, type=int)

    args = parser.parse_args()

    # load model and dict
    # (loading is disabled here; make_dataframe is called with model=None, vocab=None)
    #model = u.load_model(args.model_path)
    #vocab_dict = u.load_model(args.dict_path)

    data = make_dataframe(args, model=None, vocab=None)

    # Feature matrix: every column whose name matches 'neur'.
    X = data.filter(regex='neur')

    print('=======\n single-neuron binary classification (one vs rest):')
    for genre in set(data['genre']):
        print(f'  Testing genre {genre}:')
        # 1 for rows of the current genre, 0 for all others
        y = [int(i) for i in data.genre == genre]
        # univariate feature selection with F-test for feature scoring
        F, pval = f_classif(X, y)
        # indices of the highest F-scores, best first
        max_idxs = np.argsort(F)[::-1][:args.nb_top_features]
        neuron_names, neuron_f_scores = np.array(
            X.columns)[max_idxs], F[max_idxs]

        for name, score in zip(neuron_names, neuron_f_scores):
            print(f'      {name} -> {score:.2f} F-score')

    # categorical case: the genre column itself is the target
    print('=======\n single-neuron binary genre classification:')
    F, pval = f_classif(X, data['genre'])
    max_idxs = np.argsort(F)[::-1][:args.nb_top_features]
    neuron_names, neuron_f_scores = np.array(X.columns)[max_idxs], F[max_idxs]

    for name, score in zip(neuron_names, neuron_f_scores):
        print(f'      {name} -> {score:.2f} F-score')

    # Only the header is printed here; no cross-validation follows in this function.
    print('=======\n all-neuron genre classification (5-fold CV):')
Example #10
0
def pvalue(path):
    '''
    Compute ANOVA p-values of the extracted features for both label sets.

    Reads <path>/mpe/mpe_features.csv and <path>/label.csv, keeps feature
    columns 515 onward, runs f_classif against the arousal and the valence
    labels, writes the selected feature names to 'data/s_gsr_rcmpe_name', and
    prints the three best features and every feature with p < 0.05 for each
    label set. Returns nothing.
    '''
    # read extracted features
    amigos_data = np.loadtxt(os.path.join(path, 'mpe', 'mpe_features.csv'),
                             delimiter=',')
    # Alternative column ranges: [:, :500] EEG, [:, 500:515] ECG
    amigos_data = amigos_data[:, 515:]  # GSR
    # NOTE(review): indices below are relative to the sliced matrix -
    # confirm FEATURE_NAMES is indexed the same way.

    # read labels and split to 0 and 1 by
    a_labels, v_labels = read_labels(os.path.join(path, 'label.csv'))

    # calculate p-value
    _, a_pvalues = f_classif(amigos_data, a_labels)
    _, v_pvalues = f_classif(amigos_data, v_labels)

    # Sort each p-value vector once and reuse the ordering below.
    a_order = np.argsort(a_pvalues)
    v_order = np.argsort(v_pvalues)

    # arousal: 20 lowest p-values
    a_saved_name = [FEATURE_NAMES[idx] for idx in a_order[:20]]
    # valence
    # NOTE(review): [:0] selects no valence features - confirm this is intentional.
    v_saved_name = [FEATURE_NAMES[idx] for idx in v_order[:0]]

    with open('data/s_gsr_rcmpe_name', 'w') as f:
        for name in a_saved_name + v_saved_name:
            f.write("{}\n".format(name))

    for title, pvalues, order in (('Arousal', a_pvalues, a_order),
                                  ('Valence', v_pvalues, v_order)):
        print(title)
        for idx in order[:3]:
            print(FEATURE_NAMES[idx], pvalues[idx])

    for title, pvalues in (('\nUse Arousal Labels', a_pvalues),
                           ('\nUse Valence Labels', v_pvalues)):
        print(title)
        # Locate the significant features once instead of on every iteration.
        significant = np.where(pvalues < 0.05)[0]
        print("Number of features (p < 0.05): {}".format(significant.size))
        for idx in significant:
            print("Features: {}, Value: {:.4f}".format(
                FEATURE_NAMES[idx], pvalues[idx]))
def featureSelection(reduced_features,labels,clnd_features,percentile,n_components,results=False):
    """
        Rank features with a univariate ANOVA F-test (f_classif) and fit a PCA
        on the same data.

        Parameters:
            reduced_features = Unique feature names in python list after dropping
            non-numeric features. The first entry is skipped, so
            reduced_features[1:] must line up with the columns of clnd_features.
            labels = ground truth labels for the data points.
            clnd_features = data point features in numpy array format corresponding
            to the labels.
            percentile = percent of features to keep, handed to SelectPercentile
            unchanged (scikit-learn expects a value on the 0-100 scale).
            n_components = the n_components for the pca.
            results = False returns python list of selected features. If True
            returns the metrics of the feature selectors (F-statistic, and p-values
            from f_classif) and the pca explained variance ratios.

        Output:
            results falsy: list of the retained feature names with 'poi'
            inserted at position 0.
            results truthy: tuple (f_stat, p_vals, expl_var) where f_stat is a
            list of (name, F) pairs sorted by descending F, p_vals is a list of
            (name, p) pairs sorted by ascending p, and expl_var is the PCA
            explained_variance_ratio_ array.
    """
    from sklearn.feature_selection import SelectPercentile, f_classif
    from sklearn.decomposition import PCA
    from itertools import compress

    feature_names = reduced_features[1:]

    # Only the fitted state is needed, so fit() replaces fit_transform() and
    # the transformed matrices are never built.
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(clnd_features, labels)

    pca = PCA(n_components=n_components)
    pca.fit(clnd_features)

    if results:
        # Run the F-test once and reuse both outputs.
        f_scores, p_values = f_classif(clnd_features, labels)

        f_stat = sorted(zip(feature_names, f_scores),
                        key=lambda pair: pair[1], reverse=True)
        p_vals = sorted(zip(feature_names, p_values),
                        key=lambda pair: pair[1])

        return f_stat, p_vals, pca.explained_variance_ratio_

    # Boolean mask of the retained features -> names to be used for training.
    features_list = list(compress(feature_names, selector.get_support()))

    # add back in the 'poi' to the first position in the final features list
    features_list.insert(0, 'poi')

    return features_list
Example #12
0
def plot_f_test(all_ds, ft_names="MRIQC", save_path=None):
    """
    Draw two grouped horizontal bar charts of per-feature ANOVA results.

    The first figure shows the F-value of every feature for each data set,
    the second shows 1 - p-value. Bars of different data sets are placed side
    by side by shifting each data set's ticks by one unit.

    Parameters:
        all_ds: mapping of data set name -> mapping with keys "x" (feature
            table exposing .columns and .values) and "y" (target exposing
            .values).
        ft_names: label of the feature family, used in titles and file names.
        save_path: if truthy, directory in which both figures are saved.
    """
    margin = 0
    size = len(all_ds)
    plt.figure(figsize=(20, 15))
    for ds_name, data in all_ds.items():
        x, y = data["x"], data["y"]

        # Encode the "site" column on a copy so the caller's table is untouched.
        if "site" in x.columns:
            x = data["x"].copy()
            x["site"] = encode_vals(x["site"])
        x_data, y_data = x.values, y.values
        fval, pval = f_classif(x_data, y_data)
        # One tick group per feature, spaced size + 1 apart to leave room
        # for one bar per data set.
        ticks = np.array(range(0, size * len(fval) + len(fval), size + 1))
        #### F VALUE
        #plt.figure(figsize=(20, 15))
        plt.barh(ticks + margin, fval, height=.50, label=ds_name)
        plt.yticks(ticks + margin, x.columns, rotation="horizontal")
        plt.ylabel("Feature")
        plt.xlabel("F-values")
        margin += 1.0
    plt.title(
        "F-value of the {} features for various datasets".format(ft_names))
    plt.legend()
    if save_path:
        # NOTE(review): ds_name is the last data set of the loop above (and is
        # undefined when all_ds is empty) - confirm it belongs in the file name.
        plt.savefig(opj(save_path, "fvals_{}_{}.png".format(ds_name,
                                                            ft_names)))

    plt.figure(figsize=(20, 15))
    margin = 0
    for ds_name, data in all_ds.items():
        x, y = data["x"], data["y"]

        if "site" in x.columns:
            x = data["x"].copy()
            x["site"] = encode_vals(x["site"])
        x_data, y_data = x.values, y.values
        # The F-test is run again for this figure; only pval is used here.
        fval, pval = f_classif(x_data, y_data)
        ticks = np.array(range(0, size * len(pval) + len(pval), size + 1))
        #### P VALUE (plotted as 1 - p so that longer bars mean more significant)
        #plt.figure(figsize=(20, 15))
        plt.barh(ticks + margin, 1 - pval, height=.50, label=ds_name)
        plt.yticks(ticks + margin, x.columns, rotation="horizontal")
        plt.ylabel("Feature")
        plt.xlabel("1-P-value")
        margin += 1.0

    plt.title(
        "P-values of the {} features for various datasets".format(ft_names))
    plt.legend()
    if save_path:
        # Same caveat as above: ds_name is whatever the loop ended on.
        plt.savefig(opj(save_path, "pvals_{}_{}.png".format(ds_name,
                                                            ft_names)))
Example #13
0
 def fit(self, X, y):
     """
     Score the features of X against y and store the selected column indices.

     The scorer is chosen by self.type; self.index receives the indices whose
     score is below self.threshold ("f_value", "p_value", "chi2") or above it
     ("mutual_info"). An unrecognised type leaves self.index untouched.
     Returns self.
     """
     kind = self.type
     if kind == "f_value":
         scores = f_classif(X, y)[0]
         self.index = np.where(scores < self.threshold)[0]
     elif kind == "p_value":
         scores = f_classif(X, y)[1]
         self.index = np.where(scores < self.threshold)[0]
     elif kind == "mutual_info":
         scores = mutual_info_classif(X, y)
         self.index = np.where(scores > self.threshold)[0]
     elif kind == "chi2":
         scores = chi2(X, y)[0]
         self.index = np.where(scores < self.threshold)[0]
     return self
def return_2allele_Pval_df(assoc_test_type, two_clusterPercent_df, two_clst_df,
                           X_allez, y1):
    """
    Fill a copy of two_clusterPercent_df with association p-values.

    Every (row label, column label) pair is split on "_"; the part before the
    first underscore is compared between the two labels and the part after it
    may be empty. (Presumably prefix = cluster and suffix = allele; verify
    against the caller.)

    Parameters:
        assoc_test_type: "chi2" or "f_classif"; also forwarded to
            allele_cooccurence_pValue.
        two_clusterPercent_df: table whose index/columns define the pairs; it
            is copied, not modified.
        two_clst_df: table consulted when both labels carry a suffix; cells
            equal to "-" are skipped.
        X_allez: feature table indexed by label name.
        y1: target passed to the tests.

    Returns:
        The copied table with the computed p-values written in; cells that no
        branch touches keep their original value.
    """
    two_clusterPval_df = two_clusterPercent_df.copy()
    for cl1, row in two_clusterPercent_df.iterrows():
        for cl2 in row.index:
            #         print cl1, cl2
            #         print len(cl1.split("_")), len(cl2.split("_")[1])

            # Different prefixes: use the pairwise helper.
            if cl1.split("_")[0] != cl2.split("_")[0]:
                if len(cl1.split("_")[1]) == 0:
                    # Only cl2 has a suffix -> take the entry keyed by cl2.
                    if len(cl2.split("_")[1]) > 0:
                        allele_pVal_dict = allele_cooccurence_pValue(
                            cl1, cl2, y1, X_allez, assoc_test_type)
                        two_clusterPval_df.loc[
                            cl1, cl2] = allele_pVal_dict[cl2]["pVal"]

                elif len(cl2.split("_")[1]) == 0:
                    # Only cl1 has a suffix -> take the entry keyed by cl1.
                    if len(cl1.split("_")[1]) > 0:
                        allele_pVal_dict = allele_cooccurence_pValue(
                            cl1, cl2, y1, X_allez, assoc_test_type)
                        two_clusterPval_df.loc[
                            cl1, cl2] = allele_pVal_dict[cl1]["pVal"]

                else:
                    # Both have a suffix -> co-occurrence p-value, unless the
                    # pair is marked "-" in two_clst_df.
                    if two_clst_df.loc[cl1, cl2] != "-":
                        allele_pVal_dict = allele_cooccurence_pValue(
                            cl1, cl2, y1, X_allez, assoc_test_type)
                        two_clusterPval_df.loc[
                            cl1, cl2] = allele_pVal_dict["cooccurence"]["pVal"]

            # Same prefix: single-column test on whichever label has a suffix
            # (cl2 takes precedence). The column is selected twice so the test
            # receives a two-column table; only the first p-value is used.
            elif cl1.split("_")[0] == cl2.split("_")[0] and len(
                    cl2.split("_")[1]) > 0:
                # NOTE(review): any other assoc_test_type leaves pVal_single
                # unbound (or stale from an earlier pair).
                if assoc_test_type == "chi2":
                    test_single, pVal_single = chi2(X_allez[[cl2, cl2]], y1)
                elif assoc_test_type == "f_classif":
                    test_single, pVal_single = f_classif(
                        X_allez[[cl2, cl2]], y1)
                two_clusterPval_df.loc[cl1, cl2] = pVal_single[0]

            elif cl1.split("_")[0] == cl2.split("_")[0] and len(
                    cl1.split("_")[1]) > 0:
                if assoc_test_type == "chi2":
                    test_single, pVal_single = chi2(X_allez[[cl1, cl1]], y1)
                elif assoc_test_type == "f_classif":
                    test_single, pVal_single = f_classif(
                        X_allez[[cl1, cl1]], y1)
                two_clusterPval_df.loc[cl1, cl2] = pVal_single[0]

    return two_clusterPval_df
Example #15
0
def train_test(x_train, x_test, y_train, y_test):
    """
    Keep the top 75 percent of features, plot diagnostics, and report the test
    accuracy of a decision tree and a random forest trained on them.

    Shows three figures (F-test output per feature, the selection mask, the
    fitted tree) and prints both accuracies as percentages. Returns nothing.
    """
    selector = SelectPercentile(percentile=75)
    selector.fit(x_train, y_train)
    train_subset = selector.transform(x_train)

    # NOTE(review): the plotted values are the p-values although the axis is
    # labelled 'F-score' - confirm which one is intended.
    _, p_values = f_classif(x_train, y_train)
    plt.figure()
    plt.plot(p_values, 'o')
    plt.xlabel('Counts of Features')
    plt.ylabel('F-score')
    plt.title('The sample distribution of F-Test')
    plt.show()

    # One-row image of the boolean selection mask.
    support_mask = selector.get_support()
    plt.matshow(support_mask.reshape(1, -1), cmap='gray_r')
    plt.title('Feature selection percentage')
    plt.show()
    test_subset = selector.transform(x_test)

    tree_model = tree.DecisionTreeClassifier()
    tree_model = tree_model.fit(train_subset, y_train)
    tree_predictions = tree_model.predict(test_subset)
    tree.plot_tree(tree_model)
    plt.title('Tree model plot')
    plt.show()
    tree_accuracy = 100 * accuracy_score(tree_predictions, y_test)
    print("Decision Tree Classifier Accuracy - " + str(tree_accuracy) + "%")

    forest = RandomForestClassifier(n_estimators=500,
                                    n_jobs=5,
                                    max_depth=50,
                                    random_state=0)
    forest.fit(train_subset, y_train)
    forest_predictions = forest.predict(test_subset)
    forest_accuracy = 100 * accuracy_score(forest_predictions, y_test)
    print("Random Forest Accuracy - " + str(forest_accuracy) + "%")
def main(file, oligosList):     
    """
    Run an ANOVA F-test for the selected rows of a tab-separated table.

    Parameters:
        file: path of a header-less, tab-separated file. Column 0 holds the
            row names and the last row holds the class labels.
        oligosList: names (values of column 0) of the rows to test.

    The resulting table of F-scores and p-values (one column per entry of
    oligosList) is handed to outputFile, then closeFunc is called.
    """
    data = pd.read_csv(file, sep='\t', header = None)
    data = pd.DataFrame(data)
    # Drop columns that are entirely empty.
    data = data.dropna(axis=1,how='all')
    # Last row, without its name cell, becomes the target vector.
    classifier = data.tail(1)
    del classifier[0]
    classifier = classifier.unstack()
    dataCropped = pd.DataFrame([])
    # Collect the rows whose name is in oligosList, in oligosList order.
    # NOTE(review): a name occurring k times in column 0 is appended k times
    # per occurrence (k*k rows) - confirm names are unique.
    for i in oligosList:
        for j in data[0]:
            if j == i:
                dt = data.loc[data[0] == i]
                dataCropped = pd.concat([dataCropped, dt])
    # Transpose so the selected rows become feature columns.
    data = pd.DataFrame.transpose(dataCropped)
    #Removing oligo names from dataset
    data = data.drop(data.index[0])
    #For selecting a number of best features:
    #result = SelectKBest(f_classif, k="all").fit_transform(data, classifier)
    result = f_classif(data, classifier)
    Fval = result[0]
    Pval = result[1]
    # Requires exactly one score per entry of oligosList.
    result = pd.DataFrame(data = [Fval, Pval], index = ['F-score', 'p-value'], columns = oligosList)
    #print(result)
    outputFile(result, file, oligosList)
    closeFunc()
Example #17
0
    def select(self, dataframe: 'pd.DataFrame', y_column: str) -> list:
        '''
            Select the most important columns by repeatedly dropping the
            feature(s) with the highest p-value until every remaining p-value
            is at or below self.significance_level.

            The F-values and p-values of every pass are recorded per column in
            self.F_history and self.p_value_history (-1 for columns already
            dropped), and self.iter is incremented once per pass.
        :param dataframe: pandas DataFrame
             Data Frame on which the algorithm is applied
        :param y_column: str
             The column name of the value that we want to predict
        :return: list
            The list of features that are selected by the algorithm as the best ones
        '''
        # Defining the list with names of columns except the predicted one
        X_columns = [col for col in dataframe.columns if col != y_column]

        # Creating the F and p-value history dictionaries
        self.F_history = {}
        self.p_value_history = {}
        for col in X_columns:
            self.F_history[col] = []
            self.p_value_history[col] = []

        # Defining the feature states (1 = still selected, 0 = dropped)
        feature_state = list(np.ones(len(X_columns)))
        while True:
            # NOTE(review): self.iter is only incremented here - it must be
            # initialised elsewhere; confirm.
            self.iter += 1

            # Extracting the selected columns
            X_cols = self.bin_to_cols(feature_state, X_columns)
            X = dataframe[X_cols].values
            y = dataframe[y_column].values

            # Choosing a different test depending on whether it is a classification or a regression.
            if self.classification:
                F_vals, p_vals = f_classif(X, y)
            else:
                F_vals, p_vals = f_regression(X, y)
            # Scores are ordered like X_cols; walk X_columns and consume one
            # score for every column that is still selected.
            index = 0
            for col in X_columns:
                if col in X_cols:
                    self.F_history[col].append(float(F_vals[index]))
                    self.p_value_history[col].append(float(p_vals[index]))
                    index += 1
                else:
                    self.F_history[col].append(-1)
                    self.p_value_history[col].append(-1)

            # Choosing the max value of p-value
            max_PValue = max(p_vals)

            # Dropping every column whose p-value equals the maximum, if that maximum is
            # higher than the significance level (ties are removed together)
            if max_PValue > self.significance_level:
                for j in range(len(X_cols)):
                    if p_vals[j].astype(float) == max_PValue:
                        feature_state[X_columns.index(X_cols[j])] = 0
            else:
                break

        # Returning the chosen columns.
        self.choosed_cols = self.bin_to_cols(feature_state, X_columns)
        return self.choosed_cols
Example #18
0
    def _SelectKBest(self, X, y):
        """
        Reduce X to its self.k_features best columns by ANOVA F-score.

        Besides transforming X, the mask of selected features is projected
        into the volume loaded from self.mask_non_brain and written to
        '/tmp/best.nii.gz'. The fitted selector is kept in
        self.feature_reduction_method.

        Parameters:
            X: feature matrix; its column count must equal the number of
               non-zero entries of the mask volume.
            y: target labels.

        Returns:
            X restricted to the selected columns.
        """

        print('Selecting K Best from whole image')

        from sklearn.feature_selection import SelectKBest, f_classif

        # ### Define the dimension reduction to be used.
        # Here we use a classical univariate feature selection based on F-test,
        # namely Anova. The number of features to keep is self.k_features.
        feature_selection = SelectKBest(f_classif, k=self.k_features)

        feature_selection.fit(X, y)

        # Reuse the F-scores computed during fit instead of running the
        # F-test on the whole matrix a second time.
        scores = feature_selection.scores_
        mask_k_best = np.zeros(scores.shape, dtype=bool)
        mask_k_best[np.argsort(scores, kind="mergesort")[-self.k_features:]]\
            = True
        import nibabel
        # NOTE(review): get_data() is deprecated in recent nibabel releases -
        # confirm the installed version before switching to get_fdata().
        mask_brain_img = nibabel.load(self.mask_non_brain).get_data()
        mask_brain = mask_brain_img.flatten().astype(bool)

        # Scatter the per-feature selection back into the flattened volume.
        roi = np.zeros(mask_brain.flatten().shape)
        roi[mask_brain] = mask_k_best
        roi = roi.reshape(mask_brain_img.shape)

        # Saved with an identity affine.
        img = nibabel.Nifti1Image(roi, np.eye(4))
        img.to_filename('/tmp/best.nii.gz')

        print('SelectKBest data reduction from: %s' % str(X.shape))
        X = feature_selection.transform(X)
        print('SelectKBest data reduction to: %s' % str(X.shape))

        self.feature_reduction_method = feature_selection

        return X
Example #19
0
def test_randomized_logistic():
    """Randomized sparse logistic regression must rank features like the F-test."""
    iris = load_iris()
    # Two features, and only the samples of classes 0 and 1.
    two_class = iris.target != 2
    X = iris.data[:, [0, 2]][two_class]
    y = iris.target[two_class]

    F, _ = f_classif(X, y)

    scaling = 0.3
    shared_params = dict(verbose=False,
                         random_state=42,
                         scaling=scaling,
                         n_resampling=50,
                         tol=1e-3)

    # Scalar regularisation strength.
    clf = RandomizedLogisticRegression(C=1., **shared_params)
    X_orig = X.copy()
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(X, X_orig)  # fit does not modify X
    assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    # List of regularisation strengths.
    clf = RandomizedLogisticRegression(C=[1., 0.5], **shared_params)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    # A nested list is rejected at fit time.
    clf = RandomizedLogisticRegression(verbose=False, C=[[1., 0.5]])
    assert_raises(ValueError, clf.fit, X, y)
Example #20
0
    def compute_scoring_func(self, func):
        """
        Compute per-feature scores with the scoring function named `func`.

        Parameters:
            func: one of 'variance', 'f_classif', 'mutual_info_classif',
                'chi2'.

        Returns:
            A pair (scores, p_values). p_values is None for 'variance' and
            'mutual_info_classif'; for 'f_classif' and 'chi2' the pair is
            whatever the scikit-learn function returns.

        'variance' is computed on self.instances; the other scorers use
        self.annotated_instances and the supervision selected by
        self.multiclass.

        Raises:
            AssertionError: if `func` is not one of the supported names.
        """
        if func == 'variance':
            features = self.instances.features.get_values()
            # The labels are not used for the variance; the call is retained
            # in case it has side effects - NOTE(review): confirm and drop.
            self.instances.annotations.get_labels()
            if isinstance(features, spmatrix):
                variance = mean_variance_axis(features, axis=0)[1]
            else:
                variance = features.var(axis=0)
            return variance, None

        features = self.annotated_instances.features.get_values()
        annotations = self.annotated_instances.annotations.get_supervision(
            self.multiclass)
        if func == 'f_classif':
            return f_classif(features, annotations)
        if func == 'mutual_info_classif':
            if isinstance(features, spmatrix):
                # Sparse input: every feature is treated as discrete.
                discrete_indexes = True
            else:
                # Dense input: only the binary features are discrete.
                features_types = self.instances.features.info.types
                discrete_indexes = [
                    i for i, t in enumerate(features_types)
                    if t == FeatureType.binary
                ]
                if not discrete_indexes:
                    discrete_indexes = False
            scores = mutual_info_classif(features,
                                         annotations,
                                         discrete_features=discrete_indexes)
            return scores, None
        if func == 'chi2':
            return chi2(features, annotations)
        # Raised explicitly: a bare `assert` is stripped under `python -O`
        # and would let an unknown name fall through and return None.
        raise AssertionError('unsupported scoring function: %r' % (func,))
Example #21
0
def generateStopWordsText(new_sample_set, sub_num):
    """Build a stop-word dictionary, pickle it, and return the file name.

    Each entry of new_sample_set is a line "<label>\\t<text>". The text is
    segmented with jieba and turned into tf-idf features; every vocabulary
    word whose ANOVA F-score is <= 1.0 or whose p-value is > 0.05 is recorded
    as a stop word. The dictionary {word: 1} is pickled to the file whose name
    is derived from sub_num, and that name is returned.
    """
    stopWord_file = "model\\stopWordDoc_%d.pkl" % sub_num

    y, texts = [], []
    for eachline in new_sample_set:
        label, string = eachline.strip('\n').split('\t', 1)

        y.append(label)
        texts.append(' '.join(jieba.cut(string)))
    # String -> feature vector
    Tfidf_vectorizer = TfidfVectorizer()  # tf-idf feature generator
    Tfidf_vectorizer.fit(texts)  # fit (the fitted model should be saved when building the model)
    X = Tfidf_vectorizer.transform(texts)

    # vocabulary_ maps word -> column index; order the words by that index so
    # they line up with the columns of X.
    words_dict = Tfidf_vectorizer.vocabulary_
    words_list = sorted(words_dict, key=words_dict.get)
    f_score, p_val = f_classif(X, y)  # Anova

    stopword = {
        word: 1
        for word, score, p in zip(words_list, f_score, p_val)
        if score <= 1.0 or p > 0.05
    }

    # The context manager closes the file even if pickling fails.
    with open(stopWord_file, 'wb') as out:
        pickle.dump(stopword, out)

    return stopWord_file
Example #22
0
def get_relevance(feat, y_class, relevance_func='mutual_info'):
    """Score every feature column of ``feat`` against the class labels.

    Parameters:
        feat: 2-D array-like, one column per feature.
        y_class: class label for every row of ``feat``.
        relevance_func: either one of the strings ``'f_classif'``, ``'chi2'``,
            ``'mutual_info'`` or ``'kruskal'``, or a callable invoked as
            ``relevance_func(column, y_class)`` that returns one score.

    Returns:
        1-D array with one relevance score per feature. For ``'kruskal'`` a
        feature whose test cannot be computed is scored ``nan``.

    Raises:
        ValueError: if ``relevance_func`` is a string that is not supported.
    """
    feat = np.array(feat)

    if not isinstance(relevance_func, str):
        # User-supplied scoring callable, applied column by column.
        relevance = np.zeros(feat.shape[1])
        for i in range(feat.shape[1]):
            relevance[i] = relevance_func(feat[:, i], y_class)
        return relevance

    # Imports are done lazily so that each scorer only needs its own library.
    if relevance_func == 'f_classif':
        from sklearn.feature_selection import f_classif
        relevance, _ = f_classif(feat, y_class)
    elif relevance_func == 'chi2':
        from sklearn.feature_selection import chi2
        relevance, _ = chi2(feat, y_class)
    elif relevance_func == 'mutual_info':
        from sklearn.feature_selection import mutual_info_classif
        relevance = mutual_info_classif(feat, y_class)
    elif relevance_func == 'kruskal':
        from scipy.stats import kruskal
        # The boolean masks below need an array; a plain list compared with a
        # scalar would collapse to a single False.
        labels = np.asarray(y_class)
        classes = np.unique(labels)
        relevance = np.zeros(feat.shape[1])
        for i, ft in enumerate(feat.T):
            try:
                relevance[i], _ = kruskal(
                    *[ft[labels == iy] for iy in classes])
            except Exception:
                # Best effort: a feature that cannot be tested gets nan.
                relevance[i] = np.nan
    else:
        raise ValueError(
            "unknown relevance_func: %r" % (relevance_func,))

    return relevance
Example #23
0
    def two_way(self):
        """Append pairwise-product (two-way interaction) columns to ``self.numData``.

        Does nothing unless ``self.tWay`` is set. When ``self.fit`` is set,
        every pair ``(i, j)`` of the first ``self.sz`` columns is multiplied
        and the product is kept (appended to ``self.numData`` and recorded in
        ``self.two_way_list``) only if its ANOVA F-value against
        ``self.target`` is at least 1. Otherwise the pairs already stored in
        ``self.two_way_list`` are re-applied to ``self.numData``.
        """
        if not self.tWay:
            return

        if self.fit:
            sz = self.sz
            for i in range(sz - 1):
                for j in range(i + 1, sz):
                    newCol = np.multiply(self.numData[:, i],
                                         self.numData[:, j])
                    # Normalise negative zeros to plain zeros.
                    newCol[newCol == 0] = 0

                    # f_classif needs a 2-D (n_samples, 1) float array; a
                    # reshape replaces the former element-by-element copy
                    # into a deprecated np.matrix.
                    candidate = np.asarray(newCol, dtype=float).reshape(-1, 1)
                    f, _ = f_classif(candidate, self.target)
                    if f[0] >= 1:
                        self.two_way_list.append((i, j))
                        self.numData = np.column_stack(
                            (self.numData, newCol))
        else:
            for pair in self.two_way_list:
                newCol = np.multiply(self.numData[:, pair[0]],
                                     self.numData[:, pair[1]])
                self.numData = np.column_stack((self.numData, newCol))
Example #24
0
    def _SelectKBest(self, X, y):
        """Reduce ``X`` to its ``self.k_features`` best features by ANOVA F-test.

        Fits a ``SelectKBest`` selector, writes the selected positions as a
        NIfTI volume to ``/tmp/best.nii.gz`` (laid out through the mask image
        at ``self.mask_non_brain``), stores the fitted selector in
        ``self.feature_reduction_method`` and returns the reduced ``X``.
        """
        print('Selecting K Best from whole image')

        from sklearn.feature_selection import SelectKBest, f_classif
        import nibabel

        # Classical univariate feature selection based on the F-test (ANOVA);
        # the number of features kept is self.k_features.
        feature_selection = SelectKBest(f_classif, k=self.k_features)
        feature_selection.fit(X, y)

        # Reuse the scores computed during fit instead of running the F-test
        # a second time.
        scores = feature_selection.scores_
        mask_k_best = np.zeros(scores.shape, dtype=bool)
        mask_k_best[np.argsort(scores, kind="mergesort")[-self.k_features:]]\
            = 1

        # ``dataobj`` access replaces the ``get_data()`` accessor that newer
        # nibabel releases no longer provide.
        mask_brain_img = np.asanyarray(
            nibabel.load(self.mask_non_brain).dataobj)
        mask_brain = mask_brain_img.flatten().astype(bool)

        # Scatter the per-feature selection flags back into image space.
        roi = np.zeros(mask_brain.flatten().shape)
        roi[mask_brain] = mask_k_best
        roi = roi.reshape(mask_brain_img.shape)

        img = nibabel.Nifti1Image(roi, np.eye(4))
        img.to_filename('/tmp/best.nii.gz')

        print('SelectKBest data reduction from: %s' % str(X.shape))
        X = feature_selection.transform(X)
        print('SelectKBest data reduction to: %s' % str(X.shape))

        self.feature_reduction_method = feature_selection

        return X
 def independance_feature(self):
     """Build a feature-vs-feature matrix of ANOVA p-values and save it as CSV.

     Each feature except the last is used in turn as the target of
     ``f_classif`` against the columns still present in ``self.df``; the
     resulting p-values fill one row of the matrix. The processed feature is
     then dropped from ``self.df`` (while more than one column remains), so
     this method consumes ``self.df``. The matrix plus its transpose is
     stored in ``self.res2`` and written under ``self.path_save_model``.
     """
     # Square matrix indexed by position in table_of_truth.features_name.
     res = np.zeros((len(self.table_of_truth.features_name),
                     len(self.table_of_truth.features_name)))
     for i, _ in enumerate(tqdm(self.table_of_truth.features_name)):
         # The last feature has nothing left to be compared with.
         if i < len(self.table_of_truth.features_name) - 1:
             feature_name_ici = str(self.table_of_truth.features_name[i])
             # Current feature is the target, all remaining columns are inputs.
             X = self.df.drop(feature_name_ici, axis=1)
             y = self.df[feature_name_ici]
             # NOTE: despite the name these are f_classif (ANOVA) results,
             # not chi-squared scores; index 1 holds the p-values.
             chi_scores = f_classif(X, y)
             p_values = pd.Series(chi_scores[1], index=X.columns)
             p_values.sort_values(ascending=False, inplace=True)
             # Write each p-value at the column of the compared feature.
             for index_index, index_v in enumerate(p_values.index):
                 index_v_new = self.table_of_truth.features_name.index(
                     index_v)
                 res[i, int(index_v_new)] = p_values.values[index_index]
             del X, y
             # Drop the processed feature so later rows only compare against
             # the features that have not been used as target yet.
             if len(self.df.columns) > 1:
                 self.df = self.df.drop(feature_name_ici, axis=1)
     # Adding the transpose fills both triangles of the matrix
     # (presumably each pair is filled only once above; verify column order).
     self.res2 = res + res.T
     df2 = pd.DataFrame(self.res2,
                        index=self.table_of_truth.features_name,
                        columns=self.table_of_truth.features_name)
     """
     vals = np.around(df2.values, 2)
     colours = plt.cm.RdBu(vals)
     fig = plt.figure(figsize=(100, 100))
     fig.add_subplot(111, frameon=True, xticks=[], yticks=[])
     plt.table(cellText=vals, rowLabels=df2.index, colLabels=df2.columns,
                           colWidths=[0.03] * vals.shape[1], loc='center',
                           cellColours=colours)
     plt.savefig(self.path_save_model + "COMPARASION INTRA FEATURES XI 2.png")
     """
     df2.to_csv(self.path_save_model +
                "COMPARASION INTRA FEATURES XI 2.csv")
Example #26
0
def feature_removal(model,X,y,n_fold=3,maxiter=10,verbose=True,seed=6):
    """Greedily drop features while the cross-validated score keeps improving.

    In every round two removal candidates are evaluated with
    ``kf_score_impf``: the feature with the lowest mean model importance and
    the feature with the lowest ANOVA F-value. The better of the two removals
    is applied if it improves the mean CV score by at least 0.0005;
    otherwise the search stops.

    Parameters:
        model: estimator handed through to ``kf_score_impf``.
        X: feature DataFrame.
        y: target values.
        n_fold: number of folds handed to ``kf_score_impf``.
        maxiter: maximum number of removal rounds.
        verbose: print a line for every removed feature.
        seed: random seed handed to ``kf_score_impf``.

    Returns:
        The columns of ``X`` that were kept.
    """
    # initial benchmark
    scoretr, scorecv, impf = kf_score_impf(model, X, y, n_fold, mask=None, seed=seed)
    cvscore_to_beat = np.mean(scorecv)
    for _ in range(maxiter):
        # mean feature importances, least important first
        importances = pd.Series(np.mean(impf, axis=0), X.columns).sort_values()
        # ANOVA F-values, least discriminative first
        f_values = pd.Series(f_classif(X, y)[0], X.columns).sort_values()
        # score the removal of the weakest feature of each ranking
        impf_candidate, pval_candidate = importances.index[0], f_values.index[0]
        scoretr_impf, scorecv_impf, impf_impf = kf_score_impf(
            model, X.drop(impf_candidate, axis=1), y, n_fold, mask=None, seed=seed)
        scoretr_pval, scorecv_pval, impf_pval = kf_score_impf(
            model, X.drop(pval_candidate, axis=1), y, n_fold, mask=None, seed=seed)
        scorecv_impf, scorecv_pval = np.mean(scorecv_impf), np.mean(scorecv_pval)
        best_cvscore = max(scorecv_impf, scorecv_pval)

        if (best_cvscore - cvscore_to_beat) < 0.0005:
            break

        use_impf = best_cvscore == scorecv_impf
        candidate = impf_candidate if use_impf else pval_candidate
        if verbose:
            print("removing '%s' | previous %.4f | new %.4f | improvement %.4f" % (
                candidate, cvscore_to_beat, best_cvscore, best_cvscore - cvscore_to_beat))
        cvscore_to_beat = best_cvscore
        # Keep the importances that belong to the reduced feature set so the
        # next round ranks the remaining columns correctly.
        impf = impf_impf if use_impf else impf_pval
        X = X.drop(candidate, axis=1)
    return X.columns
def anova_feature_selection(features,
                            targets,
                            pval_threshold=0.05,
                            debug=False):
    """Select, per target column, the features with a significant ANOVA p-value.

    For every column of ``targets`` the rows with a missing target are
    ignored, an F-test is run on the remaining rows and the feature columns
    whose p-value is below ``pval_threshold`` are kept. Features with an
    infinite or undefined F-value are never selected.

    Returns a dict mapping each target name to the selected feature columns.
    """
    chosen = {}
    for target_name in targets.columns:
        # only rows where this target is known take part in the test
        known_rows = targets[target_name].notna().values
        labels = targets.loc[known_rows, target_name]
        samples = features.loc[known_rows, :].copy()

        # ANOVA; invalidate p-values whose F-value is inf or nan
        f_values, p_values = f_classif(samples, labels)
        f_values = pd.Series(f_values).replace([np.inf, -np.inf], np.nan)
        p_values[f_values.isna()] = np.nan

        chosen[target_name] = samples.columns[p_values < pval_threshold]

        if not debug:
            continue
        print('target=', target_name)
        print('features dimension=', samples.shape)
        print('#selected features=', chosen[target_name].size)
        print('#' * 40)
    return chosen
Example #28
0
def test_randomized_logistic_sparse():
    """Randomized logistic regression scores must match on dense and sparse input."""
    iris = load_iris()
    two_class = iris.target != 2
    X = iris.data[:, [0, 2]][two_class]
    y = iris.target[two_class]

    # Sparse matrices are usually not centered, so center the dense data
    # first; the labels are left untouched.
    X, _, _, _, _ = _preprocess_data(X, y, True, True)

    X_sp = sparse.csr_matrix(X)

    F, _ = f_classif(X, y)

    def fresh_classifier():
        # identical, independently seeded estimator for each fit
        return RandomizedLogisticRegression(verbose=False,
                                            C=1.,
                                            random_state=42,
                                            scaling=0.3,
                                            n_resampling=50,
                                            tol=1e-3)

    feature_scores = fresh_classifier().fit(X, y).scores_
    feature_scores_sp = fresh_classifier().fit(X_sp, y).scores_
    assert_array_equal(feature_scores, feature_scores_sp)
Example #29
0
    def _binning(self, X, y=None):
        """Compute the windowed DFT of every instance and derive the bins.

        The per-instance results of ``self._mcb_dft`` are stacked into one
        row per window. When ``self.anova`` is set and labels are given, the
        coefficients are reduced to ``self.support`` (re-selected by F-test
        p-value whenever enough non-constant coefficients exist) and
        ``self.dft_length`` is updated to the next even number above the
        largest selected index. The bins come from ``self._igb`` for the
        "information-gain" method and from ``self._mcb`` otherwise.
        """
        windows_per_instance = math.ceil(self.series_length / self.window_size)
        per_instance = [
            self._mcb_dft(X[idx, :], windows_per_instance)
            for idx in range(self.n_instances)
        ]
        dft = np.array(per_instance).reshape(
            len(X) * windows_per_instance, self.dft_length)

        has_labels = y is not None
        if has_labels:
            # one label per window
            y = np.repeat(y, windows_per_instance)

        if self.anova and has_labels:
            variances = dft.var(axis=0)
            non_constant = np.where(~np.isclose(variances, 0))[0]

            # pick word_length coefficients with the smallest p-values
            if non_constant.size >= self.word_length:
                _, p = f_classif(dft[:, non_constant], y)
                self.support = non_constant[np.argsort(p)][:self.word_length]

            # keep the selected coefficients in ascending index order
            self.support = np.sort(self.support)
            dft = dft[:, self.support]

            needed_length = np.max(self.support) + 1
            self.dft_length = needed_length + needed_length % 2  # even

        if self.binning_method != "information-gain":
            return self._mcb(dft)
        return self._igb(dft, y)
Example #30
0
def find_pval(feat, tar):
    """Return the ANOVA p-value of every feature as a Series named ``p_val``."""
    f_stats, p_vals = f_classif(feat, ravel(tar))
    feat_anova_df = DataFrame({'f_stat': f_stats, 'p_val': p_vals})
    return feat_anova_df['p_val']
def filter_methods_classification(X, y, feat_names, rotation=False):
    """Plot normalised F-test and mutual-information scores side by side.

    Both score vectors are divided by their own maximum before plotting.
    With ``rotation`` set the feature names on the x axis are turned by 90
    degrees.
    """
    angle = 90 if rotation else 0

    # do calculations
    f_test, _ = f_classif(X, y)
    f_test /= np.max(f_test)

    mi = mutual_info_classif(X, y)
    mi /= np.max(mi)

    # one bar chart per scoring method
    positions = range(X.shape[1])
    panels = ((f_test, '$F-test$ score'), (mi, 'Mutual information score'))

    plt.figure(figsize=(20, 4))
    for panel_no, (scores, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, panel_no)
        plt.bar(positions, scores, align="center")
        plt.xticks(positions, feat_names, rotation=angle)
        plt.xlabel('features')
        plt.ylabel('Ranking')
        plt.title(title)

    plt.show()
 def fScore(self):
     """Univariate feature scoring with the one-way ANOVA F-test.

     Runs ``f_classif`` on ``self.X`` against ``self.y``. Undefined results
     are replaced by neutral values: a nan/inf F-value becomes 0 and a
     nan/inf p-value becomes 1.
     See: https://en.wikipedia.org/wiki/F-test#One-way_ANOVA_example

     Returns
     -------
     f_score : list of floats
         The F-value of every feature; higher means the feature is more
         likely associated with the target.
     fpval_score : list of floats
         The associated p-values from the F-distribution.
     """
     # NOTE(review): ``time`` has to be a name that provides ``now()`` here
     # (the standard ``time`` module does not) -- confirm against the imports.
     print("Compute F-test stats..." + str(time.now()))

     f_values, p_values = f_classif(self.X, self.y)

     f_score = [0 if np.isnan(s) or np.isinf(s) else s for s in f_values]
     fpval_score = [1 if np.isnan(s) or np.isinf(s) else s for s in p_values]

     return f_score, fpval_score
Example #33
0
    def fit(self, x, y):
        """Fit a pipeline with the given data x and labels y

        Args:
            x (array-like tensor): input data, shape (n_samples, I_1, I_2, ..., I_N)
            y (array-like): data labels, shape (n_samples, )

        Returns:
            self
        """
        # fit mpca
        self.mpca.fit(x)
        self.mpca.set_params(**{"return_vector": True})
        x_transformed = self.mpca.transform(x)

        # feature selection
        if self.n_features is None:
            self.n_features = x_transformed.shape[1]
            self.feature_order = self.mpca.idx_order
        else:
            f_score, _ = f_classif(x_transformed, y)
            # negate so that argsort puts the highest F-value first
            self.feature_order = (-1 * f_score).argsort()
        x_transformed = x_transformed[:, self.feature_order][:, : self.n_features]

        # fit classifier
        if self.auto_classifier_param:
            # Add the data-dependent C value only once; appending on every
            # call would keep growing the grid with duplicates.
            c_candidate = 1 / x.shape[0]
            c_grid = self.grid_search.param_grid["C"]
            if c_candidate not in c_grid:
                c_grid.append(c_candidate)
            self.grid_search.fit(x_transformed, y)
            self.clf = self.grid_search.best_estimator_
        if self.classifier == "svc":
            self.clf.set_params(**{"probability": True})

        self.clf.fit(x_transformed, y)
        return self
def test_randomized_logistic():
    """Randomized logistic regression must rank features like the F-test."""
    iris = load_iris()
    two_class = iris.target != 2
    X = iris.data[:, [0, 2]][two_class]
    y = iris.target[two_class]

    F, _ = f_classif(X, y)

    shared = dict(verbose=False, random_state=42, scaling=0.3,
                  n_resampling=50, tol=1e-3)

    # single regularisation strength
    clf = RandomizedLogisticRegression(C=1., **shared)
    X_orig = X.copy()
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(X, X_orig)   # fit does not modify X
    assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    # list of regularisation strengths
    clf = RandomizedLogisticRegression(C=[1., 0.5], **shared)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    # a nested list is not a valid C
    clf = RandomizedLogisticRegression(verbose=False, C=[[1., 0.5]])
    assert_raises(ValueError, clf.fit, X, y)
Example #35
0
def test_randomized_logistic():
    """Randomized sparse logistic regression ranks features like the F-test."""
    iris = load_iris()
    keep = iris.target != 2
    X = iris.data[:, [0, 2]][keep]
    y = iris.target[keep]

    F, _ = f_classif(X, y)

    # once with a scalar C, once with a list of C values
    for c_value in (1., [1., 0.5]):
        clf = RandomizedLogisticRegression(verbose=False,
                                           C=c_value,
                                           random_state=42,
                                           scaling=0.3,
                                           n_resampling=50,
                                           tol=1e-3)
        feature_scores = clf.fit(X, y).scores_
        assert_array_equal(np.argsort(F), np.argsort(feature_scores))
def test_f_classif_multi_class():
    """
    The F test must give meaningful results on a simple simulated
    multi-class classification problem.
    """
    problem = dict(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X, Y = make_classification(**problem)

    F, pv = f_classif(X, Y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()

    # without shuffling the first five features carry the signal
    signal, noise = pv[:5], pv[5:]
    assert (signal < 0.05).all()
    assert (noise > 1.0e-5).all()
Example #37
0
def feature_importance_anova(X,
                             y,
                             threshold=0.001,
                             correcting_multiple_hypotesis=True,
                             method='fdr_bh',
                             alpha=0.1,
                             sort_by='pval'):
    '''
    Provide the significance of the features in a dataset with ANOVA,
    optionally corrected for multiple hypothesis testing.
    :X: list of dicts with feature names as keys and feature values as values
    :y: labels
    :threshold: variance threshold used to eliminate low-variance features
    :correcting_multiple_hypotesis: correct the p-values for multiple hypothesis testing
    :method: method of the multiple hypothesis correction
    :alpha: alpha of the multiple hypothesis correction
    :sort_by: column the output is sorted by, 'pval' or 'F'
    :return: DataFrame with one row per feature holding its per-label mean values, F and pval
    '''
    frame = variance_threshold_on_df(pd.DataFrame.from_records(X),
                                     threshold=threshold)

    F, pvals = f_classif(frame.values, y)

    if correcting_multiple_hypotesis:
        # second element of the result holds the corrected p-values
        pvals = multipletests(pvals, alpha=alpha, method=method)[1]

    # per-label mean of every feature, features as rows
    frame['labels'] = y
    per_label_mean = frame.groupby('labels').mean().T

    per_label_mean['F'] = F
    per_label_mean['pval'] = pvals

    return per_label_mean.sort_values(sort_by, ascending=True)
def feature_extraction(x,y):
    """Rank the features of ``x`` with several scoring methods.

    Scores come from the ANOVA F-test, L1 logistic regression, Lasso, an L1
    linear SVC and an extra-trees ensemble. The linear models are fitted on
    a standardised copy of ``x``.

    Returns a dict mapping each method name to the indices of the 40
    features with the lowest score for that method.
    """
    scores = {}
    # NOTE(review): the key says "p-value" but the first element returned by
    # f_classif is the F-statistic -- confirm which one callers expect.
    scores['p-value'], _ = f_classif(x, y)

    # using Logistic Regression to evaluate features; the L1 penalty needs a
    # solver that supports it, so request liblinear explicitly.
    scaleX = scale(x, copy=True)
    clf = LogisticRegression(penalty='l1', solver='liblinear').fit(scaleX, y)
    scores['LogReg'] = clf.coef_[0]

    # using Lasso to evaluate features
    clf = Lasso(0.005).fit(scaleX, y)
    scores['Lasso'] = clf.coef_

    # using LinearSVC
    clf = LinearSVC(penalty='l1', dual=False).fit(scaleX, y)
    scores['svc'] = clf.coef_[0]

    # using ensemble tree
    clf = ExtraTreesClassifier().fit(x, y)
    scores['tree'] = clf.feature_importances_

    feature_list = {}
    for tittle, score in scores.items():
        feature_list[tittle] = score.argsort()[0:40]

    return feature_list
Example #39
0
 def compute_scoring_func(self, func):
     """Compute per-feature scores with the scoring function named ``func``.

     ``'variance'`` is computed on all instances and returns
     ``(variances, None)``. ``'f_classif'`` and ``'chi2'`` are computed on
     the annotated instances and return whatever the scoring function
     returns. ``'mutual_info_classif'`` returns ``(scores, None)`` and
     treats the binary features as discrete.
     """
     if func == 'variance':
         features = self.instances.features.get_values()
         annotations = self.instances.annotations.get_labels()
         return features.var(axis=0), None

     annotated = self.annotated_instances
     features = annotated.features.get_values()
     annotations = annotated.annotations.get_labels()

     if func == 'chi2':
         return chi2(features, annotations)
     elif func == 'f_classif':
         return f_classif(features, annotations)
     elif func == 'mutual_info_classif':
         # positions of the binary features; False when there are none
         binary_positions = [
             position
             for position, kind in enumerate(self.instances.features.info.types)
             if kind == FeatureType.binary
         ]
         discrete_indexes = binary_positions or False
         mutual_info = mutual_info_classif(features,
                                           annotations,
                                           discrete_features=discrete_indexes)
         return (mutual_info, None)
     else:
         assert (False)
def test_randomized_logistic_sparse():
    """Dense and sparse inputs must give identical randomized logistic scores."""
    iris = load_iris()
    keep = iris.target != 2
    X = iris.data[:, [0, 2]][keep]
    y = iris.target[keep]

    # Center the dense data first because sparse matrices are usually not
    # centered; the labels must stay as they are.
    X, _, _, _, _ = _preprocess_data(X, y, True, True)

    X_sp = sparse.csr_matrix(X)

    F, _ = f_classif(X, y)

    settings = dict(verbose=False, C=1., random_state=42, scaling=0.3,
                    n_resampling=50, tol=1e-3)

    dense_clf = RandomizedLogisticRegression(**settings)
    feature_scores = dense_clf.fit(X, y).scores_
    sparse_clf = RandomizedLogisticRegression(**settings)
    feature_scores_sp = sparse_clf.fit(X_sp, y).scores_
    assert_array_equal(feature_scores, feature_scores_sp)
Example #41
0
def thresholds():
  """Print a table of metric thresholds for each data set.

  For every project a logistic regression is fitted on the training data
  and every metric whose ``VARL`` value at ``p0=0.05`` is positive and whose
  ANOVA p-value is below 0.05 is listed with its ``VARL`` threshold at
  ``p0=0.1``.
  """
  for name in ['ant', 'ivy', 'jedit', 'lucene', 'poi']:
    print("##", name)
    train, test = explore(dir='../Data/Jureczko/', name=name)
    data_DF = csv2DF(train, toBin=True)
    feature_cols = data_DF.columns[:-1]
    # drop the first character of every column name
    metrics = [col[1:] for col in data_DF[feature_cols]]
    ubr = LogisticRegression()
    X = data_DF[feature_cols].values
    y = data_DF[data_DF.columns[-1]].values
    ubr.fit(X, y)
    inter, coef, pVal = ubr.intercept_[0], ubr.coef_[0], f_classif(X, y)[1]

    table = texttable.Texttable()
    table.set_cols_align(["l", "l", "l"])
    table.set_cols_valign(["m", "m", "m"])
    table.set_cols_dtype(['t', 't', 't'])
    table_rows = [["Metric", "Threshold", "P-Value"]]

    for metric, c, p in zip(metrics, coef, pVal):
      if VARL(c, inter, p0=0.05) > 0 and p < 0.05:
        thresh = "%0.2f" % VARL(c, inter, p0=0.1)
        table_rows.append([metric, thresh, "%0.3f" % p])

    table.add_rows(table_rows)
    print(table.draw())

  return None
Example #42
0
def feature_selection(data, labels, n_feat):
    """Keep the ``n_feat`` features with the highest ANOVA F-value.

    Returns a tuple ``(f, X_new, features)``: the F-value of every input
    feature, the reduced data and the indices of the selected features.
    """
    k_best = SelectKBest(f_classif, k=n_feat)
    X_new = k_best.fit_transform(data, labels)
    # The selector already ran the F-test while fitting; reuse its scores
    # instead of computing them a second time.
    f = k_best.scores_
    features = k_best.get_support(indices=True)
    return f, X_new, features
def infoGain(X,y):
    """Return the information gain of every feature and the ANOVA p-values.

    The gains are computed column by column with ``featureInfoGain``; the
    p-values come from ``f_classif`` on the whole matrix.
    """
    X = np.array(X)
    gains = [featureInfoGain(X, y, column) for column in range(X.shape[1])]
    pValues = f_classif(X, y)[1]
    return gains, pValues
def feature_selection(X, y):
    """Return the ANOVA p-value of every feature and print their ranking.

    Undefined p-values (nan) are replaced by 1. The printed indices are
    ordered from the most significant feature (smallest p-value) to the
    least significant one.
    """
    var_imp = f_classif(X, y)[1]
    var_imp[np.isnan(var_imp)] = 1
    # A small p-value means an important feature, so sort ascending.
    imp_feature_idx = var_imp.argsort()

    print('Important feature indices: %s'%(imp_feature_idx))

    return var_imp
Example #45
0
 def feature_selection(self,mode='F'):
     """Keep the best 80 percent of the training features.

     ``mode`` picks the scoring function (case-insensitive): ``'M'`` mutual
     information, ``'F'`` ANOVA F-test, ``'C'`` chi-squared. ``self.train``
     and ``self.test`` are replaced by frames holding only the selected
     columns and the selected names are stored in ``self.fs_features``.

     Returns ``(X_new, feas)``: the reduced training matrix and a DataFrame
     with the selected names in its ``'feature'`` column.

     Raises ValueError for an unknown ``mode``.
     """
     score_funcs = {'M': mutual_info_classif, 'F': f_classif, 'C': chi2}
     mode_key = mode.upper()
     if mode_key not in score_funcs:
         raise ValueError("mode must be one of 'M', 'F' or 'C', got %r" % (mode,))

     print('Feature Selection...')
     print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

     X = self.train.copy()
     y = self.train_label['label'].values.copy()
     test = self.test.copy()

     selector = SelectPercentile(score_funcs[mode_key], percentile=80)
     X_new = selector.fit_transform(X, y)

     # Pick the surviving column names by position; this stays correct even
     # when two columns share a name.
     selected = selector.get_support()
     fs_features = np.array(
         [name for name, keep in zip(self.train.columns, selected) if keep])

     self.train = pd.DataFrame(X_new, columns=fs_features.tolist())
     self.test = test[fs_features]

     self.fs_features = fs_features

     feas = pd.DataFrame()
     feas['feature'] = fs_features

     print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

     return X_new, feas
Example #46
0
def feature_selection(holeID):
    """Return the features of one hole whose ANOVA p-value is exactly zero.

    Reads ``dats/<holeID>_cleandata.csv``, uses "LABELS is present" as the
    binary target, normalises every remaining column, drops columns that are
    entirely missing, fills the other gaps with the column mean and runs the
    ANOVA F-test. Every feature with a p-value of 0 is printed and returned,
    in the column order of the file.
    """
    import pandas as pd
    import numpy as np
    from sklearn.feature_selection import f_classif

    cleaned = pd.read_csv('dats/%s_cleandata.csv'%holeID)
    cleaned.set_index('DEPTH', inplace=True)

    # target: True where a label is present
    target = np.logical_not(cleaned.LABELS.isnull())

    cols = cleaned.columns.tolist()
    cols.remove('LABELS')
    cols.remove('LABELS_ROCK_TYPE')

    cleaned = cleaned[cols]

    # normalise column by column
    cleaned = (cleaned - cleaned.mean()) / (cleaned.max() - cleaned.min())

    # drop the columns that contain nothing but NaN, keeping the file order
    cols = [col for col in cols if not cleaned[col].isnull().all()]
    X, y = cleaned[cols], target

    # mean imputation of the remaining gaps
    X = X.fillna(X.mean()).values

    # chi2 cannot be used here because X has negative values
    _, pval = f_classif(X, y)

    useful_feat = []
    for feat, p in zip(cols, pval):
        if p == 0:
            print('%s %s' % (feat, p))
            useful_feat.append(feat)

    return useful_feat
Example #47
0
def compute_f_value(x_data, y_data):
    """Run the ANOVA F-test on the data and print the F-values and p-values."""
    anova = f_classif(x_data, y_data)
    print(anova[0], anova[1])

# train_data, validation_data, test_data, basic_users_info = get_data()
# label_encoder = {}
# train_x, train_y = get_exclude_ndf_x(train_data, basic_users_info, label_encoder)
# remove_features_with_low_variance(train_x)
# compute_f_value(train_x, train_y)
    def test_f_classif(self):
        """The ModelFrame accessor must return the same result as sklearn."""
        diabetes = datasets.load_diabetes()
        frame = pdml.ModelFrame(diabetes)

        actual = frame.feature_selection.f_classif()
        reference = fs.f_classif(diabetes.data, diabetes.target)

        self.assertEqual(len(actual), 2)
        # compare the F-values first, then the p-values
        for position in (0, 1):
            tm.assert_numpy_array_equal(actual[position], reference[position])
Example #49
0
def evalANOVA(individual,targets,toolbox):
    """Fitness of an individual: one minus the ANOVA p-value of its output.

    The individual is compiled with ``toolbox.compile``. The fitness is the
    one-element tuple ``(0.0,)`` when the compiled values contain a
    non-finite entry or when the score is undefined (nan).
    """
    # Transform the tree expression into its compiled values
    values = toolbox.compile(expr=individual)
    if not all(np.isfinite(values)):
        return 0.0,

    # scikit-learn needs a column vector, i.e. [[0], [1.34], ...]
    p_value = f_classif(values.reshape(-1, 1), targets)[1][0]
    # use 1 - p so that greater values are better
    score = 1 - p_value

    return (0.0 if np.isnan(score) else score),
def myselect_significant_features(entity, features, pred_varname, random_varname):
    """Return the features whose ANOVA p-value against the target is <= 0.05.

    Parameters:
        entity: DataFrame holding the feature columns and the target column.
        features: names of the candidate feature columns.
        pred_varname: name of the target column.
        random_varname: unused; kept for interface compatibility.

    The p-values of all candidates are printed in ascending order. The
    selected names are returned in that same order.
    """
    from sklearn import feature_selection
    import pandas as pd
    import pprint

    feature_frame = entity[features]
    target = entity[pred_varname]

    # Only older pandas releases provide SparseDataFrame; look it up
    # defensively so the function also runs where the class no longer exists.
    sparse_frame_cls = getattr(pd, 'SparseDataFrame', None)
    if sparse_frame_cls is not None and isinstance(entity, sparse_frame_cls):
        feature_frame = feature_frame.to_dense()
        target = target.to_dense()

    _, feat_p_vals = feature_selection.f_classif(feature_frame.values,
                                                 target.values)

    features_series = pd.Series(data=feat_p_vals, index=features).sort_values()
    print("\t\tfeature p-values:")
    pprint.PrettyPrinter(indent=4).pprint(features_series)

    return list(features_series[features_series <= 0.05].index)
def anova_filter(_df, features):
    """Return the set of features whose ANOVA p-value is below ``alpha``.

    The F-test is run on ``_df[features]`` against ``_df.Vote``. ``alpha`` is
    read from the enclosing module. Every selected feature is printed
    together with its p-value.
    """
    p_values = f_classif(_df[features].values, _df.Vote.values)[1]

    ret_val = set()
    for c, p in zip(features, p_values):
        if p < alpha:
            print(c + " selected by anova with p-value: " + str(p))
            ret_val.add(c)
    return ret_val
def recursiveRanking(X, y):
    """Rank features by repeatedly fitting a decision tree.

    In every round an entropy-based decision tree is fitted, its most
    important feature is recorded with the round number as rank and that
    column is blanked out for the following rounds. A feature ranked in
    round ``r`` gets the score ``1 / (r + 1)``; features that were never
    picked get 0.

    Returns ``(scores, pValues)`` where ``pValues`` are the ANOVA p-values
    of the original, unmodified data.
    """
    X = np.array(X)
    featuresNum = X.shape[1]
    clf = DecisionTreeClassifier(criterion='entropy')

    # Work on a copy: columns are zeroed below, while the F-test at the end
    # has to see the untouched data.
    tmp = X.copy()
    rank_of = {}
    for i in range(featuresNum):
        clf.fit(tmp, y)
        importances = clf.tree_.compute_feature_importances()
        best = np.argmax(importances)
        if best not in rank_of:
            rank_of[best] = i
            # hide the chosen feature from the next rounds
            tmp[:, best] = 0

    new = np.zeros(featuresNum)
    for feature, rank in rank_of.items():
        new[feature] = 1.0 / (rank + 1)

    fValues, pValues = f_classif(X, y)
    return new, pValues
    def levene_f_test(self, data):
        """Levene-style test: ANOVA on absolute deviations from the class mean.

        Every feature value is replaced by its absolute distance to the mean
        of that feature within the sample's class (column ``'cat'``), then
        ``f_classif`` is run on these deviations against the classes.

        ``data`` is expected to hold the feature columns followed by the
        ``'cat'`` column as its last column. It is not modified.

        Returns the ``(F, p-value)`` pair produced by ``f_classif``.
        """
        feature_columns = list(data.columns[:-1])

        # Per-class mean of every feature, aligned row by row with ``data``.
        class_means = data.groupby('cat')[feature_columns].transform('mean')

        # Absolute deviation of each sample from the mean of its own class.
        deviations = (data[feature_columns] - class_means).abs()

        return f_classif(deviations, np.ravel(data['cat']))
Example #54
0
 def top_error_terms(self, truths, preds, X, data):
     """Print, per label, the features most associated with wrong predictions.

     For every class of ``self.clf`` the samples wrongly predicted as that
     class are marked, features are scored with ``f_classif`` against that
     marker (weighted by the mask from ``self.get_pos_mask``), and the five
     top features are printed with up to three matching error examples.
     """
     print('\n\nERROR ANALYSIS:\n')
     for label in self.clf.classes_:
         print('\nincorrectly labeled %s' % label)

         # 1 for every sample wrongly predicted as ``label``
         iserror = np.zeros(len(truths))
         wrong_rows = [row for row, (truth, pred) in enumerate(zip(truths, preds))
                       if truth != pred and pred == label]
         iserror[wrong_rows] = 1

         corrs, _ = f_classif(X, iserror)
         pos_mask, pos_counts, neg_counts = self.get_pos_mask(X, iserror)
         corrs *= pos_mask

         best_features = np.argsort(corrs)[::-1][:5]
         for fidx in best_features:
             print('\n\t%s (%d incorrect, %d correct)' %
                   (self.vectorizer.features[fidx], pos_counts[fidx], neg_counts[fidx]))
             # error samples in which this feature is present
             matches = [midx for midx in range(X.shape[0])
                        if X[midx, fidx] > 0 and iserror[midx] == 1]
             for m in matches[:3]:
                 print('\t\t' + str(self.vectorizer.extract_features(data[m])))
Example #55
0
def f_score(X, y):
    """
    ANOVA F-value feature scoring for classification, delegated to
    scikit-learn's ``f_classif``; only the F statistics are returned and
    the p-values are discarded.

    For each feature:
    f_score = sum((ni/(c-1))*(mean_i - mean)^2)/((1/(n - c))*sum((ni-1)*std_i^2))

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y : {numpy array},shape (n_samples,)
        input class labels

    Output
    ------
    F: {numpy array}, shape (n_features,)
        f-score for each feature
    """

    f_values, _p_values = f_classif(X, y)
    return f_values
Example #56
0
def test_randomized_logistic():
    """Randomized logistic regression must rank features like the F-test.

    Uses two iris features and the first two classes, then checks that the
    feature ordering by ``scores_`` equals the ordering by ANOVA F-value,
    both for a scalar ``C`` and for a list of ``C`` values.
    """
    iris = load_iris()
    # Binary sub-problem: drop every sample of class 2.
    two_class = iris.target != 2
    X = iris.data[:, [0, 2]][two_class]
    y = iris.target[two_class]

    F, _ = f_classif(X, y)
    expected_order = np.argsort(F)

    scaling = 0.3
    for C in (1., [1., 0.5]):
        clf = RandomizedLogisticRegression(
            verbose=False,
            C=C,
            random_state=42,
            scaling=scaling,
            n_resampling=50,
            tol=1e-3,
        )
        feature_scores = clf.fit(X, y).scores_
        assert_equal(expected_order, np.argsort(feature_scores))
def select_sized_features(feature_size,fearure_vecotr,feature_indecies,y,feature_selection_measure):
    """Return the ids of the ``feature_size`` best-scoring features.

    Parameters (names, including their spelling, are kept for callers):
        feature_size: maximum number of feature ids to return.
        fearure_vecotr: feature matrix handed to the scoring function.
        feature_indecies: id of each column; ``feature_indecies[i]`` labels
            the i-th score. If an id occurs more than once, the last score
            recorded for it is used.
        y: target labels.
        feature_selection_measure: ``SelectionMeasure.chi_2`` selects
            ``chi2``, ``SelectionMeasure.f`` selects ``f_classif``; any other
            value falls back to ``mutual_info_classif``.

    Returns:
        List of feature ids ordered from highest to lowest score, holding at
        most ``feature_size`` entries (fewer if there are fewer features).
    """
    import math

    if feature_selection_measure == SelectionMeasure.chi_2:
        feature_values, _ = chi2(fearure_vecotr, y)
    elif feature_selection_measure == SelectionMeasure.f:
        feature_values, _ = f_classif(fearure_vecotr, y)
    else:
        # Any other measure (e.g. SelectionMeasure.mutual_info).
        feature_values = mutual_info_classif(fearure_vecotr, y)

    feature_value_id_map = {}
    for i, value in enumerate(feature_values):
        feature_value_id_map[feature_indecies[i]] = value

    def _rank_key(item):
        # Scores can be NaN (e.g. for constant features); NaN compares false
        # against everything and would leave the sort order arbitrary, so
        # rank such features last.
        score = item[1]
        return float('-inf') if math.isnan(score) else score

    sorted_features = sorted(feature_value_id_map.items(), key=_rank_key, reverse=True)

    # A non-positive feature_size selects nothing, as the original loop did.
    return [feature_id for feature_id, _ in sorted_features[:max(feature_size, 0)]]
def get_reliable_features(X, y):
	"""
	Purpose: rank the model features by how strongly they relate to the labels,
		from highest ANOVA F-score to lowest, for classification
	Inputs:	X: dataframe consisting of values for the different features
			y: labels for each feature vector, sharing X's index; the two
				classes must be coded 0 and 1 for the cohen's d computation
				(presumably a pandas Series -- verify against callers)
	Output: a dataframe indexed by feature name, sorted from highest f-value
		to lowest, with columns
			f_score:  ANOVA F-value from f_classif
			pval:     boolean, True when the p-value is below the
					  Bonferroni-corrected threshold 0.05 / n_features
			cohens_d: absolute mean difference between class 0 and class 1
					  divided by the pooled standard deviation
	"""
	f, pval = f_classif(X, y)										#Do f_classif
	feat_pval = pd.Series(data = pval < (0.05 / len(X.columns)), index = X.columns) #significance with Bonferroni correction
	feat_f = pd.Series(data = f, index = X.columns)							#f values

	#Split the samples by class once instead of once per feature
	group0 = X.loc[y[y == 0].index]
	group1 = X.loc[y[y == 1].index]
	n0, n1 = len(group0), len(group1)

	#Pooled variance weights each class variance by its degrees of freedom;
	#simply adding the two variances would understate d (by sqrt(2) for equal group sizes)
	pooled_var = ((n0 - 1) * group0.var() + (n1 - 1) * group1.var()) / (n0 + n1 - 2)
	mean_diff = group0.mean() - group1.mean()
	cohen_d = (mean_diff / np.sqrt(pooled_var)).abs()

	df = pd.DataFrame()
	df['f_score'] = feat_f
	df['pval'] = feat_pval
	df['cohens_d'] = cohen_d
	df.sort_values('f_score', ascending=False, inplace=True)						#Sort by the f values
	return df
Example #59
0
def select_bestwords(D, y, nmax = 100, is_classif=True):
    """ Select the nmax words in D that are best correlated with the goal y.

        D: list of dicts (one per sample), vectorized with DictVectorizer.
        y: target values, one per sample.
        nmax: maximum number of words to return.
        is_classif: score with f_classif when True, f_regression otherwise.

        Returns a set of feature names. Only words whose p-value is below
        0.05 are eligible; they are ranked by F-value, highest first. An
        empty set is returned when D cannot be vectorized.
    """
    y = np.asarray(y)
    v = DictVectorizer(sparse=True)
    try:
        X = v.fit_transform(D)
    except ValueError:
        logger.warning("===Except*** in select_bestwords D:%d y:%d",len(D),len(y))
        return set()

    score_func = f_classif if is_classif else f_regression
    f_values, p_values = score_func(X, y)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older installations.
    if hasattr(v, "get_feature_names_out"):
        names = v.get_feature_names_out()
    else:
        names = v.get_feature_names()

    # (F-value, p-value, word); NaN p-values fail the comparison and are dropped
    a = sorted(
        (e for e in zip(f_values, p_values, names) if e[1] < 0.05),
        reverse=True,
    )
    logger.debug("select_bestwords:%s",a[:16])
    return {e[2] for e in a[:nmax]}