def select_by_linearmodel(self, models=None):
        # Embedded
        '''
        Embedded feature selection with linear models.
        :param models: list of linear estimators; defaults to LinearRegression, Ridge and Lasso
        :return:
        '''
        if self.numNull != 0:
            print('Features contain NaN!!!')
        elif self.numInf != 0:
            print('Features contain Inf!!!')
        else:
            if not models:
                models = [LinearRegression(),
                          Ridge(),
                          Lasso()]

            # Fit SelectFromModel once per model and select features
            for model in models:
                model_name = str(model).split('(')[0]
                selector = SelectFromModel(model, max_features=self.K, threshold=-np.inf)
                selector.fit_transform(X=self.train_X, y=self.train_y)
                mask = selector.get_support(True)
                feature_names = np.array(self.continuous_feature_names)[mask]
                print("{} selected feature:{}".format(model_name, feature_names))

            if self.showFig:
                for clf in models:
                    model_name = str(clf).split('(')[0]
                    model = clf.fit(self.train_X, self.train_y)
                    self.dict_features_score(model.coef_)
                    # print(sorted(dict(zip(self.continuous_feature_names, model.coef_)).items(), key=lambda x: x[1], reverse=True))
                    sns.barplot(x=self.continuous_feature_names, y=abs(model.coef_))
                    plt.title('{} coef of features'.format(model_name))
                    plt.show()
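# A minimal, self-contained sketch of the embedded-selection pattern used above:
# with threshold=-np.inf, SelectFromModel keeps exactly the max_features features
# with the largest |coef_|. Data and names below are illustrative only.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

X_demo, y_demo = make_regression(n_samples=200, n_features=8, n_informative=3, random_state=0)
selector_demo = SelectFromModel(Lasso(alpha=1.0), max_features=3, threshold=-np.inf)
selector_demo.fit(X_demo, y_demo)
print(selector_demo.get_support(indices=True))  # indices of the 3 largest-|coef_| features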
def test_threshold_and_max_features():
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est,
                                   max_features=3,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est,
                                   max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])
    selected_indices = transformer3.transform(
        np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]])
Example #3
def SelectFromModel_selector(estimator, threshold, X_data, Y_data):
    columns = X_data.columns
    selector = SelectFromModel(estimator, threshold=threshold)
    X_new = selector.fit_transform(X_data, Y_data)  # fit once; reuse the reduced matrix below
    labels = [columns[x] for x in selector.get_support(indices=True)]
    feature = pd.DataFrame(X_new, columns=labels)
    return feature
Example #4
def FeatureSelect(df, YcolName, featSelectMethod="SFM", printNumCoeff=False):

    X = df.iloc[:, EXPRESSION_START:]  #get all expressions
    Y = df[YcolName].to_numpy()  #the output we want to predict
    Yclass = label_binarize(Y, classes=list(set(Y)))  # get the binarized labels (classes must be passed by keyword)
    best_model, coeff_used = grid_search(X, Yclass)  #get best model and alpha
    if (printNumCoeff):
        print("Number of coefficients used ", coeff_used)

    best_model.fit(X, Yclass)  #fit best model
    predictor = OneVsRestClassifier(SVC(C=1, kernel='linear',
                                        probability=True))
    Xred = None
    if featSelectMethod == "SFM":
        selector = SFM(best_model, prefit=True)
        Xred = selector.transform(X)

    elif featSelectMethod == "PCA":
        selector = PCA(n_components=PCA_TSNE_COMP, random_state=RANDOM_STATE)
        Xred = selector.fit_transform(X)

    elif featSelectMethod == "tSNE":
        selector = TSNE(n_components=PCA_TSNE_COMP, random_state=RANDOM_STATE)
        Xred = selector.fit_transform(X)

    return Xred, Yclass, predictor
def select_from_model(df):
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    features = X.columns
    clf = RandomForestClassifier(random_state=9)
    model = SelectFromModel(clf)
    model.fit_transform(X, y)
    return features[model.get_support()].tolist()
def test_max_features_callable_data(max_features):
    """Tests that the callable passed to `fit` is called on X."""
    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    m = Mock(side_effect=max_features)
    transformer = SelectFromModel(estimator=clf,
                                  max_features=m,
                                  threshold=-np.inf)
    transformer.fit_transform(data, y)
    m.assert_called_with(data)
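# Hedged sketch (not part of the test above): in recent scikit-learn versions max_features
# may also be a callable that receives X and returns how many features to keep;
# threshold=-np.inf disables the importance cutoff so the count alone decides.
# Toy data and names are illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X_demo, y_demo = make_classification(n_samples=100, n_features=10, random_state=0)
sel = SelectFromModel(RandomForestClassifier(n_estimators=10, random_state=0),
                      max_features=lambda X: X.shape[1] // 2,  # keep half of the columns
                      threshold=-np.inf)
print(sel.fit_transform(X_demo, y_demo).shape)  # expected (100, 5)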
Example #7
def selectDecisionTree(x_train_ds, y_train_ds, x_test_ds, y_test_ds,
                       max_features):
    # Fit the selector on the training data only, then apply the same feature mask to
    # the test data so both sets keep identical columns.
    selector = SelectFromModel(ExtraTreesClassifier(n_estimators=100),
                               max_features=max_features)
    x_train = selector.fit_transform(x_train_ds, y_train_ds)
    x_test = selector.transform(x_test_ds)
    return x_train, x_test
def selectRandomForests(x_train_ds, y_train_ds, x_test_ds, y_test_ds,
                        max_features):
    # Same pattern: fit on the training data, reuse the fitted selector for the test data.
    selector = SelectFromModel(RandomForestClassifier(n_estimators=100),
                               max_features=max_features)
    x_train = selector.fit_transform(x_train_ds, y_train_ds)
    x_test = selector.transform(x_test_ds)
    return x_train, x_test
def select_from_model(data):
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    m = SelectFromModel(estimator=RandomForestClassifier())
    m.fit(X, y)
    return X.columns[m.get_support()].tolist()
    def select_features_from_model(self, x, y):

        selector = SelectFromModel(estimator=LogisticRegression().fit(x, y), threshold=self.threshold,
                                   prefit=self.prefit, norm_order=self.norm_order, max_features=self.max_features)
        selector.fit_transform(x, y)
        features = selector.get_support(indices=True)
        self.best_features = [column for column in x.columns[features]]
        x_select = self.select_features_in_test_set(x)

        return x_select
Example #11
def select_from_model(data):
    X = data.drop('SalePrice', axis=1)
    y = data['SalePrice']
    rf_model = RandomForestClassifier()

    select_fm = SelectFromModel(rf_model)
    select_fm.fit_transform(X, y)

    #print (list(X.columns[select_fm.get_support()]))

    return list(X.columns[select_fm.get_support()])
def select_from_model(data):
    X = data.drop('SalePrice',axis=1)
    y = data['SalePrice']
    
    model = RandomForestClassifier()
    
    sfm = SelectFromModel(model)
    sfm.fit_transform(X,y)
    
    feature_name = list(X.columns[sfm.get_support()])
    
    return feature_name
def random_forest(data_set, y_values, want_graph, random_state, max_depth):
    model = RandomForestRegressor(random_state=random_state,
                                  max_depth=max_depth)
    # one-hot encode categorical columns here
    data_set = pd.get_dummies(data_set)
    model.fit(data_set, y_values)

    indices = []
    if want_graph:
        features = data_set.columns
        importances = model.feature_importances_
        indices = np.argsort(importances)
        plt.title('Feature Importances')
        plt.barh(range(len(indices)),
                 importances[indices],
                 color='b',
                 align='center')
        plt.yticks(range(len(indices)), [features[i] for i in indices])
        plt.xlabel('Relative Importance')
        plt.show()

    feature = SelectFromModel(model)
    fit = feature.fit_transform(data_set, y_values)

    return fit
Example #14
def featureSelectionFromModel(patient, mrna, trainingData, type):
	cols = [col for col in mrna.columns]
	x = trainingData[cols].copy()
	x.drop('track_name', axis=1, inplace=True)
	if (type == 'cancerStage' or type == 'grade' or type == 'survivalDays'):
		x = pd.concat([x[:], trainingData['diagonosisAge'], trainingData['mutation']], axis=1)
	if (type == 'vitalStatus'):
		x = pd.concat([x[:], trainingData['diagonosisAge'], trainingData['mutation'], trainingData['survivalDays']], axis=1)
	
	y = trainingData[type]
	
	#feature selection from model
	if (type == 'survivalDays'):
		clf = DecisionTreeRegressor()
	else:
		clf = DecisionTreeClassifier()
	trans = SelectFromModel(clf, threshold='median') #0.01
	xTrans = trans.fit_transform(x, y)
	
	columnsSelected = x.columns[trans.get_support()].values
	print ('Selected features from model for ', type, ': Total numbers is ', len(columnsSelected),  '\nFeatures are: \n', columnsSelected)
	
	#Plot feature_importances_
	clf.fit(x, y)
	plt.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
	plt.xticks(range(len(clf.feature_importances_)), x.columns, rotation=270)
	plt.title('Feature Importance for ' + type)
	plt.show()
	
	dfSelectedFeatures = pd.DataFrame(xTrans, columns=columnsSelected)
	dfSelectedFeatures4Merge = pd.concat([dfSelectedFeatures[:], trainingData['track_name']], axis=1)
	
	otherColumns = patient.columns.difference(dfSelectedFeatures.columns)
	trainingData = pd.merge(dfSelectedFeatures4Merge, patient[otherColumns], on='track_name', how='inner')
	return trainingData
Example #15
def RandomForest_select(dataframe, num):
    """
    随机森林 传进来的数据已经做好了必要的特征工程
    随机森林是⼀种⼴泛使⽤的特征选择算法,它会⾃动计算各个特征的重要性,所以⽆需单独编程。这有助于我们选择较⼩的特征⼦集。
    在开始降维前,我们先把数据转换成数字格式,因为随机森林只接受数字输⼊。同时,ID这个变量虽然是数字,但它⽬前并不重要,所以可以删去。
    Args:
        dataframe:
        num:

    Returns:

    """
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(random_state=1, max_depth=10)
    X = dataframe.drop(columns=['label'])  # keep the target out of the feature matrix
    features = X.columns

    model.fit(X, dataframe.label)

    importances = model.feature_importances_
    indices = np.argsort(importances)[-num:]  # indices of the top-`num` features
    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             importances[indices],
             color='b',
             align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()

    from sklearn.feature_selection import SelectFromModel
    feature = SelectFromModel(model)
    Fit = feature.fit_transform(X, dataframe.label)
    return indices
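# Hedged side note on the last step above: when the estimator is already fitted,
# SelectFromModel(..., prefit=True) can reuse it so that only transform() is needed
# and the model is not fitted a second time. Toy data and names are illustrative.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

X_demo, y_demo = make_regression(n_samples=300, n_features=12, n_informative=4, random_state=1)
rf_demo = RandomForestRegressor(n_estimators=50, random_state=1).fit(X_demo, y_demo)
sfm_demo = SelectFromModel(rf_demo, prefit=True, threshold='median')
print(sfm_demo.transform(X_demo).shape)  # roughly half of the 12 columns survive the median cut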
    def l1_dim_reduce(self, M):
        df = self.df
        y = df['class']
        X = pd.DataFrame(M)
        dim_reduce = SelectFromModel(LogisticRegression(solver='liblinear', class_weight='balanced', C=0.04, penalty='l1'))
        X_ = dim_reduce.fit_transform(X, y)
        return X_
Example #17
def lasso(XTraining, YTraining, XTest, number_of_features):
    embeded_lr_selector = SelectFromModel(LogisticRegression(max_iter=1000),
                                          max_features=number_of_features)
    lasso_selected = embeded_lr_selector.fit(XTraining, YTraining)
    XTraining = embeded_lr_selector.transform(XTraining)  # already fitted just above; no need to refit
    XTest, m = new_features(lasso_selected, XTest)
    return XTraining, XTest, m
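# The `new_features` helper used above (and in randomforest() further down) is not shown
# in this snippet. A plausible stand-in, assuming it applies the fitted selector's support
# mask to the test matrix and returns the mask as well; the original helper may differ.
def new_features(fitted_selector, XTest):
    mask = fitted_selector.get_support()
    return XTest[:, mask], mask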
Example #18
    def process_feature_label(self, feature_extraction=False):
        """
        Process features and labels for the final training step.
        :param feature_extraction: whether to perform feature selection
        """
        t = time()

        from sklearn.feature_extraction.text import TfidfVectorizer
        tfidf = TfidfVectorizer()

        self.data_bunch.labels = [
            self.cate_dict[label] for label in self.data_bunch.labels
        ]  # map label strings to integer ids
        self.data_bunch.tfidfs = tfidf.fit_transform(self.data_bunch.contents)
        print("tfidfs维度为:{}".format(self.data_bunch.tfidfs.shape))

        if feature_extraction:
            from sklearn.feature_selection import SelectFromModel
            from sklearn import svm

            selector = SelectFromModel(
                svm.LinearSVC(C=1, penalty="l1", dual=False))
            self.data_bunch.tfidfs = selector.fit_transform(
                self.data_bunch.tfidfs, self.data_bunch.labels)
            print("选择后的tfidfs维度为:{}".format(self.data_bunch.tfidfs.shape))

        print("处理feature和label, 完成!用时:{:.2f}s".format(time() - t))
Example #19
    def learn_general(self,
                      nfold,
                      task,
                      model_label,
                      X_train,
                      y_train,
                      X_indexes,
                      outfolder,
                      feature_selection=False,
                      instance_data_source_tags=None,
                      accepted_ds_tags: list = None):
        if feature_selection:
            select = SelectFromModel(
                LogisticRegression(class_weight='balanced',
                                   penalty="l1",
                                   C=0.01))
            X_train = select.fit_transform(X_train, y_train)

        cls, model_file = self.create_classifier(outfolder, model_label, task)
        nfold_predictions = cross_val_predict(cls, X_train, y_train, cv=nfold)

        ml_util.save_scores(
            nfold_predictions,
            y_train,
            None,
            None,
            X_indexes,  # nfold index
            None,  # heldout index
            model_label,
            task,
            2,
            outfolder,
            instance_data_source_tags,
            accepted_ds_tags)
def test_max_features_dim(max_features):
    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    transformer = SelectFromModel(estimator=clf,
                                  max_features=max_features,
                                  threshold=-np.inf)
    X_trans = transformer.fit_transform(data, y)
    assert X_trans.shape[1] == max_features
Example #21
def select_rfecv_sfm(selection, features, labels):
    if selection[0] == "rfecv":
        for key, method in methods.items():
            recursive = RFECV(method,
                              step=1,
                              cv=[(range(134), range(134, 200))],
                              scoring="accuracy")
            recursive.fit(features, labels)
            # Plot number of features VS. cross-validation scores
            plt.figure()
            plt.xlabel("Number of features selected")
            plt.ylabel("accuracy score" + key +
                       " (nb of correct classifications)")
            plt.plot(range(1,
                           len(recursive.grid_scores_) + 1),
                     recursive.grid_scores_)
            plt.savefig(glbs.RESULTS_PATH + "\\" + key + ".jpg",
                        bbox_inches="tight")
    if selection[0] == "sfm":
        score = {}
        for key, method in methods.items():
            sfm = SelectFromModel(method, max_features=int(selection[1]))
            train_new = sfm.fit_transform(features[0], labels[0])
            test_new = sfm.transform(features[1])
            clf = method
            clf.fit(train_new, labels[0])
            pred = clf.predict(test_new)
            acc = accuracy_score(labels[1], pred)
            score[key] = acc
        write_sfm(score)
def logit_feature_selection(C_params):
    for C in C_params:
        est = linear_model.LogisticRegression(random_state=100,
                                              penalty="l1",
                                              C=C,
                                              tol=1e-4)
        transformer = SelectFromModel(estimator=est)
        train_features = transformer.fit_transform(X_train, Y_train)
        test_features = transformer.transform(X_test)
        print("\nWith C={}".format(C))
        print("Logistic regression reduced number of features to {}.".format(
            test_features.shape[1]))

        model = linear_model.LogisticRegression(random_state=100)
        if test_features.shape[1] <= 200:
            model = model_tune_params(model, logit_params)
        model.fit(train_features, Y_train)
        score = recall_score(y_pred=model.predict(test_features),
                             y_true=Y_test,
                             average="macro")
        print(
            "Logistic regression recall after FEATURE SELECTION: {:5f}".format(
                score))
        n_features_logit.append(test_features.shape[1])
        recall_logit.append(score)
Example #23
    def feature_selection(self):
        """
        sklearn.feature_selection provides VarianceThreshold, recursive feature elimination (RFE) and SelectFromModel.
        I use SelectFromModel directly. RFE isn't very good here, so I will write a more powerful one.
        """
        print("\n\n************Feature Selection************\n\n")
        print("\n1. -------SelectFromModel-------\n")
        ncolumns = self._training_data.shape[1]
        threshold = 1 / (ncolumns * 5)
        select_from_model = SelectFromModel(
            RandomForestClassifier(n_estimators=600),
            prefit=False,
            threshold=threshold)
        training_data1 = select_from_model.fit_transform(
            self._training_data, self._training_result)
        # boolean mask of the variables that are kept
        kept_vars_index = select_from_model.estimator_.feature_importances_ > threshold
        names1 = self._training_data.columns[list(kept_vars_index)]
        # convert numpy.array to pandas.DataFrame
        training_data1 = pd.DataFrame(training_data1, columns=names1)
        # save the global transformation
        self._global_transform["select_from_model"] = select_from_model

        print("\n-------Those variables are eliminated : -------\n")
        print(list(self._training_data.columns[list(~kept_vars_index)]))

        print("\n2. -------Recursive Feature Elimination-------\n")
Example #24
def generate_train_data(train_data, test_data, poly=False, select=False):
    y = train_data['发电量']  # target column: power output
    X = train_data.drop(['发电量', 'ID'], axis=1)
    sub_data = test_data.drop(['ID'], axis=1)

    polynm = None
    if poly:
        from sklearn.preprocessing import PolynomialFeatures
        polynm = PolynomialFeatures(degree=2, interaction_only=True)
        X = polynm.fit_transform(X)
        sub_data = polynm.transform(sub_data)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=123)

    sm = None
    if select:
        from sklearn.feature_selection import SelectFromModel
        sm = SelectFromModel(GradientBoostingRegressor(random_state=2))
        X_train = sm.fit_transform(X_train, y_train)
        X_test = sm.transform(X_test)
        sub_data = sm.transform(sub_data)

    return X_train, X_test, y_train, y_test, sub_data, sm, polynm
Example #25
def randomforest(XTraining, YTraining, XTest, number_of_features):
    embeded_lr_selector = SelectFromModel(
        RandomForestClassifier(n_estimators=100),
        max_features=number_of_features)
    randomforest_selected = embeded_lr_selector.fit(XTraining, YTraining)
    XTraining = embeded_lr_selector.transform(XTraining)  # already fitted just above; no need to refit
    XTest, m = new_features(randomforest_selected, XTest)
    return XTraining, XTest, m
def test_inferred_max_features_callable(max_features):
    """Check max_features_ and output shape for callable max_features."""
    clf = RandomForestClassifier(n_estimators=5, random_state=0)
    transformer = SelectFromModel(estimator=clf,
                                  max_features=max_features,
                                  threshold=-np.inf)
    X_trans = transformer.fit_transform(data, y)
    assert transformer.max_features_ == max_features(data)
    assert X_trans.shape[1] == transformer.max_features_
Example #27
def check_valid_max_features(est, X, y):
    max_features = X.shape[1]
    for valid_max_n_feature in [0, max_features, 'all', 5]:
        transformer = SelectFromModel(estimator=est,
                                      max_features=valid_max_n_feature)
        X_new = transformer.fit_transform(X, y)
        if valid_max_n_feature == 'all':
            valid_max_n_feature = max_features
        assert_equal(X_new.shape[1], valid_max_n_feature)
Example #28
def remove_based_on_select_from_model(dataframe, max_features):
    features = dataframe.drop(columns=["target"])
    target = dataframe["target"]
    model = ensemble.RandomForestRegressor()
    sfm = SelectFromModel(estimator=model, max_features=max_features)
    features_transformed = sfm.fit_transform(features, target)
    columns_kept = features.loc[:, sfm.get_support()].columns
    features_df = pd.DataFrame(features_transformed, columns=columns_kept)

    return pd.concat([features_df, target], axis=1)
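# A hedged usage sketch for the helper above; the toy frame and column names are illustrative.
import pandas as pd
from sklearn import datasets

X_arr, y_arr = datasets.make_regression(n_samples=100, n_features=6, n_informative=2, random_state=0)
toy = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(6)])
toy["target"] = y_arr
print(remove_based_on_select_from_model(toy, max_features=3).columns.tolist())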
def selectFromModel(df, method, **col):
    models = {
                '线性回归': LinearRegression,       # linear regression
                '朴素贝叶斯': GaussianNB,           # naive Bayes
                '逻辑回归': LogisticRegression,     # logistic regression
                'SVM': SVC
                }
    model = SelectFromModel(models[method]())
    X_new = model.fit_transform(df[col['X']].values, df[col['y']].values.ravel())
    return X_new
def test_max_features():
    # Test max_features parameter using various values
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    max_features = X.shape[1]
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, threshold=-np.inf)
    transformer2 = SelectFromModel(estimator=est,
                                   max_features=max_features,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)
    X_new2 = transformer2.fit_transform(X, y)
    assert_allclose(X_new1, X_new2)

    # Test max_features against actual model.
    transformer1 = SelectFromModel(
        estimator=Lasso(alpha=0.025, random_state=42))
    X_new1 = transformer1.fit_transform(X, y)
    scores1 = np.abs(transformer1.estimator_.coef_)
    candidate_indices1 = np.argsort(-scores1, kind="mergesort")

    for n_features in range(1, X_new1.shape[1] + 1):
        transformer2 = SelectFromModel(
            estimator=Lasso(alpha=0.025, random_state=42),
            max_features=n_features,
            threshold=-np.inf,
        )
        X_new2 = transformer2.fit_transform(X, y)
        scores2 = np.abs(transformer2.estimator_.coef_)
        candidate_indices2 = np.argsort(-scores2, kind="mergesort")
        assert_allclose(X[:, candidate_indices1[:n_features]],
                        X[:, candidate_indices2[:n_features]])
    assert_allclose(transformer1.estimator_.coef_,
                    transformer2.estimator_.coef_)
Example #31
def test_threshold_and_max_features():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])
    selected_indices = transformer3.transform(
        np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]])
Example #32
def test_max_features():
    # Test max_features parameter using various values
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    max_features = X.shape[1]
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est,
                                   threshold=-np.inf)
    transformer2 = SelectFromModel(estimator=est,
                                   max_features=max_features,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)
    X_new2 = transformer2.fit_transform(X, y)
    assert_allclose(X_new1, X_new2)

    # Test max_features against actual model.
    transformer1 = SelectFromModel(estimator=Lasso(alpha=0.025,
                                                   random_state=42))
    X_new1 = transformer1.fit_transform(X, y)
    scores1 = np.abs(transformer1.estimator_.coef_)
    candidate_indices1 = np.argsort(-scores1, kind='mergesort')

    for n_features in range(1, X_new1.shape[1] + 1):
        transformer2 = SelectFromModel(estimator=Lasso(alpha=0.025,
                                       random_state=42),
                                       max_features=n_features,
                                       threshold=-np.inf)
        X_new2 = transformer2.fit_transform(X, y)
        scores2 = np.abs(transformer2.estimator_.coef_)
        candidate_indices2 = np.argsort(-scores2, kind='mergesort')
        assert_allclose(X[:, candidate_indices1[:n_features]],
                        X[:, candidate_indices2[:n_features]])
    assert_allclose(transformer1.estimator_.coef_,
                    transformer2.estimator_.coef_)
Example #33
def test_max_features_tiebreak():
    # Test if max_features can break tie among feature importance
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    max_features = X.shape[1]

    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features,
            threshold=-np.inf)
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert X_new.shape[1] == n_features
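# `FixedImportanceEstimator` is a helper from scikit-learn's own test suite and is not shown
# in this snippet; a plausible minimal stand-in that just exposes fixed feature_importances_
# (the real helper may differ in detail):
import numpy as np
from sklearn.base import BaseEstimator

class FixedImportanceEstimator(BaseEstimator):
    def __init__(self, importances):
        self.importances = importances

    def fit(self, X, y=None):
        # pretend the predefined importances were learned from the data
        self.feature_importances_ = np.asarray(self.importances, dtype=float)
        return self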
Example #34
def logistic_l1(X, y, tol):
    DEBUG = False
    # if DEBUG: print('X ', X, ' y ', y, ' tol ', tol)
    lr = LogisticRegression(penalty='l1', C=0.65,
                            dual=False, solver='liblinear')  # liblinear supports the l1 penalty
    model = SelectFromModel(lr,
                            prefit=False,
                            threshold=tol)
    if DEBUG: print(X.shape, y.shape)
    x_select = model.fit_transform(X, y)
    x_logreg = lr.fit(X, y)
    x_logreg_trans = lr.predict(X)
    x_irls = irls(X, y)
    support = model.get_support(indices=True)
    if DEBUG: print('support', support, 'x_select', x_select, 'x_logreg', x_logreg_trans, 'x_irls', x_irls)
    if DEBUG: print('len_support', len(support), 'len_x_select', len(x_select), 'len_x_logreg',
                    len(x_logreg_trans), 'len_x_irls', len(x_irls))
    if DEBUG: print('x_logreg_coef', x_logreg.coef_, 'len', len(x_logreg.coef_[0]), 'intercept', x_logreg.intercept_)

    return x_logreg.coef_[0]
def how_many_variables_used(word_list, inputs, outputs, num_vars, l1_step=LinearSVC(penalty='l1', dual=False, C=1)):
    kf = KFold(n_splits=10, shuffle=True)
    for train_indices, val_indices in kf.split(inputs):
        # pipeline = Pipeline([('chi2_top_k', SelectKBest(chi2, num_vars)),
        #                      ('l1_step', SelectFromModel(l1_step))])
        kbest = SelectKBest(chi2, num_vars)
        l1_selector = SelectFromModel(l1_step)

        x_new = kbest.fit_transform(inputs[train_indices], outputs[train_indices].ravel())
        indices = kbest.get_support(indices=True)

        x_new = l1_selector.fit_transform(x_new, outputs[train_indices].ravel())
        new_indices = l1_selector.get_support(indices=True)

        from sklearn.ensemble import ExtraTreesClassifier
        model = ExtraTreesClassifier()
        model.fit(x_new, outputs[train_indices].ravel())
        importance = np.argsort(model.feature_importances_)[::-1]

        print([word_list[indices[i]] for i in new_indices])
        print([word_list[indices[new_indices[i]]] for i in importance])
        print(x_new.shape)
Example #36
# -*- coding: utf-8 -*-

import pandas
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel

data = pandas.read_csv('D:\\PDM\\6.2\\data2.csv')

feature = data[['月份', '季度', '广告费用', '客流量']]  # month, quarter, advertising spend, customer traffic

lrModel = LinearRegression()

selectFromModel = SelectFromModel(lrModel)

selectFromModel.fit_transform(
    feature,
    data['销售额']  # sales revenue (target)
)

print(feature.columns[selectFromModel.get_support()])
#Function transform is deprecated; Support to use estimators as feature selectors will be removed in version 0.19. Use SelectFromModel instead.


from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X_train = [
        [0,1,2],
        [3,5,4],
        [6,8,7],
        [9,11,10],
        [12,14,13],
        [15,16,17],
        [18,19,20],
        [21,22,23],
]

Y_train = [0,0,0,0,1,1,1,1]

rf=RandomForestClassifier(n_estimators=400, n_jobs=-1)
sf=SelectFromModel(rf)

Xr_train=sf.fit_transform(X_train,Y_train)

print(Xr_train)
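# The indices of the retained columns can be inspected as well, using the same get_support
# API seen in the other examples on this page.
print(sf.get_support(indices=True))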
Example #38
            newFeatures.append(f[i] * f[j])
            newFeatures.append((f[i] + 0.1) / (f[j] + 0.1))
            newFeatures.append(f[i] - f[j])
            newFeatures.append(f[i] + f[j])
            newFeatures.append(f[i])
    features2.append(newFeatures)

target = results[:, 2]
weights = results[:, 3]

w = [t for t in results[:, 2] if t > 0]
clf = GradientBoostingRegressor(learning_rate=0.08, n_estimators=20, max_depth=40, min_samples_leaf=20)
clfR = GradientBoostingRegressor(learning_rate=0.08, n_estimators=120, max_depth=40, min_samples_leaf=20)
clffit = clf.fit(features2, target)
featuresSelectionModel = SelectFromModel(clffit)
features3 = featuresSelectionModel.fit_transform(features2, target)

features4 = []

for f in features3:
    newFeatures = []
    for i in range(len(f)):
        for j in range(i, len(f)):
            newFeatures.append(f[i] * f[j])
            newFeatures.append((f[i] + 0.2) / (f[j] + 0.2))
            newFeatures.append(f[i] - f[j])
            newFeatures.append(f[i] + f[j])
            newFeatures.append(f[i])
    features4.append(newFeatures)

clffit2 = clf.fit(features4, target)