def run(self, df, target_label):
    target = df[target_label]
    features = df.drop(target_label, axis=1)
    clf = RandomizedLogisticRegression()
    # Report any columns containing NaN or inf values, which break fitting.
    for col in features.columns:
        values = features[col].values
        if np.any(np.isnan(values)) or np.any(np.isinf(values)):
            print(list(values))
    try:
        clf.fit(features.values, target.values)
    except Exception:
        # Dump all columns to help diagnose the failure, then re-raise
        # rather than continuing with an unfitted estimator.
        for col in features.columns:
            print(list(features[col].values))
        raise
    # Rank columns by absolute stability-selection score, best first.
    scores = {col: abs(clf.scores_[i])
              for i, col in enumerate(features.columns)}
    scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print(scores)
    position = {col: i for i, (col, _) in enumerate(scores)}
    print(position)
    return position
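For reference, a minimal self-contained sketch of the same ranking idea on synthetic data (my own example, not from the snippet; requires scikit-learn < 0.21, where RandomizedLogisticRegression still ships):

import numpy as np
import pandas as pd
from sklearn.linear_model import RandomizedLogisticRegression

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(100, 3), columns=['a', 'b', 'c'])
df['label'] = (df['a'] > 0.5).astype(int)  # only 'a' is informative

clf = RandomizedLogisticRegression(random_state=0)
clf.fit(df.drop('label', axis=1).values, df['label'].values)
ranking = sorted(zip(['a', 'b', 'c'], np.abs(clf.scores_)),
                 key=lambda t: t[1], reverse=True)
print(ranking)  # 'a' should come out on top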
Example #2
def randomized_logistic_regression(self):
    # First column of self.data is the label; the rest are features.
    X = self.data[:, 1:]
    y = self.data[:, 0]
    randomized_logistic = RandomizedLogisticRegression()
    randomized_logistic.fit(X, y)
    support = randomized_logistic.get_support()
    selected = np.where(support)[0]
    return selected
Example #3
 def _get_clfs(self):
     clf_dict = {"rlrclf": RandomizedLogisticRegression(),
                 "rfclf": RandomForestClassifier(criterion='entropy'),
                 "dtrclf": DecisionTreeClassifier(criterion='entropy'),
                 "lrclf": LogisticRegression()
                 }
     return clf_dict
def stability(X, Y):
    from sklearn.linear_model import RandomizedLogisticRegression

    # Stability-selection scores: the fraction of resamplings in which
    # each feature was selected.
    clf = RandomizedLogisticRegression(random_state=1)
    clf.fit(X, Y)

    return clf.scores_
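A quick way to exercise this helper (my own sketch with synthetic data; requires scikit-learn < 0.21):

import numpy as np

rng = np.random.RandomState(1)
X = rng.rand(150, 4)
Y = (X[:, 0] > 0.5).astype(int)
print(stability(X, Y))  # one selection frequency per column, each in [0, 1]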
Example #5
def Feature_sort(Feat_scale, Label, threads=4):  ## Rank features with three feature-selection methods

    ranks = {}
    ## Univariate feature selection
    Selector = SelectKBest(f_classif, k='all')
    Selector.fit_transform(Feat_scale, Label)
    ranks["Univariate_f"] = np.argsort(Selector.pvalues_)

    ## Randomized logistic regression; a larger n_resampling gives a more robust result.
    ## (From roughly the 1900th feature onward, the resulting ranking looks questionable.)
    rlogreg = RandomizedLogisticRegression(n_jobs=1,
                                           n_resampling=2000,
                                           selection_threshold=0,
                                           verbose=False,
                                           random_state=0)
    ## NOTE: RandomizedLogisticRegression was deprecated in scikit-learn 0.19
    ## and removed in 0.21, so this emits a DeprecationWarning on 0.19/0.20.
    rlogreg.fit(Feat_scale, Label)
    ranks["Randomized_Logistic_f"] = np.argsort(-abs(rlogreg.scores_))

    ## Boruta, based on a random forest
    rf = RandomForestClassifier(random_state=0,
                                n_jobs=threads,
                                max_features='auto')
    feat_selector = BorutaPy(rf, n_estimators='auto', perc=80, random_state=0)
    feat_selector.fit(Feat_scale, Label)
    ranks["Boruta_f"] = np.argsort(feat_selector.ranking_)

    return ranks
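The three argsort orders in `ranks` can be combined into a single consensus ranking, for instance by average position. A minimal sketch (the helper name `average_rank` is mine, not from the snippet above):

import numpy as np

def average_rank(ranks, n_features):
    # Each value in `ranks` is an argsort order: feature indices, best first.
    mean_pos = np.zeros(n_features)
    for order in ranks.values():
        mean_pos[order] += np.arange(n_features)
    return np.argsort(mean_pos / len(ranks))  # lowest average position first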
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel':
            SelectFromModel(RandomForestClassifier(n_jobs=-1,
                                                   max_depth=10,
                                                   n_estimators=15),
                            threshold='20*mean'),
            'RFECV':
            RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect':
            GenericUnivariateSelect(),
            'RandomizedSparse':
            RandomizedLogisticRegression(),
            'KeepAll':
            'KeepAll'
        },
        'regressor': {
            'SelectFromModel':
            SelectFromModel(RandomForestRegressor(n_jobs=-1,
                                                  max_depth=10,
                                                  n_estimators=15),
                            threshold='0.7*mean'),
            'RFECV':
            RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect':
            GenericUnivariateSelect(),
            'RandomizedSparse':
            RandomizedLasso(),
            'KeepAll':
            'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
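Example call with synthetic data (my own sketch; importing this module's 'RandomizedSparse' entries additionally requires scikit-learn < 0.21):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
selector = get_feature_selection_model_from_name('classifier', 'RFECV')
print(selector.fit_transform(X, y).shape)  # (200, n_selected_features)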
Example #7
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    # TODO(PRESTON): eventually let threshold be user-configurable (or grid_searchable)
    # TODO(PRESTON): optimize the params used here
    model_map = {
        'classifier': {
            'SelectFromModel':
            SelectFromModel(RandomForestClassifier(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1),
                           step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel':
            SelectFromModel(RandomForestRegressor(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1),
                           step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
Example #8
def feature_selection_class(predictors, responses, test_predictors,
                            selectFeatTech):
    if selectFeatTech == 0:
        # t = int(predictors.shape[1] * 0.40)
        t = 500  # number of features to select
        model = SelectKBest(chi2, k=t).fit(predictors.replace(-1, 0),
                                           responses)
        # print(model.scores_)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    elif selectFeatTech == 1:
        randomized_logistic = RandomizedLogisticRegression()
        model = randomized_logistic.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)

    column_names = predictors.columns[indices]
    predictors_new = pd.DataFrame(predictors_new,
                                  index=predictors.index,
                                  columns=column_names)
    predictors_test_new = pd.DataFrame(predictors_test_new,
                                       index=test_predictors.index,
                                       columns=column_names)
    return predictors_new, predictors_test_new
Example #9
def lasso_regression(X, y):
    """
    Use Randomized Logistic Regression to select features based on
    their stability-selection scores.
    """

    clf = RandomizedLogisticRegression(C=1.0)
    clf.fit(X, y)
    print('Number of non zero valued coefficients: ', np.sum(clf.scores_ > 0))
    # Ascending sort: the lowest-scored features come first, so slicing
    # off the front below drops the least important features.
    imp_feature_idx = clf.scores_.argsort()

    qualities = []

    X_train, X_test, y_train, y_test = split_examples(X, y)

    for i in range(0, 100, 4):
        clf = LogisticRegression(C=0.1)
        clf.fit(X_train[:, imp_feature_idx[i:]], y_train)
        q = roc_auc_score(
            y_test,
            clf.predict_proba(X_test[:, imp_feature_idx[i:]])[:, 1])

        qualities.append(q)
    plt.plot(range(0, 100, 4), qualities)
    plt.show()

    return qualities
Example #10
def build_classifier(definition, datas):

    if definition['classification'] == 'lr':
        classifier = LogisticRegression(C=1.5)
    elif definition['classification'] == 'sgd':
        classifier = SGDClassifier(alpha=0.0001, n_iter=10**2)
    elif definition['classification'] == 'sgd_grid':
        best_params = grid_search_params(datas)
        classifier = SGDClassifier(n_iter=10**2, **best_params)
    else:
        raise ValueError("unknown classifier: %s" % definition['classification'])

    rlr_feature_selection = RandomizedLogisticRegression(C=1.5,
                                                         n_jobs=-1,
                                                         verbose=0)

    # Standard sklearn classifier

    clf = Pipeline([
        #        ('string_encoder', pp_encode_strings),
        #        ('drop_nan_cols', pp_drop_nan_cols),
        #        ('fix_collinear', pp_fix_collinear),
        #
        #        ('float_imputer', pp_imputer),
        #        ('scaler', pp_scaler),
        # ('feature_selection', rlr_feature_selection),
        ('classification', classifier)
    ])

    return clf
Example #11
 def randlogistic(self, selection_threshold=0.25, sample_fraction=0.75):
     rlr_model = RandomizedLogisticRegression(
         C=self.C,
         selection_threshold=selection_threshold,
         normalize=False,
         sample_fraction=sample_fraction)
     rlr_model.fit(self.data.values, self.target.values)
     return rlr_model
Example #12
def evaluate_stability(vocab, id_to_vec, mesh_to_id):
    labels = ('Male', 'Female', 'Both')
    Xs, ids = get_basic_Xs(id_to_vec, mesh_to_id, shuffle=True)
    Xtr, Ytr, Itr, Xte, Yte, Ite = get_test_train(labels, ids, Xs, 5)
    print('Fitting RandomizedLR...')
    logreg = RandomizedLogisticRegression(verbose=True,
                                          n_resampling=1000,
                                          n_jobs=16)
    logreg.fit(Xtr, Ytr)
    scores = logreg.scores_
    return {vocab[i]: score for i, score in enumerate(scores)}
def get_features(X_train, y_train, names, selection_threshold=0.2):
    print('\ngetting features with randomized logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))
    randomized_logistic = RandomizedLogisticRegression(
        selection_threshold=selection_threshold)
    randomized_logistic.fit(X_train, y_train)
    mask = randomized_logistic.get_support()
    features = np.array(names)[mask]
    print('found {} ngrams:'.format(len(features)))
    print(list(features))
    return features
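A hypothetical end-to-end call with a bag-of-ngrams matrix (my own sketch; requires scikit-learn < 0.21, where `get_feature_names` was the vectorizer API):

from sklearn.feature_extraction.text import CountVectorizer

docs = ['good movie', 'bad movie', 'good plot', 'bad plot'] * 10
y_train = [1, 0, 1, 0] * 10
vec = CountVectorizer(ngram_range=(1, 2))
X_train = vec.fit_transform(docs).toarray()
names = vec.get_feature_names()  # get_feature_names_out() in newer sklearn
print(get_features(X_train, y_train, names, selection_threshold=0.2))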
Example #14
def rank_features(algorithm, X, y):
    # The RFE approach can be used with various different classifiers
    if algorithm == 'random_forest_rfe':
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.feature_selection import RFE
        estimator = RandomForestClassifier(n_estimators=50,
                                           random_state=R_SEED,
                                           n_jobs=1)
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)

        for x in sorted(
                zip(map(lambda x: round(x, 4), selector.ranking_), features)):
            print(x[1])
    elif algorithm == 'svm_rfe':
        from sklearn.svm import SVC
        from sklearn.feature_selection import RFE
        estimator = SVC(random_state=R_SEED, kernel='linear')
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)

        for x in sorted(
                zip(map(lambda x: round(x, 4), selector.ranking_), features)):
            print(x[1])
    elif algorithm == 'random_logistic_regression':
        # See http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/
        from sklearn.linear_model import RandomizedLogisticRegression
        rlasso = RandomizedLogisticRegression(random_state=R_SEED)
        rlasso.fit(X, y)

        for x in sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                            features),
                        reverse=True):
            print(x[1])
    elif algorithm == 'random_lasso':
        from sklearn.linear_model import RandomizedLasso
        rlasso = RandomizedLasso(random_state=R_SEED)
        #rlasso = RandomizedLasso(alpha=0.025, random_state=R_SEED)
        rlasso.fit(X, y)

        for x in sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                            features),
                        reverse=True):
            print(x[1])
    elif algorithm == 'anova':
        from sklearn.feature_selection import f_classif
        F, pval = f_classif(X, y)
        random_array = random.random(len(pval))  # numpy.random, not stdlib random
        order = lexsort((random_array, pval))  # will break ties randomly
        for i in order:
            print(features[i])
    else:
        print "Invalid algorithm: %s" % algorithm
        exit(1)
def run_logreg(X_train, y_train, selection_threshold=0.2):
    print('\nrunning logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))
    pipe = Pipeline([
        ('feature_selection', RandomizedLogisticRegression(
            selection_threshold=selection_threshold)),
        ('classification', LogisticRegression())
    ])
    pipe.fit(X_train, y_train)
    print('training accuracy : {}'.format(pipe.score(X_train, y_train)))
    # NOTE: X_test and y_test are read from the enclosing scope; see the
    # explicit-argument variant sketched below.
    print('testing accuracy : {}'.format(pipe.score(X_test, y_test)))
    return pipe
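Since the snippet above depends on X_test and y_test from the enclosing scope, a self-contained variant might take them explicitly. A sketch (the function name and signature are mine; requires scikit-learn < 0.21):

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RandomizedLogisticRegression

def run_logreg_explicit(X_train, y_train, X_test, y_test, selection_threshold=0.2):
    pipe = Pipeline([
        ('feature_selection',
         RandomizedLogisticRegression(selection_threshold=selection_threshold)),
        ('classification', LogisticRegression()),
    ])
    pipe.fit(X_train, y_train)
    print('training accuracy : {}'.format(pipe.score(X_train, y_train)))
    print('testing accuracy : {}'.format(pipe.score(X_test, y_test)))
    return pipe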
def feature_method_selection(data, label, fsname):
    """
    select features by option 'fsname'
    :param data:
    :param label:
    :param fsname:
    :return: new_data, selected data
    :return: selected_features_inx, the index of selected feature, starts with 0
    """
    if fsname == 'variance_threshold':  # discard near-constant features (discrete values)
        model = VarianceThreshold()  # th=1
        return model.fit_transform(data)

    elif fsname == 'select_kbest':
        model = SelectKBest(chi2, k=10)  # features must be non-negative; chi2 is for classification

    elif fsname == 'rfe':  # recursive feature elimination; very time-consuming
        svc = SVC(kernel='linear', C=1)
        model = RFE(estimator=svc, n_features_to_select=10, step=1)

    elif fsname == 'rfecv':  # RFE with cross-validation; labels must be numeric
        svc = SVC(kernel="linear")
        # NOTE: older StratifiedKFold(y, n_folds) API; n_folds should be >= 2
        model = RFECV(estimator=svc, step=1, cv=StratifiedKFold(label, 1),
                      scoring='accuracy')

    elif fsname == 'RandLasso':  # stability selection over resampled subsets; may fail
        # with "cannot perform reduce with flexible type" on non-numeric data
        model = RandomizedLogisticRegression()

    elif fsname == 'linear_svc':
        model = LinearSVC()  # has no feature importances

    elif fsname == 'tree':
        model = ExtraTreesClassifier()

    elif fsname == 'fclassif':
        model = SelectFpr()  # defaults to f_classif; larger scores mean more useful features

    elif fsname == 'pearsonr':  # labels must be numeric
        label = turn_label_2num(label)
        return pearsonr(data, label)  # correlation between the two inputs

    elif fsname == 'RandForReg':  # labels must be numeric
        label = turn_label_2num(label)
        model = RandomForestRegressor()

    else:
        logging.error('ERROR: feature selection option is wrong')
        raise ValueError(fsname)

    model.fit(data, label)
    new_data = model.transform(data)  # keep only the selected, important features

    return new_data
Example #17
def getEligibleFeatures(allFeatureParam, allLabelParam):
    '''
    References:
    http://scikit-learn.org/stable/modules/feature_selection.html#randomized-l1
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html
    '''

    logiRegObj = RandomizedLogisticRegression()
    logiRegObj.fit(allFeatureParam, allLabelParam)
    ### Output ###
    # print("Model score: ", logiRegObj.scores_)
    eligible_indices = logiRegObj.get_support(indices=True)
    return eligible_indices
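A hypothetical call with synthetic arrays (my own sketch; requires scikit-learn < 0.21):

import numpy as np

rng = np.random.RandomState(0)
allFeatureParam = rng.rand(100, 10)
allLabelParam = rng.randint(0, 2, size=100)
print(getEligibleFeatures(allFeatureParam, allLabelParam))  # selected column indices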
Example #18
def test_rflasso():
    train_X, test_X, train_Y, test_Y = train_test_split(index_data,
                                                        index_lable,
                                                        test_size=0.25,
                                                        random_state=1)
    from sklearn.linear_model import RandomizedLogisticRegression
    randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
    randomized_logistic.fit(train_X, train_Y)
    XX = randomized_logistic.transform(train_X)
    print(XX.shape)
def set_data(self, user_atts, inter_atts, responses):
    self.build_data_representations(user_atts, inter_atts)
    # Convert from dict representation into matrix:
    predictor_rows = self.dict_vectorizer.fit_transform(self.dicts_rep).toarray()
    print(predictor_rows)
    print('Finding optimal feature set...')
    self.ff_model = RandomizedLogisticRegression()  # Finds best set of features
    # Fit data and get transformed input rows:
    X_new = self.ff_model.fit_transform(predictor_rows, responses)
    print(X_new)
    print('Done! Final Shape: ' + str(X_new.shape))
    print('Building Final model...')
    self.model = LogisticRegression().fit(X_new, responses)
    print('Done!')
Example #20
def select_features(X, y):
    '''
    Select the relevant features from X that are useful for predicting
    the labels in y.

    Args:
        X: numpy 2D array containing input features
        y: numpy 1D array containing labels

    Returns:
        feature_list: List of indices of the selected important features
    '''

    # Get the selection model (stability selection)
    selection_model = RandomizedLogisticRegression(random_state=0)
    selection_model.fit(X, y)

    # Use a cross validated logistic regression to choose the importance
    # threshold at which a feature is included
    step_size = 50
    max_weight = int(max(selection_model.scores_)) + 1
    trial_thresholds = [
        i / step_size for i in range(1, max_weight * step_size + 1)
    ]
    threshold = 0
    max_score = 0
    for trial in trial_thresholds:
        selected_features = [
            i for i, score in enumerate(selection_model.scores_)
            if score > trial
        ]
        if len(selected_features) > 0:
            X_reduced = X[:, selected_features]
            model = LogisticRegression(multi_class='multinomial',
                                       class_weight='balanced',
                                       solver='newton-cg',
                                       random_state=0,
                                       max_iter=1000)
            scores = cross_val_score(model, X_reduced, y, cv=5)
            score = scores.mean()
            if score >= max_score:
                max_score = score
                threshold = trial  # trial is already on the score scale

    return [
        i for i, score in enumerate(selection_model.scores_)
        if score > threshold
    ]
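A quick smoke test on a standard dataset (my own sketch; requires scikit-learn < 0.21, and assumes the module-level imports this function relies on, e.g. LogisticRegression and cross_val_score, are in scope):

from sklearn.datasets import load_iris

iris = load_iris()
kept = select_features(iris.data, iris.target)
print(kept)  # indices of features whose stability score beats the tuned threshold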
Example #21
def stability_test(x, y, model, names, score_type):
    if score_type != "r2":
        rlasso = RandomizedLogisticRegression()
        rlasso.fit(x, y)
    else:
        rlasso = RandomizedLasso(alpha=0.025)
        rlasso.fit(x, y)
    if sum(rlasso.scores_) == 0:
        return [[0, el] for el in names]
    # Min-max normalise the stability scores to [0, 1].
    maxval = max(rlasso.scores_)
    minval = min(rlasso.scores_)
    dist = maxval - minval
    if dist == 0:  # all scores identical; avoid division by zero
        return [[0, el] for el in names]
    return list(
        zip(map(lambda x: round(x, 4), (rlasso.scores_ - minval) / dist),
            names))
Example #22
def feature_selection_tech(predictors, responses, test_predictors, selectFeatTech):
    if selectFeatTech == 0:
        # t = int(predictors.shape[1] * 0.40)
        t = 40  # number of features to select
        model = SelectKBest(chi2, k=t).fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    elif selectFeatTech == 1:
        randomized_logistic = RandomizedLogisticRegression()
        model = randomized_logistic.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    return predictors_new, predictors_test_new, indices
def run_logreg(X_train, y_train, selection_threshold=0.2):
    print("\nrunning logistic regression...")
    print("using a selection threshold of {}".format(selection_threshold))
    pipe = Pipeline(
        [
            (
                "feature_selection",
                RandomizedLogisticRegression(selection_threshold=selection_threshold),
            ),
            ("classification", LogisticRegression()),
        ]
    )
    pipe.fit(X_train, y_train)
    print("training accuracy : {}".format(pipe.score(X_train, y_train)))
    print("testing accuracy : {}".format(pipe.score(X_test, y_test)))
    return pipe
Example #24
def log_reg_feat_selection(X_train, y_train, X_valid, y_valid, random_state):
    """
    Feature selection based on the scores given to the features by the
    RandomizedLogisticRegression algorithm.
    """

    # X_valid / y_valid are unused here; scores come from resampling X_train.
    rlr = RandomizedLogisticRegression(C=[0.001, 0.01, 0.1, 1.],
                                       sample_fraction=0.7,
                                       n_resampling=200,
                                       selection_threshold=0.25,
                                       verbose=5,
                                       n_jobs=-1,
                                       random_state=random_state)
    rlr.fit(X_train, y_train)
    np.save('save/feat_sel_log_reg.npy', rlr.scores_)

    return rlr.scores_
Example #25
    def rank_random_logistic_regression(self,
                                        features_indep_df: PandasDataFrame,
                                        feature_target: List,
                                        n_jobs: int = -1,
                                        **kwargs: Any) -> object:
        """Use Randomized Logistic Regression to rank features.
        Attributes:
        model.scores_
        model.all_scores_

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param n_jobs: number of CPUs to use during the resampling. If '-1', use all the CPUs.
        :param kwargs: C=1, scaling=0.5, sample_fraction=0.75, n_resampling=200, selection_threshold=0.25, tol=0.001,
        fit_intercept=True, verbose=False, normalize=True, random_state=None, pre_dispatch='3*n_jobs'
        :return: the importance ranking model.
        """
        self.__logger.debug("Run Random Logistic Regression.")
        classifier = RandomizedLogisticRegression(n_jobs=n_jobs, **kwargs)
        return classifier.fit(features_indep_df, feature_target)
Example #26
    def _run_randomized_regression(self,
                                   feature_df,
                                   annotation,
                                   clinical_column,
                                   sample_fraction=0.7):
        annotation = copy.deepcopy(annotation)
        # Encode labels of the classes
        le = preprocessing.LabelEncoder()
        annotation[clinical_column] = le.fit_transform(
            annotation[clinical_column])

        clf = RandomizedLogisticRegression(
            n_resampling=self.rr_iterations,
            sample_fraction=sample_fraction,
            n_jobs=1,
            verbose=1,
        ).fit(feature_df, annotation[clinical_column])

        selected_features = feature_df.T[clf.scores_ != 0].index
        logger.info("Number of selected features: %d", len(selected_features))
        return selected_features, clf
def rdlg_variables(X, y, threshold=0.25):  # default threshold 0.25
    """
    Select variables linearly related to y with randomized logistic
    regression (stability selection).

    The selection algorithm (RLR) is run on many random subsets of the data
    and of the features, and the results are aggregated: each feature's score
    is the frequency with which it was marked important (times selected
    divided by times its subset was tested). Ideally, important features
    score close to 100%, weaker features get small non-zero scores, and
    useless features score near 0. Unlike the Lasso, good features are not
    zeroed out merely because similar, correlated features exist. Across many
    datasets and settings, stability selection is among the best-performing
    feature-selection methods.
    """

    rlr = RandomizedLogisticRegression(selection_threshold=threshold)  # randomized logistic regression
    rlr.fit(X, y)
    scoretable = pd.DataFrame(rlr.all_scores_, index=X.columns)  # aggregated per-feature scores
    scoretable = scoretable.reset_index()
    scoretable = scoretable.rename(columns={'index': 'Col', 0: 'value_ratio'}, copy=False)
    df_score = scoretable[scoretable.value_ratio > threshold]  # drop features scoring below the threshold
    refreshed_data = X[list(df_score['Col'])]

    return scoretable, refreshed_data
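Hypothetical usage with a small DataFrame (my own sketch; requires scikit-learn < 0.21):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(200, 5), columns=list('abcde'))
y = (X['a'] + X['b'] > 1).astype(int)
scoretable, selected = rdlg_variables(X, y, threshold=0.25)
print(selected.columns.tolist())  # columns whose stability score beats 0.25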
Example #28
def pick_variables(self,
                   discover=True,
                   method="rlr",
                   threshold=0.25,
                   auto_pick=True):  # default threshold 0.25
    # Variable-picking helper (feature selection).
    if method == "rlr":
        """
        Top-level feature selection: randomized logistic regression
        (stability selection) picks the variables linearly related to y.
        The selection algorithm is run repeatedly on random subsets of the
        data and of the features, and the results are aggregated, e.g. as
        the frequency with which a feature is marked important (times
        selected divided by times its subset was tested). Ideally,
        important features score close to 100%, weaker features get small
        non-zero scores, and useless features score near 0.
        RandomizedLogisticRegression() API:
        fit(X, y)               Fit the model using X, y as training data.
        fit_transform(X[, y])   Fit to data, then transform it.
        get_params([deep])      Get parameters for this estimator.
        get_support([indices])  Get a mask, or integer index, of the features selected.
        inverse_transform(X)    Reverse the transformation operation.
        set_params(**params)    Set the parameters of this estimator.
        transform(X)            Reduce X to the selected features.
        """
        rlr = RandomizedLogisticRegression(
            selection_threshold=threshold)  # randomized logistic regression
        rlr.fit(self.X_train, self.y_train)
        scoretable = pd.DataFrame(rlr.all_scores_,
                                  index=self.X_train.columns,
                                  columns=['var_score'])  # aggregated feature scores
        # Integer index of the selected features:
        columns_need = list(self.X_train.columns[rlr.get_support()])
        self.X_train = self.X_train[columns_need]
        self.X_test = self.X_test[columns_need]
        columns_need.append("y")
        if auto_pick:
            self.picked_data = self.data[columns_need]
        return scoretable
Example #29
sys.stderr.write("# of initial features: %d\n" % (len(registered_feat_names)))
sys.stderr.write("# of transformed features: %d\n" % (len(Xtr.toarray()[0])))

sel = None
n = len(feature_names)

if (selector == "kbest"):
    sel = SelectKBest(chi2, k=n)
elif (selector == "kbest_anova"):
    sel = SelectKBest(f_classif, k=n)
elif (selector == "rfecv"):
    sel = RFECV()
elif (selector == "lasso"):
    sel = SelectFromModel(LassoCV(), threshold=0.005)
elif (selector == "rlregr"):
    sel = RandomizedLogisticRegression()
elif (selector == "svm"):
    sel = eval( "SelectFromModel(LinearSVC(%s))" % (args.selector_params) )
elif (selector == "extra_trees"):
    sel = SelectFromModel(ExtraTreesClassifier())
elif (selector == "random_forest"):
    sel = SelectFromModel(RandomForestClassifier())

print(sel.estimator)  # NOTE: only SelectFromModel instances have .estimator
if type(sel) == SelectFromModel and type(sel.estimator) == LassoCV:
    sel.fit(Xtr, Ytr)
    top_ranked = sorted(enumerate(sel.estimator_.coef_), key=lambda x: x[1], reverse=True)
    top_indices = [i for i, _ in top_ranked]
    for feat, pval in zip(np.asarray(vectorizer.get_feature_names())[top_indices],
                          sel.estimator_.coef_[top_indices]):
        print("%s\t%s" % (feat, pval))
elif type(sel) == SelectFromModel and (type(sel.estimator) == ExtraTreesClassifier or type(sel.estimator) == RandomForestClassifier):
Example #30
from sklearn.feature_selection import (
    SelectFwe,  # TODO: add tests and document
    GenericUnivariateSelect,
    VarianceThreshold,
    RFE,
    RFECV,
    SelectFromModel,
)
from sklearn.linear_model import LogisticRegression
_additional_test_cases = []
try:
    from sklearn.linear_model import (  # type: ignore
        RandomizedLogisticRegression,
        RandomizedLasso,  # TODO: add tests and document
    )
    _additional_test_cases.append(
        (RandomizedLogisticRegression(random_state=42),
         ['<NAME1>', '<NAME2>', '<NAME3>']))
except ImportError:  # Removed in scikit-learn 0.21
    pass
from sklearn.preprocessing import (
    MinMaxScaler,
    StandardScaler,
    MaxAbsScaler,
    RobustScaler,
)
from sklearn.pipeline import FeatureUnion, make_pipeline

from eli5 import transform_feature_names
from eli5.sklearn import PermutationImportance