def run(self, df, target_label):
    """Rank the feature columns of *df* with a randomized logistic regression.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target_label: Name of the target column in *df*.

    Returns:
        dict mapping each feature column name to its rank position
        (0 = largest absolute score).

    Raises:
        Exception: whatever ``fit`` raises is re-raised after every column
            has been dumped to stdout for diagnosis.
    """
    target = df[target_label]
    feature = df.drop(target_label, axis=1)
    clf = RandomizedLogisticRegression()

    # Diagnostic: dump every column that contains NaN or infinite values.
    for col in feature.columns:
        values = feature[col].values
        if np.any(np.isnan(values)) or np.any(np.isinf(values)):
            print(list(values))

    try:
        clf.fit(feature.values, target.values)
    except Exception:
        # Dump the data, then propagate the real failure. Swallowing it
        # would only surface later as a confusing error on ``clf.scores_``.
        for col in feature.columns:
            print(list(feature[col].values))
        raise

    scores = {
        col: abs(score) for col, score in zip(feature.columns, clf.scores_)
    }
    scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print(scores)

    position = {col: rank for rank, (col, _) in enumerate(scores)}
    print(position)
    return position
def statiblity(X, Y):
    """Return the per-feature ``scores_`` of a randomized logistic regression.

    The model is built with ``random_state=1`` and fitted on (X, Y).
    """
    from sklearn.linear_model import RandomizedLogisticRegression

    model = RandomizedLogisticRegression(random_state=1)
    return model.fit(X, Y).scores_
def feture_select_RLR():
    """Score the features returned by ``get_data()`` with ``RLR``.

    Returns:
        list of ``(name, score)`` pairs, scores rounded to 4 decimals,
        sorted by descending score.
    """
    data_x, data_y, names = get_data()
    selector = RLR()
    selector.fit(data_x, data_y)
    rounded_scores = [round(score, 4) for score in selector.scores_]
    return sorted(zip(names, rounded_scores),
                  key=lambda pair: pair[1],
                  reverse=True)
def lasso_regression(X, y):
    """Measure ROC AUC while dropping the lowest-scored features.

    Features are scored with a randomized logistic regression. Then, for
    each drop count in ``range(0, 100, 4)``, the that-many lowest-scored
    features are removed, a ``LogisticRegression(C=0.1)`` is trained on
    the train split and its ROC AUC on the test split is recorded. The
    AUC curve is plotted and shown.

    Returns:
        list of ROC AUC values, one per drop count.
    """
    selector = RandomizedLogisticRegression(C=1.0)
    selector.fit(X, y)
    print('Number of non zero valued coefficients: ',
          np.sum(selector.scores_ > 0))

    # argsort is ascending, so the lowest-scored features come first.
    imp_feature_idx = selector.scores_.argsort()
    X_train, X_test, y_train, y_test = split_examples(X, y)

    drop_counts = range(0, 100, 4)
    qualities = []
    for n_dropped in drop_counts:
        kept = imp_feature_idx[n_dropped:]
        model = LogisticRegression(C=0.1)
        model.fit(X_train[:, kept], y_train)
        positive_proba = model.predict_proba(X_test[:, kept])[:, 1]
        qualities.append(roc_auc_score(y_test, positive_proba))

    plt.plot(drop_counts, qualities)
    plt.show()
    return qualities
def randomized_Logistic_regression(self):
    """Fit a randomized logistic regression on ``self.data``.

    Column 0 of ``self.data`` is used as the label and the remaining
    columns as features.

    NOTE(review): the indices of the selected features are computed but
    neither returned nor stored, so the function returns ``None``.
    Confirm whether callers expect a result.
    """
    n_columns = len(self.data[0])
    features = self.data[:, 1:n_columns]
    labels = self.data[:, 0]

    model = RandomizedLogisticRegression()
    model.fit(features, labels)

    support_mask = model.get_support()
    selected = np.where(support_mask)
def Feature_sort(Feat_scale, Label, threads=4):
    """Rank the features with three feature-selection methods.

    Args:
        Feat_scale: Feature matrix passed to every selector.
        Label: Target labels.
        threads: ``n_jobs`` used by the random forest behind Boruta.

    Returns:
        dict with keys ``"Univariate_f"``, ``"Randomized_Logistic_f"`` and
        ``"Boruta_f"``; each value is the ``np.argsort`` ordering of the
        feature indices for that method.
    """
    # Univariate feature selection: order by ascending p-value.
    univariate = SelectKBest(f_classif, k='all')
    univariate.fit_transform(Feat_scale, Label)
    univariate_order = np.argsort(univariate.pvalues_)

    # Randomized logistic regression. Original author's notes: a bigger
    # n_resampling gives a more robust result; the ordering from roughly
    # position 1900 onward looked rather doubtful. The class is deprecated
    # in scikit-learn 0.19 and removed in 0.21.
    stability = RandomizedLogisticRegression(n_jobs=1,
                                             n_resampling=2000,
                                             selection_threshold=0,
                                             verbose=False,
                                             random_state=0)
    stability.fit(Feat_scale, Label)
    stability_order = np.argsort(-abs(stability.scores_))

    # Boruta on top of a random forest.
    forest = RandomForestClassifier(random_state=0,
                                    n_jobs=threads,
                                    max_features='auto')
    boruta = BorutaPy(forest, n_estimators='auto', perc=80, random_state=0)
    boruta.fit(Feat_scale, Label)
    boruta_order = np.argsort(boruta.ranking_)

    return {
        "Univariate_f": univariate_order,
        "Randomized_Logistic_f": stability_order,
        "Boruta_f": boruta_order,
    }
def randomized_Logistic_regression(self):
    """Run randomized logistic regression feature selection on ``self.data``.

    The first column of ``self.data`` is the label; every other column is
    a feature.

    NOTE(review): ``selected`` (the indices of the supported features) is
    a local that is never returned or stored. Verify whether this is
    intended.
    """
    data = self.data
    last_column = len(data[0])
    X = data[:, 1:last_column]
    y = data[:, 0]

    selector = RandomizedLogisticRegression()
    selector.fit(X, y)

    mask = selector.get_support()
    selected = np.where(mask)
def feature_selection(train, test, y):
    """Reduce *train* and *test* to the features chosen by ``RLR``.

    The selector is fitted on (train, y) only and then applied to both
    matrices.

    Returns:
        tuple ``(train, test)`` of the transformed matrices.
    """
    # The printed message reads "feature selection".
    print("特征选择")
    selector = RLR(C=10,
                   scaling=0.5,
                   sample_fraction=0.6,
                   n_resampling=200,
                   selection_threshold=0.4,
                   n_jobs=3)
    selector.fit(train, y)
    reduced_train = selector.transform(train)
    reduced_test = selector.transform(test)
    return reduced_train, reduced_test
def randlogistic(self, selection_threshold=0.25, sample_fraction=0.75):
    """Fit and return a randomized logistic regression on the instance data.

    Args:
        selection_threshold: Forwarded to ``RandomizedLogisticRegression``.
        sample_fraction: Forwarded to ``RandomizedLogisticRegression``.

    Returns:
        The model fitted on ``self.data.values`` / ``self.target.values``,
        built with ``C=self.C`` and ``normalize=False``.
    """
    options = dict(C=self.C,
                   selection_threshold=selection_threshold,
                   normalize=False,
                   sample_fraction=sample_fraction)
    rlr_model = RandomizedLogisticRegression(**options)
    rlr_model.fit(self.data.values, self.target.values)
    return rlr_model
def evaluate_stability(vocab, id_to_vec, mesh_to_id):
    """Score each vocabulary entry with a randomized logistic regression.

    Only the training features and labels from ``get_test_train`` are
    used; the other four returned values are ignored.

    Returns:
        dict mapping ``vocab[i]`` to the i-th entry of the fitted ``scores_``.
    """
    labels = ('Male', 'Female', 'Both')
    Xs, ids = get_basic_Xs(id_to_vec, mesh_to_id, shuffle=True)
    Xtr, Ytr, _, _, _, _ = get_test_train(labels, ids, Xs, 5)

    print('Fitting RandomizedLR...')
    logreg = RandomizedLogisticRegression(verbose=True,
                                          n_resampling=1000,
                                          n_jobs=16)
    logreg.fit(Xtr, Ytr)

    return dict((vocab[i], score) for i, score in enumerate(logreg.scores_))
def rank_features(algorithm, X, y):
    """Print the module-level ``features`` names ranked by *algorithm*.

    Supported values: ``'random_forest_rfe'``, ``'svm_rfe'``,
    ``'random_logistic_regression'``, ``'random_lasso'`` and ``'anova'``.
    Any other value prints an error message and calls ``exit(1)``.

    Args:
        algorithm: Name of the ranking method.
        X: Feature matrix.
        y: Target values.
    """

    def _print_ranked(values, descending=False):
        # Pair each value (rounded to 4 decimals) with its feature name
        # and print the names in sorted order.
        pairs = zip([round(v, 4) for v in values], features)
        for pair in sorted(pairs, reverse=descending):
            print(pair[1])

    # The RFE approach can be used with various different classifiers.
    if algorithm in ('random_forest_rfe', 'svm_rfe'):
        from sklearn.feature_selection import RFE
        if algorithm == 'random_forest_rfe':
            from sklearn.ensemble import RandomForestClassifier
            estimator = RandomForestClassifier(n_estimators=50,
                                               random_state=R_SEED,
                                               n_jobs=1)
        else:
            from sklearn.svm import SVC
            estimator = SVC(random_state=R_SEED, kernel='linear')
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)
        _print_ranked(selector.ranking_)
    elif algorithm == 'random_logistic_regression':
        # See http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/
        from sklearn.linear_model import RandomizedLogisticRegression
        rlasso = RandomizedLogisticRegression(random_state=R_SEED)
        rlasso.fit(X, y)
        _print_ranked(rlasso.scores_, descending=True)
    elif algorithm == 'random_lasso':
        from sklearn.linear_model import RandomizedLasso
        rlasso = RandomizedLasso(random_state=R_SEED)
        rlasso.fit(X, y)
        _print_ranked(rlasso.scores_, descending=True)
    elif algorithm == 'anova':
        from sklearn.feature_selection import f_classif
        F, pval = f_classif(X, y)
        random_array = random.random(len(pval))
        # Sort by p-value; ties are broken by the random secondary key.
        order = lexsort((random_array, pval))
        for i in order:
            print(features[i])
    else:
        print("Invalid algorithm: %s" % algorithm)
        exit(1)
def get_features(X_train, y_train, names, selection_threshold=0.2):
    """Select feature names with a randomized logistic regression.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        names: Feature names, aligned with the columns of *X_train*.
        selection_threshold: Forwarded to ``RandomizedLogisticRegression``.

    Returns:
        numpy array with the names whose support-mask entry is true.
    """
    print('\ngetting features with randomized logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))

    selector = RandomizedLogisticRegression(
        selection_threshold=selection_threshold)
    selector.fit(X_train, y_train)
    features = np.array(names)[selector.get_support()]

    as_list = list(features)
    print('found {} ngrams:'.format(len(as_list)))
    print(as_list)
    return features
def get_support_fields(X,Y):
    '''
    Fit a randomized logistic regression on (X, Y), print its scores and
    the selected column names, and return the selected columns as a matrix.

    NOTE(review): column names and the returned matrix are taken from the
    module-level ``data`` frame, not from ``X``. Confirm that the columns
    of ``data`` line up with the columns of ``X``.
    '''
    rlr = RLR()  # build the randomized logistic regression model
    rlr.fit(X, Y)  # train it
    # Feature-selection mask; per-feature scores are in ``scores_``.
    mask = rlr.get_support()
    print(rlr.scores_)
    selected_columns = data.columns[mask]
    print(u'有效特征为:%s' % (','.join(selected_columns)).decode('utf-8'))
    X = data[selected_columns].as_matrix()  # keep only selected features
    return X
def predictWithAdaBoost(config, X, Y, testFeatures):
    """Train an AdaBoost classifier on (X, Y) and predict *testFeatures*.

    When the ``useRandomLog`` flag of the ``model/adaboost`` config is
    set, features are first reduced with a randomized logistic regression.
    The reduction is applied to both matrices only if it keeps at least
    one value; otherwise the original features are used.

    Returns:
        Predictions of the fitted classifier for *testFeatures*.
    """
    adaConfig = config.getConfig('model/adaboost')

    if adaConfig.get('useRandomLog', False):
        selector = RandomizedLogisticRegression()
        selector.fit(X, Y)
        reduced = selector.transform(X)
        if reduced.size != 0:
            X = reduced
            testFeatures = selector.transform(testFeatures)

    booster = AdaBoostClassifier(n_estimators=50,
                                 learning_rate=1.0,
                                 algorithm='SAMME.R')
    booster.fit(X, Y)
    return booster.predict(testFeatures)
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    """Score features with a randomized logistic regression.

    The filtered frame holds one feature per row (it is transposed before
    fitting). Rows that are entirely NaN are dropped; remaining NaNs are
    filled by ``pc.fill_nan_features``.

    Args:
        data_set_df: Raw feature frame handed to ``pc.get_filtered_x_y``.
        user_info_df: User information handed to ``pc.get_filtered_x_y``.
        label: Name of the target attribute.

    Returns:
        DataFrame with one ``importance`` column, indexed by feature and
        sorted by descending importance.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered.dropna(how='all')
    x_imp = pc.fill_nan_features(x) if x.isnull().any().any() else x.values

    clf = RandomizedLogisticRegression()
    clf.fit(x_imp.T, y_v)

    # Index by the rows actually fitted (``x``), not by ``df_filtered``:
    # when dropna removed all-NaN rows the two indexes differ in length and
    # the DataFrame constructor would fail.
    # NOTE(review): assumes pc.fill_nan_features keeps the row count of x.
    feature_importances = DataFrame(clf.scores_,
                                    index=x.index,
                                    columns=['importance'])
    feature_importances.sort_values('importance',
                                    ascending=False,
                                    inplace=True,
                                    na_position='last')
    return feature_importances
def classify_logistic():
    """Select features, train a logistic regression and print its accuracy.

    ``RandomizedLogisticRegression`` is a feature selector and has no
    ``predict`` method, so it is used only to reduce the feature matrices.
    The prediction comes from a plain ``LogisticRegression`` trained on
    the reduced features. If the selector keeps no feature, the original
    features are used.
    """
    from sklearn.linear_model import LogisticRegression

    print("logistic regression")
    (X_train, y_train), (X_test, y_test) = util.load_all_feat()
    print("original X_train shape %s" % (X_train.shape,))

    selector = RandomizedLogisticRegression(n_jobs=2)
    selector.fit(X_train, y_train)
    X_train_selected = selector.transform(X_train)
    if X_train_selected.size != 0:
        X_test = selector.transform(X_test)
        X_train = X_train_selected

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print("accuracy score: %s" % (accuracy_score(y_test, pred),))
def getElgiibleFeatures(allFeatureParam, allLabelParam):
    '''
    Return the integer indices of the features selected by a randomized
    logistic regression fitted on the given features and labels.

    References:
    http://scikit-learn.org/stable/modules/feature_selection.html#randomized-l1
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html
    '''
    selector = RandomizedLogisticRegression()
    selector.fit(allFeatureParam, allLabelParam)
    return selector.get_support(indices=True)
def log_reg_feat_selection(X_train, y_train, X_valid, y_valid, random_state):
    """
    Score features with RandomizedLogisticRegression, save the scores to
    'save/feat_sel_log_reg.npy' and return them.

    NOTE(review): ``X_valid``, ``y_valid`` and ``random_state`` are not
    used; the model is always seeded with ``random_state=0``. Confirm
    whether the argument was meant to be forwarded.
    """
    settings = {
        'C': [0.001, 0.01, 0.1, 1.],
        'sample_fraction': 0.7,
        'n_resampling': 200,
        'selection_threshold': 0.25,
        'verbose': 5,
        'n_jobs': -1,
        'random_state': 0,
    }
    rlr = RandomizedLogisticRegression(**settings)
    rlr.fit(X_train, y_train)

    feature_scores = rlr.scores_
    np.save('save/feat_sel_log_reg.npy', feature_scores)
    return feature_scores
def predictWithQDA(config, X, Y, testFeatures):
    """Train a QDA classifier on (X, Y) and predict *testFeatures*.

    When the ``useRandomLog`` flag of the ``model/qda`` config is set,
    features are first reduced with a randomized logistic regression. The
    reduction is applied only if it keeps at least one value. The optional
    ``priors`` config entry is forwarded to ``QDA``.

    Returns:
        Predictions of the fitted classifier for *testFeatures*.
    """
    qdaConfig = config.getConfig('model/qda')

    if qdaConfig.get('useRandomLog', False):
        selector = RandomizedLogisticRegression()
        selector.fit(X, Y)
        reduced = selector.transform(X)
        if reduced.size != 0:
            X = reduced
            testFeatures = selector.transform(testFeatures)

    classifier = QDA(priors=qdaConfig.get('priors', None))
    classifier.fit(X, Y)
    return classifier.predict(testFeatures)
def test_rflasso():
    """Fit a randomized logistic regression and print the reduced shape.

    The module-level ``index_data`` / ``index_lable`` are split 75/25 with
    ``random_state=1``. Only the training part is used to fit the selector
    and to report the shape of the transformed matrix.
    """
    # Only the import that is actually used is kept. The previous unused
    # imports included sklearn.cross_validation, which no longer exists in
    # scikit-learn >= 0.20 and made the function fail for no reason.
    from sklearn.linear_model import RandomizedLogisticRegression

    train_X, _, train_Y, _ = train_test_split(index_data,
                                              index_lable,
                                              test_size=0.25,
                                              random_state=1)
    randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
    randomized_logistic.fit(train_X, train_Y)
    XX = randomized_logistic.transform(train_X)
    print(XX.shape)
def select_features(X, y):
    '''
    Select the relevant features from X that are useful for predicting the
    labels in y.

    Features are scored by stability selection. The score threshold is
    then chosen by 5-fold cross-validation of a logistic regression over
    candidate thresholds spaced ``1 / 50`` apart. Ties are resolved in
    favour of the larger threshold.

    Args:
        X: numpy 2D array containing input features
        y: numpy 1D array containing labels
    Returns:
        feature_list: List of indices of the selected important features
    '''
    # Get the selection model (stability selection).
    selection_model = RandomizedLogisticRegression(random_state=0)
    selection_model.fit(X, y)
    feature_scores = selection_model.scores_

    step_size = 50
    max_weight = int(max(feature_scores)) + 1
    # float() keeps this a true division on Python 2 as well.
    trial_thresholds = [
        i / float(step_size) for i in range(1, max_weight * step_size + 1)
    ]

    threshold = 0
    max_score = 0
    for trial in trial_thresholds:
        selected_features = [
            i for i, score in enumerate(feature_scores) if score > trial
        ]
        if not selected_features:
            continue
        model = LogisticRegression(multi_class='multinomial',
                                   class_weight='balanced',
                                   solver='newton-cg',
                                   random_state=0,
                                   max_iter=1000)
        score = cross_val_score(model, X[:, selected_features], y,
                                cv=5).mean()
        if score >= max_score:
            max_score = score
            # ``trial`` is already a threshold. Dividing by step_size again
            # would return a selection that was never cross-validated.
            threshold = trial

    return [i for i, score in enumerate(feature_scores) if score > threshold]
def learning_curves(X, y, clf, params, train_sizes=None,
                    feature_selection=False, n_folds=3, scoring='accuracy'):
    """
    Builds learning curves on test set, with parameters chosen on train and
    validation set using nested cross validation

    :param X: data
    :param y: labels
    :param clf: classificator (its parameters are overwritten with the best
        grid-search parameters of each fold)
    :param params: parameters for grid search
    :param train_sizes: train sizes for building learning curves
    :param feature_selection: whether to choose features by randomized
        logistic regression
    :param n_folds: number of outer cv folds
    :param scoring: scoring metric
    :return: train and test curve, averaged over the outer folds
    """
    if train_sizes is None:
        train_sizes = np.linspace(0.5, 1.0, 5)
    kf = KFold(X.shape[0], n_folds=n_folds)
    train_curve = np.zeros_like(train_sizes)
    test_curve = np.zeros_like(train_sizes)

    for train_inds, test_inds in kf:
        train_data, test_data = X[train_inds], X[test_inds]
        train_labels, test_labels = y[train_inds], y[test_inds]

        if feature_selection:
            rlr = RandomizedLogisticRegression()
            rlr.fit(train_data, train_labels)
            inds = [i for i in range(X.shape[1]) if rlr.all_scores_[i] > 0.0]
            print("%s  features chosen" % len(inds))
            train_data = train_data[:, inds]
            # The held-out fold must use the same feature subset as the
            # data the hyper-parameters were tuned on.
            test_data = test_data[:, inds]

        gs = GridSearchCV(clf, params, scoring=scoring, cv=5)
        gs.fit(train_data, train_labels)
        bp = gs.best_params_
        print("chosen params:  %s" % (bp,))
        # set_params also handles nested ``step__param`` names, which a
        # plain setattr would silently store as a meaningless attribute.
        clf.set_params(**bp)

        lc = learning_curve(clf, test_data, test_labels, scoring=scoring,
                            train_sizes=train_sizes)
        train_curve += lc[1].mean(axis=1)
        test_curve += lc[2].mean(axis=1)

    train_curve /= n_folds
    test_curve /= n_folds
    return train_curve, test_curve
def stability_test(x, y, model, names, score_type):
    """Return min-max normalised stability-selection scores per feature.

    A ``RandomizedLasso(alpha=0.025)`` is used when *score_type* is
    ``"r2"``, a ``RandomizedLogisticRegression`` otherwise. *model* is not
    used.

    Returns:
        ``[[0, name], ...]`` when all scores sum to zero. Otherwise a list
        of ``(score, name)`` tuples with scores rescaled to [0, 1] and
        rounded to 4 decimals.

    NOTE(review): if all scores are equal but non-zero, the range is 0 and
    the rescaling divides by zero. Confirm the desired result for that case.
    """
    if score_type == "r2":
        rlasso = RandomizedLasso(alpha=0.025)
    else:
        rlasso = RandomizedLogisticRegression()
    rlasso.fit(x, y)

    stability = rlasso.scores_
    if sum(stability) == 0:
        return [[0, el] for el in names]

    minval = min(stability)
    dist = max(stability) - minval
    rescaled = (stability - minval) / dist
    return list(zip([round(value, 4) for value in rescaled], names))
def feature_selection_class(predictors, responses, test_predictors,
                            selectFeatTech):
    """Reduce train and test predictors to a selected feature subset.

    Args:
        predictors: Training DataFrame.
        responses: Training targets.
        test_predictors: Test DataFrame with the same columns.
        selectFeatTech: ``0`` for chi-squared ``SelectKBest`` with k=500
            (fitted with ``-1`` values replaced by ``0``), ``1`` for
            randomized logistic regression.

    Returns:
        tuple ``(predictors_new, predictors_test_new)`` of DataFrames that
        keep the original indexes and the selected column names.

    Raises:
        ValueError: if *selectFeatTech* is neither 0 nor 1.
    """
    if selectFeatTech == 0:
        t = 500  # number of features to select
        model = SelectKBest(chi2, k=t).fit(predictors.replace(-1, 0),
                                           responses)
    elif selectFeatTech == 1:
        model = RandomizedLogisticRegression().fit(predictors, responses)
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(
            "selectFeatTech must be 0 or 1, got %r" % (selectFeatTech,))

    predictors_new = model.transform(predictors)
    predictors_test_new = model.transform(test_predictors)
    indices = model.get_support(indices=True)

    column_names = predictors.columns[indices]
    predictors_new = pd.DataFrame(predictors_new,
                                  index=predictors.index,
                                  columns=column_names)
    predictors_test_new = pd.DataFrame(predictors_test_new,
                                       index=test_predictors.index,
                                       columns=column_names)
    return predictors_new, predictors_test_new
def perform_stability_selection(X_train, y_train, round_id=0):
    """Print randomized-logistic-regression feature importances.

    *X_train* is first rescaled with ``perform_scaling(..., 'minmax')``.
    The scores are printed raw, then again rounded to 5 decimals and
    paired with ``header[1:]`` in descending order. Nothing is returned.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        round_id: Round number shown in the banner line.
    """
    # Original author's note on defaults: RandomizedLasso(alpha='aic',
    # scaling=0.5, sample_fraction=0.75, n_resampling=200, n_jobs=1).
    X_train = perform_scaling(X_train, scaling='minmax')
    print("Round%d - Stability selection -" % (round_id))

    rlog = RandomizedLogisticRegression(random_state=30,
                                        n_jobs=3,
                                        n_resampling=400)
    rlog.fit(X_train, y_train)

    print("Randomized Logistic Feature_Importances: ", rlog.scores_)
    rounded = [round(value, 5) for value in rlog.scores_]
    ranked = sorted(zip(rounded, header[1:]), reverse=True)
    print("Randomized Logistic Feature_Importances: ", ranked)
def log_reg_feat_selection(X_train, y_train, X_valid, y_valid, random_state):
    """
    Compute RandomizedLogisticRegression feature scores on the training
    data, persist them to 'save/feat_sel_log_reg.npy' and return them.

    NOTE(review): the validation arguments and ``random_state`` are
    accepted but unused; the estimator is built with ``random_state=0``.
    """
    rlr = RandomizedLogisticRegression(
        C=[0.001, 0.01, 0.1, 1.],
        sample_fraction=0.7,
        n_resampling=200,
        selection_threshold=0.25,
        verbose=5,
        n_jobs=-1,
        random_state=0,
    )
    rlr.fit(X_train, y_train)

    result = rlr.scores_
    np.save('save/feat_sel_log_reg.npy', result)
    return result
def tipdm_chapter5_test():
    """Select features of the bank-loan data, then fit a logistic regression.

    The first 8 columns of the spreadsheet are the features and column 8
    is the label. Features are filtered with a randomized logistic
    regression; a plain logistic regression is then trained on the
    selected columns and its training accuracy is printed.
    """
    # Parameter initialisation.
    filename = '../../../MyFile/chapter5/data/bankloan.xls'
    data = pd.read_excel(filename)
    feature_columns = data.columns[:8]
    x = data.iloc[:, :8].values
    y = data.iloc[:, 8].values

    # Feature selection with a randomized logistic regression.
    rlr = RLR()
    rlr.fit(x, y)
    # The support mask has one entry per feature column, so it must index
    # the 8 feature columns, not all columns including the label.
    selected = feature_columns[rlr.get_support()]
    print(u'通过随机逻辑回归模型筛选特征结束。')
    print(u'有效特征为: {0}'.format(','.join(selected)))
    x = data[selected].values  # keep only the selected features

    # Train and score on the selected features.
    lr = LR()
    lr.fit(x, y)
    print(u'逻辑回归模型训练结束。')
    print(u'模型的平均正确率为: {0}'.format(lr.score(x, y)))  # mean accuracy
def programmer_1():
    """Select bank-loan features, fit a logistic regression, print accuracy.

    Columns 0-7 of ``data/bankloan.xls`` are the features and column 8 is
    the label. The names of the supported columns are looked up on the
    frame with the '违约' column removed.
    """
    filename = "data/bankloan.xls"
    data = pd.read_excel(filename)
    features = data.iloc[:, :8].as_matrix()
    target = data.iloc[:, 8].as_matrix()

    selector = RLR()
    selector.fit(features, target)
    mask = selector.get_support()

    feature_names = data.drop('违约', axis=1).columns
    support_col = feature_names[mask]
    print("rlr_support_columns: {columns}".format(
        columns=','.join(support_col)))

    features = data[support_col].as_matrix()
    model = LR()
    model.fit(features, target)
    print("lr: {score}".format(score=model.score(features, target)))
def data_proc(self):
    """Select features, fit a weighted logistic regression and predict one row.

    After ``self.load_data()``, columns 0-7 of ``self.data`` are used as
    features and column 8 as the label. Features are filtered with a
    randomized logistic regression. The classifier is trained on the
    selected columns and used to predict a single hard-coded sample, which
    is reduced to the same selected features.
    """
    self.load_data()
    # iloc is purely positional: [rows, columns].
    feature_columns = self.data.columns[:8]
    x = self.data.iloc[:, :8].values
    y = self.data.iloc[:, 8].values

    # Filter the attributes with the randomized model first.
    rlr = RLR()
    rlr.fit(x, y)
    # One mask entry per feature column, so index the 8 feature columns
    # rather than every column of the frame.
    mask = rlr.get_support()
    selected = feature_columns[mask]
    print("有效特征为%s" % ','.join(selected))
    x = self.data[selected].values  # features after selection

    # class_weight guards against costly misclassification of one class;
    # 'balanced' would let scikit-learn derive the weights itself.
    lr = LR(class_weight={0: 0.9, 1: 0.1})
    # sample_weight carries one importance value per data row.
    # NOTE(review): the weight list has 5 entries, so this only works when
    # the data has exactly 5 rows. Confirm the intended weights.
    lr.fit(x, y, sample_weight=[1, 2, 3, 5, 4])

    # Reduce the sample to the selected features, because the model was
    # trained on those columns only.
    sample = [24, 2, 2, 0, 28, 17.3, 1.79, 3.06]
    sample = [value for value, keep in zip(sample, mask) if keep]
    result = lr.predict([sample])
    print('模型的正确率是:%s,预测结果是 %d' % (lr.score(x, y), result[0]))
def stable_select(df, y, rd_reg_columns, threshold=0.2, model='rlr'):
    """Score features with the chosen model and plot the importances.

    Args:
        df: DataFrame holding the feature columns and the target column.
        y: Name of the target column.
        rd_reg_columns: Names of the feature columns to score.
        threshold: ``selection_threshold`` for the randomized models.
        model: ``'rlr'`` (randomized logistic regression), ``'rls'``
            (randomized Lasso) or ``'rfr'`` (``RFR`` feature importances).

    Returns:
        Series named ``'score'``, indexed by feature name and sorted by
        descending score. A horizontal bar chart is also drawn.

    Raises:
        ValueError: if *model* is not one of the supported names.
    """
    if model not in ('rlr', 'rls', 'rfr'):
        # Previously an unknown model fell through and failed with a
        # NameError on ``scores``.
        raise ValueError(
            "model must be 'rlr', 'rls' or 'rfr', got %r" % (model,))

    X = df.loc[:, rd_reg_columns]
    Y = df[y]

    if model == 'rlr':
        # Randomized logistic regression.
        rlr = RLR(scaling=0.5,
                  sample_fraction=0.75,
                  n_resampling=300,
                  selection_threshold=threshold)
        rlr.fit(X, Y)
        scores = rlr.scores_
    elif model == 'rls':
        # Randomized Lasso regression.
        rls = RLS(scaling=0.5,
                  sample_fraction=0.75,
                  n_resampling=300,
                  selection_threshold=threshold)
        rls.fit(X, Y)
        scores = rls.scores_
    else:
        rf = RFR()
        rf.fit(X, Y)
        scores = rf.feature_importances_

    result = (pd.Series(dict(zip(X.columns, scores)))
              .rename('score')
              .sort_values(ascending=False))
    plt.figure(figsize=(20, 10))
    result.plot.barh(title='Feature Importances', color='lightblue')
    plt.ylabel('Feature Importance Score')
    return result
def randomlr(train_x, train_y, cv_x, test_x, regp, alpha=0.5):
    """Reduce three feature matrices with a randomized logistic regression.

    The selector is fitted once on the training data and then applied to
    the cross-validation and test matrices.

    Args:
        train_x: Training features.
        train_y: Training labels.
        cv_x: Cross-validation features.
        test_x: Test features.
        regp: Regularisation parameter ``C`` of the selector.
        alpha: ``scaling`` parameter of the selector.

    Returns:
        tuple ``(train_x, cv_x, test_x)`` of the transformed matrices.
    """
    selector = RandomizedLogisticRegression(C=regp,
                                            scaling=alpha,
                                            fit_intercept=True,
                                            sample_fraction=0.75,
                                            n_resampling=200)
    # A single fit_transform: the earlier separate fit() was immediately
    # repeated by fit_transform(), doubling the resampling cost.
    train_x = selector.fit_transform(train_x, train_y)
    cv_x = selector.transform(cv_x)
    test_x = selector.transform(test_x)
    return train_x, cv_x, test_x
def pick_variables(self, descover=True, method="rlr", threshold=0.25,
                   auto_pick=True):
    """Variable-picking helper (feature selection).

    With ``method="rlr"`` a randomized logistic regression is fitted on
    ``self.X_train`` / ``self.y_train``. ``self.X_train`` and
    ``self.X_test`` are then reduced to the supported columns. When
    *auto_pick* is true, ``self.picked_data`` is set to those columns plus
    ``"y"`` from ``self.data``.

    Background from the original author: stability selection runs a
    selection algorithm repeatedly on different subsets of the data and of
    the features and aggregates the results, e.g. as the frequency with
    which a feature is judged important. Ideally important features score
    close to 100%, weaker ones score above zero, and useless ones score
    close to 0.

    Args:
        descover: Not used in the visible code.
        method: Selection method; only ``"rlr"`` is handled here.
        threshold: ``selection_threshold`` of the model (default 0.25).
        auto_pick: Whether to store the reduced data on the instance.

    Returns:
        DataFrame of ``all_scores_`` indexed by the original training
        columns, with column ``'var_score'``.

    NOTE(review): the source was collapsed onto one line. This assumes the
    whole body, including the ``return``, belongs to the ``"rlr"`` branch,
    so other methods return ``None``. Verify against the original layout.
    """
    if method == "rlr":
        selector = RandomizedLogisticRegression(selection_threshold=threshold)
        selector.fit(self.X_train, self.y_train)

        train_columns = self.X_train.columns
        # Aggregated per-feature scores.
        scoretable = pd.DataFrame(selector.all_scores_,
                                  index=train_columns,
                                  columns=['var_score'])

        # Mask of the selected features -> list of column names.
        columns_need = list(train_columns[selector.get_support()])
        self.X_train = self.X_train[columns_need]
        self.X_test = self.X_test[columns_need]

        columns_need.append("y")
        if auto_pick:
            self.picked_data = self.data[columns_need]
        return scoretable
def programmer_1():
    """Select bank-loan features, fit a logistic regression, print accuracy.

    Columns 0-7 of ``bankloan.xls`` are the features and column 8 is the
    label. The support mask is padded with one ``False`` so that it lines
    up with all nine columns of the frame.
    """
    # Parameter initialisation.
    filename = r'bankloan.xls'
    data = pd.read_excel(filename)
    features = data.iloc[:, :8].as_matrix()
    labels = data.iloc[:, 8].as_matrix()

    # Randomized logistic regression for feature selection.
    selector = RLR()
    selector.fit(features, labels)
    # Append False for the label column so the mask matches data.columns.
    egeList = np.append(selector.get_support(), False)
    print("rlr.get_support():")
    print(egeList)
    print(u'随机逻辑回归模型特征选择结束!!!')

    chosen_columns = data.columns[egeList]
    print(u'有效特征为:%s' % ','.join(chosen_columns))
    features = data[chosen_columns].as_matrix()  # selected features only

    # Train the logistic regression on the selected features.
    classifier = LR()
    classifier.fit(features, labels)
    print(u'逻辑回归训练模型结束!!!')
    # Mean accuracy on the training data.
    print(u'模型的平均正确率:%s' % classifier.score(features, labels))
def rdlg_variables(X, y, threshold=0.25):
    """Select the columns of X that a randomized logistic regression supports.

    Background from the original author: stability selection runs a
    selection algorithm repeatedly on different subsets of the data and of
    the features and aggregates the results. Ideally important features
    score close to 100%, weaker ones score above zero and useless ones
    score close to 0. Unlike Lasso, a good feature does not drop to 0 just
    because similar or correlated features exist.

    Args:
        X: Feature DataFrame.
        y: Target values.
        threshold: ``selection_threshold`` of the model; also the cut-off
            applied to the score table (default 0.25).

    Returns:
        tuple ``(scoretable, refesh_data)``: the score table with columns
        ``'Col'`` and ``'value_retio'``, and X restricted to the columns
        whose score is above *threshold*.
    """
    rlr = RandomizedLogisticRegression(selection_threshold=threshold)
    rlr.fit(X, y)

    # Aggregated per-feature scores, one row per column of X.
    scoretable = (
        pd.DataFrame(rlr.all_scores_, index=X.columns)
        .reset_index()
        .rename(columns={'index': 'Col', 0: 'value_retio'}, copy=False)
    )

    # Keep only the rows whose score exceeds the threshold.
    above_threshold = scoretable[scoretable.value_retio > threshold]
    refesh_data = X[list(above_threshold['Col'])]
    return scoretable, refesh_data
def logistic(X_train, X_test, y_train, y_test):
    """Feature-select, train a logistic regression and report metrics.

    Features are chosen with a randomized logistic regression fitted on
    the training data. Both frames are reduced to the supported columns
    before the classifier is trained. Test predictions are passed to
    ``metrics_result``.

    Returns:
        tuple ``(pred_prob, pred_prob_train)``: predicted class
        probabilities for the test and the training data.
    """
    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.linear_model import RandomizedLogisticRegression as RLR

    # Feature engineering: keep only the supported columns.
    selector = RLR()
    selector.fit(X_train, y_train)
    mask = selector.get_support()
    print(mask)
    x = X_train[X_train.columns[mask]].as_matrix()
    x_test = X_test[X_test.columns[mask]].as_matrix()

    # Logistic regression on the reduced features.
    lr = LR()
    lr.fit(x, y_train)
    pred_prob_train = lr.predict_proba(x)
    pred_prob = lr.predict_proba(x_test)

    print('logistic')
    metrics_result(y_test, lr.predict(x_test))
    return pred_prob, pred_prob_train
def logistic_regression():
    """Fit a logistic regression on the bank-loan data and print accuracy.

    Columns 0-7 of the spreadsheet are the features and column 8 is the
    label. A randomized logistic regression is fitted as well, but its
    selection result is not applied: the classifier is trained on all
    eight features.
    """
    # Parameter initialisation.
    filename = SRC_PATH + '/data/bankloan.xls'
    data = pd.read_excel(filename)
    print(data.head())
    print(data.tail())

    x = data.iloc[:, :8].as_matrix()
    y = data.iloc[:, 8].as_matrix()
    print("%s %s" % (x, y))

    # Randomized logistic regression for variable screening.
    selector = RLR()
    selector.fit(x, y)
    # Selection mask (result unused); scores are available via ``scores_``.
    selector.get_support()
    print(u'通过随机逻辑回归模型筛选特征结束。')

    # Train the logistic regression; the feature filter is not applied.
    classifier = LR()
    classifier.fit(x, y)
    print(u'逻辑回归模型训练结束。')
    # Mean accuracy on the training data.
    print(u'模型的平均正确率为:%s' % classifier.score(x, y))
def feature_selection_tech(predictors, responses, test_predictors,
                           selectFeatTech):
    """Reduce train and test predictors to a selected feature subset.

    Args:
        predictors: Training feature matrix.
        responses: Training targets.
        test_predictors: Test feature matrix with the same columns.
        selectFeatTech: ``0`` for chi-squared ``SelectKBest`` with k=40,
            ``1`` for randomized logistic regression.

    Returns:
        tuple ``(predictors_new, predictors_test_new, indices)``: the two
        transformed matrices and the integer indices of the kept features.

    Raises:
        ValueError: if *selectFeatTech* is neither 0 nor 1.
    """
    if selectFeatTech == 0:
        # Fixed feature count. The earlier "40% of the columns" value was
        # immediately overwritten and never used.
        t = 40
        model = SelectKBest(chi2, k=t).fit(predictors, responses)
    elif selectFeatTech == 1:
        model = RandomizedLogisticRegression().fit(predictors, responses)
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(
            "selectFeatTech must be 0 or 1, got %r" % (selectFeatTech,))

    predictors_new = model.transform(predictors)
    predictors_test_new = model.transform(test_predictors)
    indices = model.get_support(indices=True)
    return predictors_new, predictors_test_new, indices
def feature_selection_tech(predictors, responses, test_predictors,
                           selectFeatTech):
    """Select features with chi-squared k-best or randomized logistic regression.

    Args:
        predictors: Training feature matrix.
        responses: Training targets.
        test_predictors: Test feature matrix with the same columns.
        selectFeatTech: ``0`` selects the 40 best features by chi-squared
            score, ``1`` uses a randomized logistic regression.

    Returns:
        tuple ``(predictors_new, predictors_test_new, indices)``: the
        reduced train and test matrices and the indices of kept features.

    Raises:
        ValueError: if *selectFeatTech* is neither 0 nor 1.
    """
    if selectFeatTech == 0:
        # k is fixed at 40. The percentage-based value computed before it
        # was dead code.
        selector = SelectKBest(chi2, k=40)
    elif selectFeatTech == 1:
        selector = RandomizedLogisticRegression()
    else:
        # Fail with a clear message, not an UnboundLocalError.
        raise ValueError(
            "selectFeatTech must be 0 or 1, got %r" % (selectFeatTech,))

    model = selector.fit(predictors, responses)
    predictors_new = model.transform(predictors)
    predictors_test_new = model.transform(test_predictors)
    indices = model.get_support(indices=True)
    return predictors_new, predictors_test_new, indices
def lasso_regression(X, y):
    """
    Score features with Randomized Logistic Regression, then record the
    ROC AUC of a LogisticRegression(C=0.1) while the lowest-scored features
    are removed 4 at a time (0, 4, ..., 96 features dropped).

    The resulting curve is plotted and shown. Returns the list of AUC
    values, one per drop count.
    """
    scorer = RandomizedLogisticRegression(C=1.0)
    scorer.fit(X, y)
    print('Number of non zero valued coefficients: ',
          np.sum(scorer.scores_ > 0))

    # Ascending order: least important features first.
    imp_feature_idx = scorer.scores_.argsort()
    X_train, X_test, y_train, y_test = split_examples(X, y)

    steps = range(0, 100, 4)
    qualities = []
    for start in steps:
        columns = imp_feature_idx[start:]
        classifier = LogisticRegression(C=0.1)
        classifier.fit(X_train[:, columns], y_train)
        proba = classifier.predict_proba(X_test[:, columns])
        qualities.append(roc_auc_score(y_test, proba[:, 1]))

    plt.plot(steps, qualities)
    plt.show()
    return qualities
def select_top_features(X, y, ques_dict, top_count=7):
    '''
    Run RandomizedLogisticRegression and return the top-scoring features.

    Args:
        X(dataframe) -- features
        y(dataframe) -- outcome
        ques_dict(dict) -- variable name to questions dictionary
        top_count(int) -- number of top features to return

    Returns:
        list of (question, score) tuples with a score above zero, highest
        score first, truncated to ``top_count`` entries.
    '''
    rand_log = RandomizedLogisticRegression()
    fitted = rand_log.fit(X, y)
    questions = features_to_questions(X.columns, ques_dict)

    ranked = sorted(zip(questions, fitted.scores_),
                    key=lambda tup: tup[1],
                    reverse=True)
    positive = [pair for pair in ranked if pair[1] > 0]
    return positive[:top_count]
def rank_random_logistic_regression(self,
                                    features_indep_df: PandasDataFrame,
                                    feature_target: List,
                                    n_jobs: int = -1,
                                    **kwargs: Any) -> object:
    """Use Randomized Logistic Regression to rank features.

    The fitted model exposes ``scores_`` and ``all_scores_``.

    :param features_indep_df: the independent features, which are inputted
        into the model.
    :param feature_target: the target feature, which is being estimated.
    :param n_jobs: number of CPUs to use during the resampling. If '-1',
        use all the CPUs.
    :param kwargs: extra estimator options, e.g. C=1, scaling=0.5,
        sample_fraction=0.75, n_resampling=200, selection_threshold=0.25,
        tol=0.001, fit_intercept=True, verbose=False, normalize=True,
        random_state=None, pre_dispatch='3*n_jobs'
    :return: the importance ranking model (the return value of ``fit``).
    """
    self.__logger.debug("Run Random Logistic Regression.")
    ranker = RandomizedLogisticRegression(n_jobs=n_jobs, **kwargs)
    fitted = ranker.fit(features_indep_df, feature_target)
    return fitted
#-*- coding:utf-8 -*-
# Peishichao
# Bank-loan example: select features with a randomized logistic regression,
# then train a logistic regression on the selected columns.
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

filename = '../data/bankloan.xls'
data = pd.read_excel(filename)

# Columns 0-7 are the features, column 8 is the label.
feature_columns = data.columns[:8]
x = data.iloc[:, :8].values
y = data.iloc[:, 8].values

rlr = RLR()
rlr.fit(x, y)
support = rlr.get_support()
print(support)
print('end')

# The mask has one entry per feature column, so it must index the 8
# feature columns. Indexing all 9 columns raises an IndexError on
# NumPy >= 1.13.
x = data[feature_columns[support]].values
print(x)

lr = LR()
lr.fit(x, y)
print('end')
print('accur: %s' % lr.score(x, y))
# -*- coding:utf-8 -*-
# Logistic regression: automatic modelling.
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

data = pd.read_excel("c://mldata//bankloan.xls", header=0)
# Original author's note: reading the data via data.iloc[...].as_matrix()
# versus the approach below makes a difference in precision.
train_data = data.values  # convert the loaded data to a matrix
train_x = train_data[0::, :8]
train_label = train_data[0::, 8]
feature_columns = data.columns[:8]

rlr = RLR()  # randomized model used to screen the variables
rlr.fit(train_x, train_label)  # train it
# The selection mask has one entry per feature column, so it must index
# the 8 feature columns. Indexing all columns raises an IndexError on
# NumPy >= 1.13.
selected = feature_columns[rlr.get_support()]
print(u"特征筛选结束")
print(u"有效特征为:%s" % u'、'.join(selected))
x = data[selected].values  # the selected features

lr = LR()
lr.fit(x, train_label)  # train on the selected features
print(u'逻辑回归训练结束')
print(u'模型的平均正确率为:%s' % lr.score(x, train_label))
from __future__ import division import numpy as np from sklearn.linear_model import RandomizedLogisticRegression from sklearn.linear_model import LogisticRegression X = np.load("../feats/train_formatted.npy") y = np.load("../feats/train_y.npy") X_test = np.load("../feats/test_formatted.npy") y_test = np.load("../feats/test_y.npy") clf = RandomizedLogisticRegression() clf.fit(X, y) scores = clf.scores_ print 'Index : score' sortedIdx = [i[0] for i in sorted(enumerate(scores), key=lambda x:x[1], reverse=True)] top = 30 for i in range(top): print str(sortedIdx[i]) + ' : ' + str(scores[sortedIdx[i]]) lr = LogisticRegression() lr.fit(clf.transform(X), y) pred = lr.predict(clf.transform(X_test)) accuracy = sum(pred == y_test)/y_test.size print 'Logistic Regression Accuracy: ' + str(accuracy)
def evaluate_model(model, X, y, labels, save_features=False,
                   group_ablation=False, feature_output_name="features.csv",
                   ticks=XTICKS):
    """Cross-validate *model* over a range of top-k feature counts.

    For every fold of a 10-way ``GroupKFold`` (grouped by *labels*) and
    every ``k`` in *ticks*, the ``k`` features returned by
    ``util.get_top_pearson_features`` are selected, *model* is fitted and
    its F1 score on the held-out fold is recorded.

    Args:
        model: Estimator with ``fit`` / ``predict``; it is refitted in place.
        X: Feature DataFrame.
        y: Target Series.
        labels: Group labels used to build the folds.
        save_features: When true, a randomized logistic regression is
            fitted per fold and the fold-averaged feature scores are
            written to ``output/feature_scores/<feature_output_name>``.
        group_ablation: When true, each feature group from
            ``feature_sets.get_all_groups()`` is removed in turn and the
            change in ROC AUC is printed.
        feature_output_name: File name of the feature-score CSV.
        ticks: Feature counts to evaluate.

    Returns:
        numpy array of F1 scores with one row per fold and one column per
        entry of *ticks*.

    NOTE(review): the source was collapsed onto one line. This assumes the
    group-ablation step runs inside the loop over *ticks*, once per
    feature count. Verify against the original layout.
    """
    model_fs = RandomizedLogisticRegression(C=1, random_state=1)

    # Split into folds using the group labels.
    group_kfold = GroupKFold(n_splits=10).split(X, y, groups=labels)
    folds = []

    # For feature analysis.
    feat_scores = []

    # For the group ablation study.
    feature_groups = feature_sets.get_all_groups()
    ablated = {key: set() for key in feature_groups.keys()}
    roc_ab = {key: list() for key in feature_groups.keys()}
    roc_ab['true_roc_score'] = []

    for train_index, test_index in group_kfold:
        print("processing fold: %d" % (len(folds) + 1))

        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        scores = []
        # Iterate over the ``ticks`` argument. The global XTICKS was used
        # before, which silently ignored caller-supplied ticks.
        for k in ticks:
            indices = util.get_top_pearson_features(X_train, y_train, k)

            # Select k features.
            X_train_fs = X_train[:, indices]
            X_test_fs = X_test[:, indices]

            model = model.fit(X_train_fs, y_train)
            yhat = model.predict(X_test_fs)
            scores.append(f1_score(y_test, yhat))

            if group_ablation:
                true_roc_score = roc_auc_score(y_test, yhat)
                roc_ab['true_roc_score'].append(true_roc_score)

                for group in feature_groups.keys():
                    # Column indices of this group's features.
                    features = feature_groups[group]
                    features_idx = util.get_column_index(features, X)

                    indices_ab = [i for i in indices
                                  if i not in features_idx]
                    removed_indices = [i for i in indices
                                       if i in features_idx]

                    X_train_ab = X_train[:, indices_ab]
                    X_test_ab = X_test[:, indices_ab]

                    model_ab = model.fit(X_train_ab, y_train)
                    yhat_ab = model_ab.predict(X_test_ab)

                    ablated[group].update(X.columns[removed_indices])
                    roc_ab_score = roc_auc_score(y_test, yhat_ab)
                    roc_ab[group].append(roc_ab_score - true_roc_score)

        folds.append(scores)

        if save_features:
            model_fs = model_fs.fit(X_train, y_train)
            feat_scores.append(model_fs.scores_)

    if save_features:
        # Average the per-fold scores. The average used to be computed and
        # then discarded in favour of the last fold's scores.
        mean_scores = np.asarray(feat_scores).mean(axis=0)
        # Map scores to feature names and sort by descending score.
        ranked = sorted(zip(X.columns, [round(s, 4) for s in mean_scores]),
                        reverse=True,
                        key=lambda x: x[1])
        feat_scores = pd.DataFrame(ranked)

        csv_path = "output/feature_scores/" + feature_output_name
        feat_scores.to_csv(csv_path, index=False)
        util.print_full(feat_scores)

    if group_ablation:
        roc_ab = pd.DataFrame(roc_ab).mean()
        print("=======================")
        print("True AUC Score: %f" % roc_ab['true_roc_score'])
        print("=======================\n\n")

        for group in ablated.keys():
            print("-----------------------")
            print("Group: %s " % group)
            print("Removed: %s" % list(ablated[group]))
            print("Change in AUC: %f" % (roc_ab[group]))
            print("-----------------------\n")

    folds = np.asarray(folds)
    return folds
# Code listing 5-1: logistic regression.
import pandas as pd
# Logistic regression model.
from sklearn.linear_model import LogisticRegression as LR
# Randomized logistic regression model.
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Parameter initialisation.
fileName = 'data/bankloan.xls'
data = pd.read_excel(fileName)
# Columns 0-7 are the features, column 8 is the label.
feature_columns = data.columns[:8]
x = data.iloc[:, :8].values
y = data.iloc[:, 8].values

# Build and train the randomized logistic regression that screens the
# variables.
rlr = RLR()
rlr.fit(x, y)
# The mask has one entry per feature column, so it must index the 8
# feature columns. Indexing all 9 columns raises an IndexError on
# NumPy >= 1.13.
selected = feature_columns[rlr.get_support()]
print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为 %s' % '.'.join(selected))
# Keep only the selected features.
x = data[selected].values

# Build the logistic regression and train it on the selected features.
lr = LR()
lr.fit(x, y)
print(u'逻辑回归模型训练结束。')
# Mean accuracy of the model on the training data.
print(u'模型的平均正确率为 %s' % lr.score(x, y))
import pandas as pda
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Select features with a randomized logistic regression, then train and
# score a logistic regression on the selected columns.
fname = "C:/Users/Administrator/Desktop/data/luqu.xls"
dataf = pda.read_excel(fname)

# Column 0 is the label, columns 1-3 are the features
# (iloc is purely positional).
feature_columns = dataf.columns[1:4]
x = dataf.iloc[:, 1:4].values
y = dataf.iloc[:, 0:1].values

r1 = RLR()
r1.fit(x, y)
# Mask of the effective features, one entry per column of ``x``.
eff = r1.get_support()

# Apply the mask to the feature columns only. Applying it to all columns
# of the frame shifts it by one and can select the label column.
t = dataf[feature_columns[eff]].values

r2 = LR()
r2.fit(t, y)
print("training ends")
# Score on the matrix the model was trained on. ``x`` may have a different
# number of columns than ``t``.
print("accuracy: " + str(r2.score(t, y)))
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import RandomizedLogisticRegression
import fmriUtils as fm  # project-specific helper functions

# Select features with a randomized logistic regression, then report the
# stratified k-fold accuracy of a linear SVM on the reduced data.
n_folds = 10
f = fm.outTo()  # per the original note: redirects output to a file
X, y = fm.loadData2()
X2, y2 = fm.loadData2()
y = fm.defineClass(y)

# NOTE(review): features are selected on the full data set before the
# cross-validation split, so the reported accuracy may be optimistic.
randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
randomized_logistic.fit(X, y)
XX = randomized_logistic.transform(X)
print("============选择后剩余的特征================")
print(XX.shape)

yy = y
cv = StratifiedKFold(yy, n_folds)
cv_scores = []
for train, test in cv:
    svc = SVC(kernel='linear')
    svc.fit(XX[train], yy[train])
    prediction = svc.predict(XX[test])
    # ``np`` was used here without ever being imported.
    cv_scores.append(
        np.sum(prediction == yy[test]) / float(np.size(yy[test])))

print("========分类准确率=======")
print("%s %s" % (cv_scores, np.mean(cv_scores)))
#-*- coding: utf-8 -*-
# Logistic regression: automatic modelling.
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Parameter initialisation.
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
# Columns 0-7 are the features, column 8 is the label.
feature_columns = data.columns[:8]
x = data.iloc[:, :8].values
y = data.iloc[:, 8].values

rlr = RLR()  # randomized logistic regression that screens the variables
rlr.fit(x, y)  # train it
# Selection mask (per-feature scores are available via ``scores_``). It has
# one entry per feature column, so it must index the 8 feature columns.
# Indexing all 9 columns raises an IndexError on NumPy >= 1.13.
selected = feature_columns[rlr.get_support()]
print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为:%s' % ','.join(selected))
x = data[selected].values  # keep only the selected features

lr = LR()  # logistic regression model
lr.fit(x, y)  # train on the selected features
print(u'逻辑回归模型训练结束。')
print(u'模型的平均正确率为:%s' % lr.score(x, y))  # mean training accuracy