def final_feats(df_data):
    """Build the final training feature matrix from the raw data frame.

    Steps, in order:

    1. take columns 1..369 of ``df_data`` as the raw features (per the
       original author's note this leaves out the "ID" and "TARGET" columns);
    2. project a column-normalized copy onto its first two principal
       components;
    3. drop constant columns, then duplicated columns;
    4. keep only the columns picked by an L1-penalized ``LinearSVC``;
    5. insert the two PCA projections as extra columns.

    Returns
    -------
    tuple
        ``(X_train_new, y_train, feat_ix_keep, pca, del_constants,
        del_identicals)``.
    """
    x_train = df_data.iloc[:, 1:370]

    # First two principal components, computed before any column is removed.
    pca = PCA(n_components=2)
    x_train_projected = pca.fit_transform(normalize(x_train, axis=0))

    # Drop zero-variance columns (in this data set: the all-zero columns) ...
    x_train, del_constants = remove_feat_constants(x_train)
    # ... then drop columns identical to another one, keeping a single copy.
    x_train, del_identicals = remove_feat_identicals(x_train)

    y_train = df_data["TARGET"]

    # L1-based feature selection on the remaining columns.
    lsvc = svm.LinearSVC(C=0.01, penalty="l1", dual=False)
    lsvc.fit(x_train, y_train)
    selector = SelectFromModel(lsvc, prefit=True)

    # Work with column indices rather than ``transform`` so the result stays
    # a data frame instead of becoming a bare matrix.
    feat_ix_keep = selector.get_support(indices=True)
    every_ix = np.arange(x_train.columns.size)
    rejected_ix = np.delete(every_ix, feat_ix_keep)
    rejected_labels = x_train.columns[rejected_ix]
    X_train_new = x_train.drop(labels=rejected_labels, axis=1)

    # Both inserts target position 1, so 'PCATwo' ends up before 'PCAOne'.
    for column_name, component in (('PCAOne', 0), ('PCATwo', 1)):
        X_train_new.insert(1, column_name, x_train_projected[:, component])

    return X_train_new, y_train, feat_ix_keep, pca, del_constants, del_identicals
def selecttest():
    """Plot the two features of the Boston data set that survive selection.

    A ``LassoCV`` model is fitted through ``SelectFromModel`` with a starting
    threshold of 0.25; the threshold is then raised in steps of 0.1 until at
    most two features remain, and those two are scatter-plotted against each
    other.

    Returns
    -------
    None
        The function only shows a matplotlib figure.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LassoCV

    boston = load_boston()
    X, y = boston['data'], boston['target']

    clf = LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X, y)

    # Transform once up front: the loop below may not run at all (the initial
    # threshold can already leave <= 2 features), and X_transform is needed
    # for plotting either way.
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

    # The threshold attribute can be changed directly; the selector does not
    # have to be refitted for the new value to take effect.
    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    plt.title(
        "Features selected from Boston using SelectFromModel with "
        "threshold %0.3f." % sfm.threshold)
    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Feature number 1")
    plt.ylabel("Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()
def tree_based_selection(self, data_set, data_target, feature_names):
    """Select features by importance using an ``ExtraTreesClassifier``.

    :param data_set: 2-D array of samples; it is indexed as
        ``data_set[0][i]``, so it must support row/column indexing.
    :param data_target: target values passed to ``clf.fit``.
    :param feature_names: sequence of names, indexed by column position.
    :return: ``(feature_set, fea_index)`` where ``feature_set`` is the
        reduced data and ``fea_index`` is the list of selected column
        indices into ``data_set``.
    """
    clf = ExtraTreesClassifier()
    clf = clf.fit(data_set, data_target)
    print(clf.feature_importances_)

    model = SelectFromModel(clf, prefit=True)
    feature_set = model.transform(data_set)

    # Ask the selector which columns it kept instead of comparing every input
    # column against every output column: that scan was O(n_in * n_out) full
    # column comparisons and reported wrong or repeated indices whenever two
    # columns held identical values.
    fea_index = model.get_support(indices=True).tolist()

    # Show the first sample's value for each selected feature, keyed by name.
    check = {feature_names[i]: data_set[0][i] for i in fea_index}
    print(np.array(check))
    return feature_set, fea_index
def select_features(data, neg_cmpd, pos_cmpd, compound_col="Metadata_compound", C=0.01):
    """
    Return the features selected by an L1-penalized linear SVC.

    Parameters
    -----------
    data : pandas DataFrame
    neg_cmpd : string
        name of negative control in compound_col
    pos_cmpd : string
        name of positive control in compound_col
    compound_col : string
        name of column in data that contains compound labels
    C : float (default=0.01)
        Sparsity; the lower the number, the fewer features are selected

    Returns
    -------
    selected_features : list
        Selected features
    """
    X, Y = _split_classes(data, neg_cmpd, pos_cmpd, compound_col)

    classifier = LinearSVC(C=C, penalty="l1", dual=False)
    classifier.fit(X, Y)
    selector = SelectFromModel(classifier, prefit=True)

    # Boolean mask over the columns of X, applied to the column names.
    keep = np.array(selector.get_support())
    column_names = np.array(X.columns.tolist())
    return list(column_names[keep])
def lassoCV_regression(data, target, alphas):
    """Cross-validate a ``LassoCV`` model on its two strongest features.

    The selector starts at a threshold of 0.25, which is raised in steps of
    0.1 until at most two features remain. The reduced data is then scored
    with 10-fold cross-validation and the per-fold RMSE values are plotted.

    :param data: 2-D array of samples, indexable by an index array.
    :param target: 1-D array of target values, indexable by an index array.
    :param alphas: accepted for interface compatibility; not used here.
    :return: list with the RMSE of each of the folds.
    """
    clf = LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)

    # Transform before the loop so data_transform is bound even when the
    # starting threshold already leaves <= 2 features (the loop body would
    # then never execute).
    data_transform = sfm.transform(data)
    n_features = data_transform.shape[1]
    while n_features > 2:
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]

    rmses = []
    # Positional arguments follow the KFold signature this file imports.
    # NOTE(review): looks like the legacy KFold(n, n_folds, shuffle,
    # random_state) form - confirm against the import.
    kf = KFold(len(target), 10, True, None)
    for train_index, test_index in kf:
        data_train, data_test = data_transform[train_index], data_transform[test_index]
        target_train, target_test = target[train_index], target[test_index]
        clf.fit(data_train, target_train)
        rmse = sqrt(np.mean((clf.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)

    x0 = np.arange(1, 11)
    plt.figure()
    plt.plot(x0, rmses, label='LassoCV')
    plt.legend()
    plt.show()
    return rmses
def run():
    """Select features with a random forest, then refit and report on them.

    Loads the data set, fits a ``RandomForestRegressor`` on all features,
    keeps the features chosen by ``SelectFromModel``, refits the forest on a
    75 % split of the reduced data and prints the held-out mean squared
    error together with the features ranked by importance.
    """
    allKeys, X, y = loadData(
        "../../data/household_electricity_usage/recs2009_public.csv",
        label="BTUEL",
        otherRemove=[
            "KWH", "KWHSPH", "KWHCOL", "KWHWTH", "KWHRFG", "KWHOTH",
            "BTUEL", "BTUELSPH", "BTUELCOL", "BTUELWTH", "BTUELRFG", "BTUELOTH",
            "DOLLAREL", "DOLELSPH", "DOLELCOL", "DOLELWTH", "DOLELRFG", "DOLELOTH",
            "TOTALBTUOTH", "TOTALBTUCOL", 'TOTALBTU', 'TOTALBTUWTH', 'TOTALBTU',
            'TOTALBTUSPH', 'TOTALBTURFG', 'TOTALDOL', 'TOTALDOLSPH', 'TOTALDOLCOL',
            'TOTALDOLWTH', 'TOTALDOLRFG', 'TOTALDOLOTH',
        ])
    # Alternative load kept by the original author:
    # allKeys, X, y = loadData("../../data/household_electricity_usage/recs2009_public.csv", label = "BTUEL", otherRemove = [], forceUse =
    #     [
    #         'WGTP', 'NP', 'TYPE', 'ACR', 'BDSP', 'BATH', 'FS','MHP', 'RMSP', 'RNTP', 'REFR', 'RNTP', 'RWAT', 'STOV', 'TEN', 'VALP', 'YBL', 'FES', 'FINCP', 'HINCP', 'HHT', 'KIT', 'NOC', 'NPF', 'PLM', 'SRNT', 'SVAL', 'TAXP', 'WIF', 'WORKSTAT',
    #     ])

    clf = RandomForestRegressor(n_estimators=100, n_jobs=7)
    clf.fit(X, y)

    model = SelectFromModel(clf, prefit=True)
    X = model.transform(X)

    # Public accessor, evaluated once, instead of calling the private
    # _get_support_mask() twice per feature.
    support_mask = model.get_support()
    relevantFeatures = [key for key, keep in zip(allKeys, support_mask) if keep]
    print("Relevant Features", relevantFeatures)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25)
    clf.fit(X_train, y_train)
    print(y_test[:100])
    print(metrics.mean_squared_error(clf.predict(X_test), y_test))

    # clf was refitted on the reduced matrix, so its importances line up with
    # relevantFeatures; pairing them with allKeys attached the wrong names.
    features = sorted(zip(relevantFeatures, clf.feature_importances_),
                      key=lambda x: x[1], reverse=True)
    print("Features", features)
def select_features(x, y, methods=('variance', 'correlation', 'l1', 'forest')):
    '''
    Intended to intersect the features chosen by several selection methods.

    methods = ('variance', 'correlation', 'l1', 'forest')
    - variance: use variance threshold to discard features that are mostly 0 or 1
    - correlation: described as a chi2 test to remove very correlated
      features; the code builds its selector with ``f_regression``
    - l1: use l1 penalty to remove features that make solution sparse
    - forest: described as using ExtraTreesClassifier; the code builds a
      regressor instead

    NOTE(review): this function looks unfinished and cannot run as written;
    see the notes in the body. The original indentation was lost, so the
    nesting below is inferred from the statement order.
    '''
    # Label-based slice: columns from 'Feature_1' through 'Feature_2'.
    features = x.loc[:,'Feature_1':'Feature_2']
    if 'variance' in methods:
        vt = VT(threshold=(0.99*(1-0.99)))
        vt.fit(features)
        # NOTE(review): vt is fitted but its result is never read.
    if 'correlation' in methods:
        # NOTE(review): cr is constructed but never fitted or read.
        cr = SP(f_regression, percentile=80)
    if 'l1' in methods:
        rgr = MultiTaskLassoCV(cv=5, n_jobs=-1)
        # NOTE(review): this m is overwritten by the 'forest' branch when both
        # methods are requested.
        m = SFM(rgr)
    if 'forest' in methods:
        # NOTE(review): `RandomRorestRegressor` looks like a typo for
        # RandomForestRegressor - confirm against the file's imports.
        clf = RandomRorestRegressor(n_estimators=300, max_features=0.7,n_jobs=-1).fit(x,y)
        m = SFM(clf)
        m.fit(x.values, y.values)
    # NOTE(review): idx_list is never assigned in this function. x_indices is
    # assigned only inside the loop, which makes it a local name that is read
    # before it is bound (UnboundLocalError), including in the print below.
    for indices in idx_list:
        x_indices = x_indices & indices
    print 'All: %s' % len(x_indices)
    return list(x_indices)
def feature_selction(_train_data, _valid_data, _test_data, _train_label, _valid_label, _test_label):
    """Reduce the feature dimension of all three splits with an L1 LinearSVC.

    The three data splits are stacked and flattened to rows of 120 values,
    every label is repeated 100 times so that each row has a label, and one
    selector is fitted on the combined rows. The reduced rows are reshaped
    to ``(-1, 100, n_selected)`` and cut back into the three splits.

    Returns
    -------
    tuple
        ``(_train_data, _valid_data, _test_data)`` after feature selection.
    """
    n_train = _train_data.shape[0]
    n_valid = _valid_data.shape[0]

    stacked = numpy.concatenate((_train_data, _valid_data, _test_data))
    stacked = stacked.reshape((-1, 120))

    labels = list(numpy.concatenate((_train_label, _valid_label, _test_label)))
    # One label per flattened row: each label is repeated 100 times.
    row_labels = [label for label in labels for _ in range(100)]
    assert len(row_labels) == stacked.shape[0]

    svc = LinearSVC(C=0.1, penalty="l1", dual=False)
    svc.fit(stacked, row_labels)
    selector = SelectFromModel(svc, prefit=True)
    reduced = selector.transform(stacked)
    print ('After feature selection we have', reduced.shape[1], 'features.')

    reduced = reduced.reshape((-1, 100, reduced.shape[1]))
    valid_end = n_train + n_valid
    train_part = reduced[:n_train, :, :]
    valid_part = reduced[n_train:valid_end, :, :]
    test_part = reduced[valid_end:, :, :]
    return train_part, valid_part, test_part
def feature_importance_with_forest(rforest_classifier, issues_train, priority_train, issues_test, priority_test):
    """
    Assess feature importance using a Random Forest.

    Prints and plots the features ranked by importance, evaluates the
    classifier on all features, then keeps the features whose importance
    reaches 0.05, refits the classifier on them and evaluates again.

    :param rforest_classifier: An already fitted classifier. It is refitted
        in place on the selected features before the second evaluation.
    :param issues_train: Train features (a data frame; ``.columns`` is used).
    :param priority_train: Train classes.
    :param issues_test: Test features.
    :param priority_test: Test classes.
    :return: None
    """
    importances = rforest_classifier.feature_importances_
    indices = np.argsort(importances)[::-1]
    n_columns = len(issues_train.columns)

    # Column names re-ordered to follow the importance ranking. The ranked
    # importances used to be paired with the names in original column order,
    # which mislabelled both the printout and the chart.
    ranked_columns = issues_train.columns[indices]

    for rank in range(n_columns):
        print("%s %s %s %s %s" % (rank + 1, ") ", ranked_columns[rank], " ",
                                  importances[indices[rank]]))

    plt.subplots(1, 1)
    plt.title('Feature importance')
    plt.bar(range(n_columns), importances[indices], color='lightblue', align='center')
    plt.xticks(range(n_columns), ranked_columns, rotation=90)
    plt.xlim([-1, n_columns])
    plt.tight_layout()
    plt.show()

    evaluate_performance("FOREST", rforest_classifier, issues_train, priority_train,
                         issues_test, priority_test)

    print("Selecting important features ...")
    select = SelectFromModel(rforest_classifier, threshold=0.05, prefit=True)
    train_selected = select.transform(issues_train)
    test_selected = select.transform(issues_test)

    # Both transforms must happen before this refit: the selector reads the
    # importances of the very classifier that is refitted here.
    rforest_classifier.fit(train_selected, priority_train)
    evaluate_performance("FOREST-IMPORTANT", rforest_classifier, train_selected, priority_train,
                         test_selected, priority_test)
def forests(input_df, target_df):
    """Compare forest-based feature selection with two estimators.

    Fits an ``ExtraTreesClassifier`` and a ``RandomForestClassifier`` (both
    with ``random_state=0``), reduces ``input_df`` with ``SelectFromModel``
    for each, and prints the original and reduced shapes followed by the two
    importance rankings side by side, highest importance first.

    Feature names are read from ``feature_space``, which is not defined in
    this function and must exist in an enclosing scope.
    """
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel

    def _select_and_rank(estimator):
        # Fit, reduce the input, and rank (importance, name) pairs descending.
        fitted = estimator.fit(input_df, target_df)
        reduced = SelectFromModel(fitted, prefit=True).transform(input_df)
        ranking = [(importance, name)
                   for name, importance in zip(feature_space, fitted.feature_importances_)]
        ranking.sort()
        ranking.reverse()
        return reduced.shape, ranking

    original_space = input_df.shape
    new_space_ETC, tuple_holder = _select_and_rank(ExtraTreesClassifier(random_state=0))
    new_space_RFC, tuple_holder_2 = _select_and_rank(RandomForestClassifier(random_state=0))

    # Each line is assembled into one string so the text matches what the
    # comma-separated print statements produced (no space after the tabs).
    print('%s %s%s' % ('ExtraTreesClassifier', '\t' * 4, 'RandomForestClassifier'))
    print('%s %s %s%s %s' % ('Old Space: ', original_space, '\t' * 4,
                             'Old Space:', original_space))
    print('%s %s %s%s %s' % ('New Space: ', new_space_ETC, '\t' * 4,
                             'New Space:', new_space_RFC))
    for rank_number, (etc_entry, rfc_entry) in enumerate(zip(tuple_holder, tuple_holder_2), 1):
        print('%s %s %s %s%s %s %s' % (rank_number, '|', etc_entry, '\t' * 3,
                                       rank_number, '|', rfc_entry))
def test_max_features_dim(max_features):
    """``fit_transform`` keeps exactly ``max_features`` columns.

    The threshold is set to ``-inf`` so that only ``max_features`` limits the
    number of selected columns.
    """
    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    selector = SelectFromModel(estimator=forest,
                               max_features=max_features,
                               threshold=-np.inf)
    n_selected = selector.fit_transform(data, y).shape[1]
    assert n_selected == max_features
def test_invalid_input():
    """An unparseable threshold string makes ``transform`` raise ValueError."""
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True,
                        random_state=None, tol=None)
    bad_thresholds = ("gobbledigook", ".5 * gobbledigook")
    for bad_threshold in bad_thresholds:
        selector = SelectFromModel(clf, threshold=bad_threshold)
        # Fitting is expected to succeed; the error is asserted on transform.
        selector.fit(data, y)
        assert_raises(ValueError, selector.transform, data)
def lasso_reducer(X, y):
    """Reduce ``X`` to at most two features with LassoCV and plot them.

    ``SelectFromModel`` starts with a threshold of 0.25; the threshold is
    raised in steps of 0.1 until at most two features remain. The two
    remaining features are scatter-plotted against each other.

    :param X: 2-D array of samples.
    :param y: target values.
    :return: None; the function only shows a matplotlib figure.
    """
    clf = LassoCV()

    # Start from a minimum threshold of 0.25.
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(X, y)

    # Transform before the loop so X_transform is bound even if the starting
    # threshold already leaves <= 2 features and the loop never runs.
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

    # Raise the threshold until at most two features are left. The attribute
    # can be set directly instead of repeatedly refitting the selector.
    while n_features > 2:
        sfm.threshold += 0.1
        X_transform = sfm.transform(X)
        n_features = X_transform.shape[1]

    # Plot the selected two features from X. A trailing space was added to
    # the first literal: the two pieces used to join as "withthreshold".
    plt.title('features selected from boston using the SelectFromModel with '
              'threshold of %0.3f.' % sfm.threshold)
    feature1 = X_transform[:, 0]
    feature2 = X_transform[:, 1]
    plt.plot(feature1, feature2, 'r.')
    plt.xlabel("Value of Feature number 1")
    plt.ylabel("Value of Feature number 2")
    plt.ylim([np.min(feature2), np.max(feature2)])
    plt.show()
    return
def test_feature_importances_2d_coef():
    """Selection on a 2-D ``coef_`` matches a manual norm-based mask.

    For each threshold keyword ("mean", "median") and each norm order, the
    columns kept by ``SelectFromModel`` must equal those whose coefficient
    norm exceeds the corresponding statistic of all norms.
    """
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
        n_classes=4,
    )
    est = LogisticRegression()
    reducers = (("mean", np.mean), ("median", np.median))
    norm_orders = (1, 2, np.inf)

    for threshold, reduce_fn in reducers:
        for order in norm_orders:
            # Fit SelectFromModel on a multi-class problem.
            selector = SelectFromModel(estimator=LogisticRegression(),
                                       threshold=threshold,
                                       norm_order=order)
            selector.fit(X, y)
            assert_true(hasattr(selector.estimator_, "coef_"))
            reduced = selector.transform(X)
            assert_less(reduced.shape[1], X.shape[1])

            # Recompute the expected mask by hand from a separate estimator.
            est.fit(X, y)
            importances = norm(est.coef_, axis=0, ord=order)
            expected_mask = importances > reduce_fn(importances)
            assert_array_equal(reduced, X[:, expected_mask])
class SelectFromModelSelection(SelectionModel):
    """Selection model that delegates to scikit-learn's ``SelectFromModel``.

    Construction fits the selector on ``self.x_array`` / ``self.y_array``
    using ``self.estimator`` (all set by the base-class initializer) and
    stores the resulting mask in ``self.support_``.
    """

    name = "SelectFromModel"

    def __init__(self, *args):
        SelectionModel.__init__(self, *args)
        selector = SelectFromModel(self.estimator)
        selector.fit(self.x_array, self.y_array)
        self.selector = selector
        self.support_ = selector.get_support()
def test_warm_start():
    """Refitting with a warm-start estimator reuses the same ``estimator_``."""
    est = PassiveAggressiveClassifier(warm_start=True, random_state=0)
    selector = SelectFromModel(estimator=est)

    selector.fit(data, y)
    first_fit = selector.estimator_
    selector.fit(data, y)
    second_fit = selector.estimator_

    assert_true(first_fit is second_fit)
def rf_feat_reduction(rf_model, features):
    """Reduce ``features`` to the columns an already fitted model selects.

    :param rf_model: fitted estimator handed to ``SelectFromModel`` with
        ``prefit=True``.
    :param features: 2-D feature matrix to reduce.
    :return: ``(feat_subset, feat_bool)`` - the reduced matrix and the
        boolean mask of the kept columns.
    """
    print(" Reducing number of input features based on feature importance.")
    subset_model = SelectFromModel(rf_model, prefit=True)
    feat_subset = subset_model.transform(features)
    feat_bool = subset_model.get_support()

    # Read the column count from the shape: indexing the first row fails when
    # there are no rows, and len() of a sparse row is not defined.
    n_selected = feat_subset.shape[1]
    print(" " + str(n_selected) + " features chosen after model selection.")
    return feat_subset, feat_bool
def test_max_features_error(max_features, err_type, err_msg):
    """An invalid ``max_features`` makes ``fit`` raise the expected error."""
    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    selector = SelectFromModel(estimator=forest,
                               max_features=max_features,
                               threshold=-np.inf)
    with pytest.raises(err_type, match=err_msg):
        selector.fit(data, y)
def test_input_estimator_unchanged():
    """
    After ``fit``, the ``estimator`` attribute is still the object passed in.
    """
    original = RandomForestClassifier()
    selector = SelectFromModel(estimator=original)
    selector.fit(data, y)
    assert_true(selector.estimator is original)
def selectfeature(x, y, x_pre):
    """Select features with a fitted ``LassoLars`` model.

    Both matrices are first passed through ``datscater``; the model is fitted
    on ``x`` / ``y`` and the same selector reduces ``x`` and ``x_pre``.

    :return: ``(x_new, x_pre)`` - the reduced training and prediction data.
    """
    x, x_pre = datscater(x, x_pre)

    lasso = linear_model.LassoLars()
    lasso.fit(x, y)
    selector = SelectFromModel(lasso, prefit=True)

    x_new = selector.transform(x)
    print('%s %s' % ('x', x.shape))
    print(x_new.shape)

    reduced_pre = selector.transform(x_pre)
    return x_new, reduced_pre
def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    """Select features with a LightGBM classifier and log the ranking.

    Fits an ``LGBMClassifier`` (400 estimators), reduces ``matrix_x_temp``
    with ``SelectFromModel`` at threshold ``th``, prints the importance
    ranking and writes it, plus the list of chosen features, to two text
    files under ``../eda/``.

    :param fe_name: sequence of feature names, aligned with the columns of
        ``matrix_x_temp``.
    :param matrix_x_temp: input feature matrix.
    :param label_y: labels used to fit the classifier.
    :param th: threshold handed to ``SelectFromModel``; it is also written to
        the log file, so both numbers and strings are accepted.
    :return: ``(matrix_x, feature_not_used_name, n_used)`` - the reduced
        matrix, the names of the features that were not kept, and the number
        of features that were kept.
    """
    # SelectFromModel
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Print how many features have a non-zero importance.
    feature_score_dict = dict(zip(fe_name, clf.feature_importances_))
    zero_count = sum(1 for score in feature_score_dict.values() if score == 0.0)
    print('number of not-zero features:' + str(len(feature_score_dict) - zero_count))

    # Print the feature importances, highest first.
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for name, score in feature_score_dict_sorted:
        print('%s %s' % (name, score))
    print('\n')

    with open('../eda/lgb_feature_importance.txt', 'w') as f:
        # str() because th may be numeric; write() only accepts strings.
        f.write(str(th))
        f.write('\nRank\tFeature Name\tFeature Importance\n')
        for rank, (name, score) in enumerate(feature_score_dict_sorted):
            f.write(str(rank) + '\t' + str(name) + '\t' + str(score) + '\n')

    # Print which fields were actually used. matrix_x is the input matrix
    # after feature selection, so its width is the number of kept features;
    # the kept features are taken as the top entries of the ranking.
    how_long = matrix_x.shape[1]
    feature_used_name = [name for name, _ in feature_score_dict_sorted[:how_long]]
    print('feature_chooesed:')
    for name in feature_used_name:
        print(name)
    print('\n')

    with open('../eda/lgb_feature_chose.txt', 'w') as f:
        f.write('Feature Chose Name :\n')
        for name in feature_used_name:
            f.write(str(name) + '\n')

    # Find the names of the fields that were not used; a set keeps the
    # membership test constant-time.
    used_names = set(feature_used_name)
    feature_not_used_name = [name for name in fe_name if name not in used_names]
    return matrix_x, feature_not_used_name, len(feature_used_name)
def select_feature(clf, x_train, x_valid):
    """Fit ``clf`` and keep the features whose importance reaches the mean.

    ``y_train`` is not a parameter; it is read from the enclosing scope.

    :return: ``(x_train, x_valid)`` reduced to the selected features.
    """
    clf.fit(x_train, y_train)
    selector = SelectFromModel(clf, prefit=True, threshold="mean")

    print(x_train.shape)
    reduced_train = selector.transform(x_train)
    reduced_valid = selector.transform(x_valid)
    print(reduced_train.shape)
    return reduced_train, reduced_valid
def train(self):
    """Fit a random-forest regressor on the features it selects itself.

    The forest is fitted on ``self.data`` to rank the features, the selected
    columns are extracted from ``self.data`` and ``self.predict`` (the latter
    is overwritten with its reduced form), and the same forest is then
    refitted on the reduced training data.

    :return: the regressor fitted on the reduced data.
    """
    forest = RandomForestRegressor()
    forest.fit(self.data, self.target)

    selector = SelectFromModel(forest, prefit=True)
    reduced_data = selector.transform(self.data)
    # Reduce the prediction data before the refit below replaces the
    # importances the selector relies on.
    self.predict = selector.transform(self.predict)

    forest.fit(reduced_data, self.target)
    return forest
def select_features_tree(X, y, feature_names=None):
    """Select features with a 1000-tree ``ExtraTreesClassifier``.

    :param X: 2-D feature matrix.
    :param y: target labels.
    :param feature_names: names forwarded to ``plot_feature_importance``;
        defaults to an empty list.
    :return: ``(X_new, names)`` - the reduced matrix and the first
        ``X_new.shape[1]`` entries of the names returned by
        ``plot_feature_importance``.
    """
    # A fresh list per call instead of a mutable default shared by all calls.
    if feature_names is None:
        feature_names = []

    print(X.shape)
    # forest = RandomForestClassifier(n_estimators=1000, n_jobs=4)
    forest = ExtraTreesClassifier(n_estimators=1000, n_jobs=8)
    fo = forest.fit(X, y)
    sorted_feature_names = plot_feature_importance(fo, X, feature_names)

    model = SelectFromModel(fo, prefit=True)
    X_new = model.transform(X)
    print(X_new.shape)
    return X_new, sorted_feature_names[0:X_new.shape[1]]
def test_coef_default_threshold():
    """With no explicit threshold, Lasso coefficients are cut at 1e-5."""
    X, y = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    # For the Lasso and related models, the threshold defaults to 1e-5.
    selector = SelectFromModel(estimator=Lasso(alpha=0.1))
    selector.fit(X, y)
    reduced = selector.transform(X)

    expected_mask = np.abs(selector.estimator_.coef_) > 1e-5
    assert_array_almost_equal(reduced, X[:, expected_mask])
def test_threshold_string():
    """A scaled threshold string ("0.5*mean") matches the manual mask."""
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    selector = SelectFromModel(est, threshold="0.5*mean")
    selector.fit(data, y)
    reduced = selector.transform(data)

    # Calculate the threshold from the estimator directly.
    est.fit(data, y)
    importances = est.feature_importances_
    cutoff = 0.5 * np.mean(importances)
    expected_mask = importances > cutoff
    assert_array_equal(reduced, data[:, expected_mask])
def select_features(inputs, label, threshold):
    """Select features with an entropy-based ``ExtraTreesClassifier``.

    :param inputs: 2-D feature matrix.
    :param label: target labels.
    :param threshold: number that scales the mean importance; it is turned
        into the threshold string ``'<threshold>*mean'``.
    :return: ``(sfm, inputs_new)`` - the selector and the reduced matrix.
    """
    print('training ExtraTreesClassifier...')
    clf = ExtraTreesClassifier(criterion='entropy')
    clf.fit(inputs, label)

    threshold_expr = '%f*mean' % (threshold)
    print('training SelectFromModel, threshold=%s...' % (threshold_expr))
    sfm = SelectFromModel(clf, threshold=threshold_expr, prefit=True)
    inputs_new = sfm.transform(inputs)
    # pdb.set_trace()
    print(inputs_new.shape)
    return sfm, inputs_new
def extra_trees_classifier():
    """Train a grid-searched random forest on tree-selected features.

    Loads and normalizes the data, selects features with an
    ``ExtraTreesClassifier`` (200 estimators), grid-searches a
    ``RandomForestClassifier`` on the reduced training data with 5-fold
    stratified cross-validation, prints the best score and parameters, and
    writes the test predictions to
    ``./extra_trees_classifier_output.csv``.
    """
    titanic = Titanic_Data('../input/train.csv', '../input/test.csv')
    combined_normalized_data = titanic.get_normalized_data()
    train, test, targets = recover_train_test_target('../input/train.csv',
                                                     combined_normalized_data)

    clf = ExtraTreesClassifier(n_estimators=200)
    clf = clf.fit(train, targets)

    # Importance table, highest first, kept for inspection. DataFrame.sort is
    # no longer available in pandas and its result was being discarded;
    # sort_values returns the sorted frame, which is stored here.
    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_
    features = features.sort_values(['importance'], ascending=False)

    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)

    forest = RandomForestClassifier(max_features='sqrt')
    parameter_grid = {
        'max_depth': [4, 5, 6, 7, 8],
        'n_estimators': [200, 210, 240, 250],
        'criterion': ['gini', 'entropy'],
    }
    # NOTE(review): StratifiedKFold(targets, n_folds=5) looks like the legacy
    # sklearn.cross_validation signature - confirm against the import.
    cross_validation = StratifiedKFold(targets, n_folds=5)
    grid_search = GridSearchCV(forest, param_grid=parameter_grid, cv=cross_validation)
    grid_search.fit(train_new, targets)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    output = grid_search.predict(test_new).astype(int)
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('./extra_trees_classifier_output.csv',
                                                  index=False)
def test_partial_fit():
    """``partial_fit`` reuses ``estimator_`` and matches a full ``fit``.

    Two ``partial_fit`` calls must keep the same fitted estimator object, and
    the resulting selection must equal the one obtained by a single ``fit``
    on the data stacked twice.
    """
    est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
    selector = SelectFromModel(estimator=est)
    classes = np.unique(y)

    selector.partial_fit(data, y, classes=classes)
    first_model = selector.estimator_
    selector.partial_fit(data, y, classes=np.unique(y))
    second_model = selector.estimator_
    assert_true(first_model is second_model)

    reduced_after_partial = selector.transform(data)
    doubled_data = np.vstack((data, data))
    doubled_y = np.concatenate((y, y))
    selector.fit(doubled_data, doubled_y)
    assert_array_equal(reduced_after_partial, selector.transform(data))
def execute(fdata):
    """Compare accuracy with and without tree-based feature selection.

    Each line of ``fdata`` is a comma-separated record whose first field is
    the integer class label and whose remaining fields are float features.
    The data is split 75/25 with ``random_state=0``; an
    ``ExtraTreesClassifier`` is fitted on all features, and a linear SVC is
    fitted on the features it selects.

    :param fdata: iterable of comma-separated text lines.
    :return: ``(acc_all, acc_selected)`` - test accuracy of the forest on all
        features and of the SVC on the selected features.
    """
    data = []
    target = []
    for line in fdata:
        fields = line.split(",")
        target.append(int(fields[0]))
        data.append([float(value) for value in fields[1:]])
    data = np.array(data)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        data, target, test_size=0.25, random_state=0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(X_train, y_train)
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X_train)
    clfNew = svm.SVC(kernel='linear', C=1).fit(X_new, y_train)

    # Apply the fitted selector to the test split. The column positions used
    # to be recovered by matching values of the first reduced row against the
    # raw rows with list.index(), which picks the wrong column as soon as a
    # value occurs more than once in a row.
    X_test_new = model.transform(X_test)

    return (accuracy_score(y_test, clf.predict(X_test)),
            accuracy_score(y_test, clfNew.predict(X_test_new)))
# --- Pipeline 1: univariate selection (10 best by f_classif) + logistic regression.
model_filter = SelectKBest(f_classif, k=10)
lr = LogisticRegression(max_iter=100, class_weight=None)
model_pl = Pipeline([
    ('SelectKBest', model_filter),
    ('LogisticRegression', lr),
])
model_pl.fit(x_train, y_train)
model_pl.predict(x_test)
print(model_pl.score(x_test, y_test))
print(model_pl.named_steps['SelectKBest'].get_support())
# Names of the selected attributes, looked up from the mask positions.
print(dataset.atribNombre([
    position
    for position, kept in enumerate(
        model_pl.named_steps['SelectKBest'].get_support().tolist())
    if kept
]))

# --- Pipeline 2: model-based selection + logistic regression.
lr = LogisticRegression(max_iter=100, class_weight=None)
lrS = LogisticRegression(max_iter=100, class_weight=None)
lrS = lrS.fit(x_train, y_train)
model_filter = SelectFromModel(lrS)
print('\nSelectFromModel')
model_pl = Pipeline([
    ('SelectFromModel', model_filter),
    ('LogisticRegression', lr),
])
model_pl.fit(x_train, y_train)
model_pl.predict(x_test)
print(model_pl.score(x_test, y_test))
print(model_pl.named_steps['SelectFromModel'].get_support())
print(dataset.atribNombre([
    position
    for position, kept in enumerate(
        model_pl.named_steps['SelectFromModel'].get_support().tolist())
    if kept
]))
def logistic_dimension(data, label, parameter=1):
    """Reduce ``data`` with an L1-penalized logistic regression.

    :param data: 2-D feature matrix.
    :param label: target labels.
    :param parameter: value passed as ``C`` to ``LogisticRegression``
        (default 1).
    :return: ``(new_data, mask)`` - the reduced matrix and the integer
        indices of the selected columns.
    """
    # The solver is named explicitly because the L1 penalty needs a solver
    # that supports it; liblinear does, and it was the default solver in the
    # scikit-learn releases where this call used to work unmodified.
    logistic_ = LogisticRegression(penalty="l1", C=parameter, max_iter=30,
                                   solver="liblinear")
    model = SelectFromModel(logistic_)
    new_data = model.fit_transform(data, label)
    mask = model.get_support(indices=True)
    return new_data, mask
# Load the fraud data set; the first CSV column becomes the index.
fraudInstanceData = pd.read_csv("FraudInstanceData.csv", header=0, index_col=0)

# One-hot encode the two categorical columns and replace the originals.
maritalStatuses = pd.get_dummies(fraudInstanceData["Marital Status"])
accomodationTypes = pd.get_dummies(fraudInstanceData["Accomodation Type"])
fraudInstanceData = fraudInstanceData.drop('Marital Status', axis=1)
fraudInstanceData = fraudInstanceData.drop('Accomodation Type', axis=1)
fraudInstanceData = fraudInstanceData.join(maritalStatuses)
fraudInstanceData = fraudInstanceData.join(accomodationTypes)

# Strip everything except digits and the decimal point from currency strings.
currencyToMoney = lambda c: Decimal(sub(r'[^\d.]', '', c))
fraudInstanceData['Claim Amount'] = fraudInstanceData["Claim Amount"].apply(
    currencyToMoney)

# NOTE(review): y is column 1 and X starts at column 1, so the target is also
# part of the features. That looks like label leakage - confirm which column
# is the target before trusting the scores below.
y = fraudInstanceData.iloc[:, 1]
X = fraudInstanceData.iloc[:, 1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)

# liblinear is named explicitly because the L1 penalty needs a solver that
# supports it.
pipeline = Pipeline([
    ('feature_selection',
     SelectFromModel(LogisticRegression(penalty="l1", solver="liblinear"))),
    ('regression', LogisticRegression()),
])
grid_cv = GridSearchCV(pipeline, {}, cv=10)
grid_cv.fit(X_train, y_train)

# Reduce the training data with the fitted selection step. The previous
# expression read the non-existent attribute `X_train.co`, and it called
# transform on the whole search object, whose pipeline ends in a classifier.
selected_feature = grid_cv.best_estimator_.named_steps['feature_selection'].transform(X_train)

y_pred = grid_cv.predict(X_test)
# Score against the true labels; scoring against y_pred always gave 1.0.
print(grid_cv.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))
def build_model(self, fname_structlib):
    """Refit the model and scaler on the structures in a pickled library.

    Loads the structure list from ``fname_structlib``, builds a feature
    matrix and a metric vector from the structures whose ``metricpredicted``
    flag is false, selects features with ``SelectFromModel`` on a 75 %
    training split, refits clones of ``self.scaler`` and ``self.model`` on
    the reduced training data, and updates ``prop.useful`` on
    ``self.properties`` from the selector's support mask.

    ``self.scaler``, ``self.model`` and the ``useful`` flags are replaced on
    every call; only ``self.rmse_norm`` is updated conditionally.

    :param fname_structlib: path of the pickle file holding the structures.
    :return: True when the new score exceeds ``self.rmse_norm`` (which is
        then updated), False otherwise.
    """
    #MODIFIED BY JIM (this way don't have to remember to close the file...)
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # library files from a trusted source.
    with open(fname_structlib, 'rb') as f_structlib:
        structs = pickle.load(f_structlib)

    # Count the structures that still carry a measured metric.
    n_structs = 0
    for struct in structs:
        if not struct.metricpredicted:
            n_structs += 1
    metrics = np.zeros(n_structs)

    # Count the properties currently flagged as useful: one column each.
    n_features = 0
    for prop in self.properties:
        if prop.useful:
            n_features += 1
    features = np.zeros((n_structs, n_features))

    count_structs = 0
    for struct in structs:
        if not struct.metricpredicted:
            props = self.calc_properties(struct)
            count_features = 0
            for prop in self.properties:  # make sure this happens in the same order each time
                if prop.useful:
                    #Need to prune properties we don't need (i.e. smaller rdf, etc.)
                    try:
                        features[count_structs, count_features] = props[prop.label]
                        count_features += 1
                    except KeyError:
                        #Remove this property so don't have to do this again
                        # NOTE(review): the matrix was sized before this
                        # flag is cleared, so later properties shift one
                        # column to the left for this and all following
                        # rows, while rows filled earlier keep the old
                        # layout - confirm this cannot happen in practice.
                        prop.useful = False
            metrics[count_structs] = struct.metric
            count_structs += 1

    # cross-validation etc. etc. and change property.useful's
    # need to make sure that property.useful status is consistent with the model (has same number of features)
    # make new model to test with
    test_model = clone(self.model)
    test_scaler = clone(self.scaler)

    # split data into testing and training sets
    features_train, features_test, metrics_train, metrics_test = train_test_split(
        features, metrics, test_size=0.25, shuffle=True)

    # using training set, perform feature selection by selecting from fitted LASSO model
    features_train_scaled = test_scaler.fit_transform(features_train)
    # features_test_scaled is not read again in this function.
    features_test_scaled = test_scaler.transform(features_test)
    selector = SelectFromModel(test_model, threshold=1e-4)  # HARD CODED NUMBER HERE
    selector.fit(features_train_scaled, metrics_train)
    print('number of features selected', np.sum(selector.get_support().astype(int)))
    # The selector was fitted on scaled data but is applied to the unscaled
    # matrices here; the reduced matrices are rescaled further down.
    features_train_reduced_unscaled = selector.transform(features_train)
    features_test_reduced_unscaled = selector.transform(features_test)

    # using training set, perform recursive feature elimination with cross-validation
    # selector = RFECV(test_model, step=1, scoring='neg_mean_squared_error')
    # features_train_new = selector.fit_transform(features_train, metrics_train)
    # print('number of features selected after cross-validation', selector.n_features_)
    # features_test_new = selector.transform(features_test)
    # features_new = selector.transform(features)

    # fit with reduced number of features
    features_train_reduced_scaled = test_scaler.fit_transform(
        features_train_reduced_unscaled)
    features_test_reduced_scaled = test_scaler.transform(
        features_test_reduced_unscaled)
    test_model.fit(features_train_reduced_scaled, metrics_train)

    # compute RMSE of test set
    # should also compute for training set??
    mse_test = mean_squared_error(
        metrics_test, test_model.predict(features_test_reduced_scaled))

    # Below switching to using coefficient of determination, not RMSE, but still calling it RMSE
    # This normalizes things to the variance in the data, so now want to be bigger and close to 1
    # A good cutoff is probably 0.8 or 0.9
    #rmse_norm_new = np.sqrt(mse_test)/np.mean(metrics)
    rmse_norm_new = (np.var(metrics) - mse_test) / np.var(metrics)
    print('rmse_norm_new', rmse_norm_new)
    print('self.rmse_norm', self.rmse_norm)

    #if rmse_norm_new < self.rmse_norm: # should we do something fancier than this?
    # With the condition above commented out, the scaler, the model and the
    # useful flags below are replaced whether or not the score improved.
    # copy model (or should we maybe refit it to all the data?? not sure if this would violate something machine learning)
    self.scaler = clone(test_scaler)
    features_train_reduced_scaled = self.scaler.fit_transform(
        features_train_reduced_unscaled)
    self.model = clone(test_model)
    self.model.fit(features_train_reduced_scaled, metrics_train)

    # change useful labels on properties
    # The support mask is walked in step with the properties still flagged
    # useful, i.e. the k-th useful property is assumed to be column k.
    count_features = 0
    selector_support = selector.get_support()
    for prop in self.properties:
        if prop.useful:
            prop.useful = selector_support[count_features]
            count_features += 1

    if rmse_norm_new > self.rmse_norm:  # should we do something fancier than this?
        self.rmse_norm = rmse_norm_new
        return True
    else:
        return False
# Fit the forest and rank the features by importance, highest first.
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]  # extended slice -> reversed array

print('Features ranked:')
for f in range(X_train.shape[1]):
    print(f'{f+1}) {feat_labels[indices[f]]:<30} {importances[indices[f]]}')

import matplotlib.pyplot as plt

plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]), importances[indices], align='center')
plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()

# Note: highly correlated features may not all be ranked high.
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
# The f prefix was missing, so the placeholder was printed literally instead
# of the number of selected features.
print(f'Number of features that meet this threshold: {X_selected.shape[1]}')
# The selected features are the top entries of the ranking computed above.
for f in range(X_selected.shape[1]):
    print(f'{f+1}) {feat_labels[indices[f]]:<30} {importances[indices[f]]:3f}')
# Class balance of the label column (the value is only displayed when this is
# run interactively).
dataframe['label'].value_counts()

# Split into feature matrix and label vector.
y_vector = dataframe['label']
x_matrix = dataframe.drop(['label'], axis=1)

# Standardize the features.
sc = StandardScaler()
x_matrix = sc.fit_transform(x_matrix)

############ feature selection
classif = ExtraTreesClassifier(n_estimators=100)
classif = classif.fit(x_matrix, y_vector)
classif.feature_importances_
selected = SelectFromModel(classif, prefit=True)
x_matrix_new = selected.transform(x_matrix)
x_matrix_new.shape

X_train, X_test, y_train, y_test = train_test_split(
    x_matrix_new,
    y_vector,
    test_size=0.33,
    random_state=42,
    shuffle=True,
)

############# candidate classifiers
classif_LR = LogisticRegression()
classif_KN = KNeighborsClassifier()
classif_RF = RandomForestClassifier()
###############Log
print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]])) #feature selection threshold - deprecated X.shape X_selected = clf.transform(X, threshold=0.02) X_selected.shape #SelectModel method from sklearn.svm import LinearSVC from sklearn.feature_selection import SelectFromModel X.shape lsvc = LinearSVC(C=0.1, penalty="l1", dual=False).fit(X, y) model = SelectFromModel(lsvc, prefit=True) X_new = model.transform(X) X_new.shape yhat = clf.predict_proba(X) #which is which? clf.classes_ yhat.shape X.shape yhat[:,0].shape X0['yhats_A'] = yhat[:,0] X0['yhats_D'] = yhat[:,1] X0['yhats_H'] = yhat[:,2]
class LinearSVM(SemevalModel):
    """Ranker trained on SimBOW similarity scores between questions.

    ``train`` builds one feature vector per training pair (the SimBOW score
    of the two questions followed by one SimBOW score per comment), scales
    it, selects features with a LassoCV-backed ``SelectFromModel`` and stores
    the model returned by ``train_svm``. ``validate`` rebuilds the same
    vector for every dev-set candidate, scores it and writes the ranking to
    ``data/ranking.txt``.
    """

    def __init__(self):
        SemevalModel.__init__(self)

    def __transform__(self, q1, q2):
        """Return string-overlap features for a pair of questions.

        ``q1`` and ``q2`` may be strings or token lists (lists are joined
        with single spaces). The result holds four values computed on the
        raw text with ``features.lcs``, ``features.lcsub``,
        ``features.jaccard`` and ``features.containment_similarities``,
        followed by the same four values for the word 2-, 3- and 4-grams.
        """
        if isinstance(q1, list):
            q1 = ' '.join(q1)
        if isinstance(q2, list):
            q2 = ' '.join(q2)

        # Raw string for the pattern: '\W' in a normal literal is an invalid
        # escape sequence.
        lcs = features.lcs(re.split(r'(\W)', q1), re.split(r'(\W)', q2))
        X = [
            len(lcs[1].split()),
            features.lcsub(q1, q2)[0],
            features.jaccard(q1, q2),
            features.containment_similarities(q1, q2),
        ]

        # n-gram features: every n-gram is glued with 'x' so that it behaves
        # like a single token in the overlap measures.
        for n in range(2, 5):
            ngram1 = ' ' + ''.join(
                'x'.join(gram) + ' ' for gram in nltk.ngrams(q1.split(), n))
            ngram2 = ' ' + ''.join(
                'x'.join(gram) + ' ' for gram in nltk.ngrams(q2.split(), n))

            lcs = features.lcs(re.split(r'(\W)', ngram1),
                               re.split(r'(\W)', ngram2))
            X.append(len(lcs[1].split()))
            X.append(features.lcsub(ngram1, ngram2)[0])
            X.append(features.jaccard(ngram1, ngram2))
            X.append(features.containment_similarities(ngram1, ngram2))

        return X

    def get_features(self, q1id, q1, q2id, q2, set='train'):
        """Return three SimBOW scores, one per ELMo layer (indices 0, 1, 2).

        For each layer the word2vec vector of every token is concatenated
        with that layer's ELMo vector before scoring. ``set`` selects the
        training stores when it equals ``'train'`` and the dev stores
        otherwise.
        """
        if set == 'train':
            elmo_store, index = self.trainelmo, self.trainidx
        else:
            elmo_store, index = self.develmo, self.devidx

        q1_elmo = elmo_store.get(str(index[q1id]))
        q2_elmo = elmo_store.get(str(index[q2id]))
        q1_w2v = features.encode(q1, self.word2vec)
        q2_w2v = features.encode(q2, self.word2vec)

        X = []
        for layer in range(3):
            q1_emb = [
                np.concatenate([q1_w2v[i], q1_elmo[layer][i]])
                for i in range(len(q1_w2v))
            ]
            q2_emb = [
                np.concatenate([q2_w2v[i], q2_elmo[layer][i]])
                for i in range(len(q2_w2v))
            ]
            X.append(self.simbow.score(q1, q1_emb, q2, q2_emb))
        return X

    def _embed(self, tokens, elmo_store, index, qid):
        """Concatenate the word2vec and ELMo vectors of every token."""
        elmo = elmo_store.get(str(index[qid]))
        w2v = features.encode(tokens, self.word2vec)
        return [np.concatenate([w2v[i], elmo[i]]) for i in range(len(w2v))]

    def _simbow_vector(self, q1, q1_emb, q2, q2_emb, comments, token_key,
                       elmo_store, index):
        """Return [score(q1, q2)] followed by score(q1, comment) per comment.

        ``token_key`` names the comment field holding its tokens. A comment
        without tokens contributes 0.
        """
        vector = [self.simbow.score(q1, q1_emb, q2, q2_emb)]
        for comment in comments:
            q3id = comment['id']
            q3 = comment[token_key]
            score = 0
            if len(q3) > 0:
                q3_emb = self._embed(q3, elmo_store, index, q3id)
                score = self.simbow.score(q1, q1_emb, q3, q3_emb)
            vector.append(score)
        return vector

    def train(self):
        """Build (or load cached) training features and fit the model.

        Features are cached at ``FEATURE_PATH`` as a list of
        ``(vector, label)`` pairs; when that file exists it is loaded instead
        of recomputing. Sets ``self.scaler``, ``self.feat_selector`` and
        ``self.model`` (plus the BM25 attributes).
        """
        logging.info('Training svm.', extra=d)
        self.bm25_model, self.avg_idf, self.bm25_qid_index = features.init_bm25(
            traindata=self.trainset, devdata=self.devset, testdata=[])

        if not os.path.exists(FEATURE_PATH):
            X, y = [], []
            total = len(self.traindata)
            for i, query_question in enumerate(self.traindata):
                percentage = round(float(i + 1) / total, 2)
                print('Preparing traindata: ', percentage, i + 1,
                      sep='\t', end='\r')

                q1id = query_question['q1_id']
                q2id = query_question['q2_id']
                q1, q2 = query_question['q1'], query_question['q2']

                # ELMo + word2vec embeddings
                q1_emb = self._embed(q1, self.trainelmo, self.trainidx, q1id)
                q2_emb = self._embed(q2, self.trainelmo, self.trainidx, q2id)

                # soft-cosine scores for the pair and for each comment
                X.append(self._simbow_vector(
                    q1, q1_emb, q2, q2_emb, query_question['comments'],
                    'tokens', self.trainelmo, self.trainidx))
                y.append(query_question['label'])

            # Context manager so the cache file is flushed and closed.
            with open(FEATURE_PATH, 'wb') as handle:
                p.dump(list(zip(X, y)), handle)
        else:
            # NOTE(review): `p` is presumably the pickle module; loading is
            # only safe because this cache is written by the branch above.
            with open(FEATURE_PATH, 'rb') as handle:
                cached = p.load(handle)
            X = [row[0] for row in cached]
            y = [row[1] for row in cached]

        # scale features
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.scaler.fit(X)
        X = self.scaler.transform(X)

        # feature selection
        clf = LassoCV(cv=10)
        self.feat_selector = SelectFromModel(clf)
        self.feat_selector.fit(X, y)
        X = self.feat_selector.transform(X)

        self.model = self.train_svm(trainvectors=X, labels=y, c='search',
                                    kernel='search', gamma='search',
                                    degree='search', jobs=4)
        # Pass `extra=d` like every other log call in this class.
        logging.info('Finishing to train svm.', extra=d)

    def validate(self):
        """Score every dev-set candidate and write ``data/ranking.txt``.

        Returns ``(ranking, y_real, y_pred)``: ``ranking`` maps each query id
        to a list of ``(real_label, score, q2id)`` tuples, ``y_real`` holds 1
        for every candidate whose relevance is not ``'Irrelevant'`` (else 0)
        and ``y_pred`` holds the model's predicted labels.
        """
        logging.info('Validating svm.', extra=d)
        ranking = {}
        y_real, y_pred = [], []
        total = len(self.devset)
        for i, q1id in enumerate(self.devset):
            ranking[q1id] = []
            percentage = round(float(i + 1) / total, 2)
            print('Progress: ', percentage, i + 1, sep='\t', end='\r')

            query = self.devset[q1id]
            q1 = query['tokens_proc']
            q1_emb = self._embed(q1, self.develmo, self.devidx, q1id)

            for duplicate in query['duplicates']:
                rel_question = duplicate['rel_question']
                q2id = rel_question['id']
                q2 = rel_question['tokens_proc']
                q2_emb = self._embed(q2, self.develmo, self.devidx, q2id)

                X = self._simbow_vector(
                    q1, q1_emb, q2, q2_emb, duplicate['rel_comments'],
                    'tokens_proc', self.develmo, self.devidx)

                # scale
                X = self.scaler.transform([X])
                # feature selection
                X = self.feat_selector.transform(X)

                score = self.model.decision_function(X)[0]
                pred_label = self.model.predict(X)[0]
                y_pred.append(pred_label)

                real_label = 0
                if rel_question['relevance'] != 'Irrelevant':
                    real_label = 1
                y_real.append(real_label)
                ranking[q1id].append((real_label, score, q2id))

        with open('data/ranking.txt', 'w') as f:
            for q1id in ranking:
                for row in ranking[q1id]:
                    label = 'true' if row[0] == 1 else 'false'
                    # The trailing '\n' element is joined with a tab on
                    # purpose: the line layout is kept exactly as before.
                    f.write('\t'.join([
                        str(q1id),
                        str(row[2]),
                        str(0),
                        str(row[1]), label, '\n'
                    ]))

        logging.info('Finishing to validate svm.', extra=d)
        return ranking, y_real, y_pred
step=.1, cv=5, scoring='roc_auc') for i in range(0, len(skpipes)): skpipes[i].append(('rfe_rf' + str(i), cv_rfc)) if fs_type == 2: #Wrapper Select via model clf = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=3, criterion='entropy', random_state=None) sel = SelectFromModel( clf, prefit=False, threshold='mean', max_features=None ) #to select only based on max_features, set to integer value and set threshold=-np.inf for i in range(0, len(skpipes)): skpipes[i].append(('wrapper_rf' + str(i), sel)) if fs_type == 3: ######Only work if the Target is binned########### #Univariate Feature Selection - Chi-squared #will throw error if any negative values in features, so turn off feature normalization, or switch to mutual_info_classif print('Univariate Feature Selection - Chi2: ') sel = SelectKBest(chi2, k=k_cnt) for i in range(0, len(skpipes)): skpipes[i].append(('ufs' + str(i), sel)) # %%
def train(self):
    """Build (or load cached) training features and fit the model.

    Each training pair becomes one vector: the SimBOW score of the two
    questions followed by one SimBOW score per comment (0 for a comment
    without tokens). Vectors are cached at ``FEATURE_PATH`` as a list of
    ``(vector, label)`` pairs and reloaded from there when the file exists.
    Sets ``self.scaler``, ``self.feat_selector`` and ``self.model`` (plus
    the BM25 attributes).
    """
    logging.info('Training svm.', extra=d)
    self.bm25_model, self.avg_idf, self.bm25_qid_index = features.init_bm25(
        traindata=self.trainset, devdata=self.devset, testdata=[])

    def embed(tokens, qid):
        # Concatenate the word2vec and ELMo vectors of every token.
        elmo = self.trainelmo.get(str(self.trainidx[qid]))
        w2v = features.encode(tokens, self.word2vec)
        return [np.concatenate([w2v[i], elmo[i]]) for i in range(len(w2v))]

    if not os.path.exists(FEATURE_PATH):
        X, y = [], []
        total = len(self.traindata)
        for i, query_question in enumerate(self.traindata):
            percentage = round(float(i + 1) / total, 2)
            print('Preparing traindata: ', percentage, i + 1,
                  sep='\t', end='\r')

            q1id = query_question['q1_id']
            q2id = query_question['q2_id']
            q1, q2 = query_question['q1'], query_question['q2']

            # ELMo + word2vec embeddings
            q1_emb = embed(q1, q1id)
            q2_emb = embed(q2, q2id)

            # soft-cosine score of the question pair
            x = [self.simbow.score(q1, q1_emb, q2, q2_emb)]

            # soft-cosine score of the query against every comment
            for comment in query_question['comments']:
                q3id = comment['id']
                q3 = comment['tokens']
                simbow_q1q3 = 0
                if len(q3) > 0:
                    q3_emb = embed(q3, q3id)
                    simbow_q1q3 = self.simbow.score(q1, q1_emb, q3, q3_emb)
                x.append(simbow_q1q3)

            X.append(x)
            y.append(query_question['label'])

        # Context manager so the cache file is flushed and closed.
        with open(FEATURE_PATH, 'wb') as handle:
            p.dump(list(zip(X, y)), handle)
    else:
        # NOTE(review): `p` is presumably the pickle module; loading is only
        # safe because this cache is written by the branch above.
        with open(FEATURE_PATH, 'rb') as handle:
            cached = p.load(handle)
        X = [row[0] for row in cached]
        y = [row[1] for row in cached]

    # scale features
    self.scaler = MinMaxScaler(feature_range=(-1, 1))
    self.scaler.fit(X)
    X = self.scaler.transform(X)

    # feature selection
    clf = LassoCV(cv=10)
    self.feat_selector = SelectFromModel(clf)
    self.feat_selector.fit(X, y)
    X = self.feat_selector.transform(X)

    self.model = self.train_svm(trainvectors=X, labels=y, c='search',
                                kernel='search', gamma='search',
                                degree='search', jobs=4)
    # Pass `extra=d` like the opening log call of this method.
    logging.info('Finishing to train svm.', extra=d)
# Stack the positive and negative labels and hold out a third for testing.
Y = np.concatenate((Positive_y, Negitive_y))
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=7)

# Baseline: XGBoost on every feature.
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Use every importance value in turn as the selection threshold and refit
# on the features that reach it.
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)

    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)

    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" %
          (thresh, select_X_train.shape[1], accuracy * 100.0))

# (index, importance) pairs, most important first.
b = sorted(enumerate(model.feature_importances_),
           key=lambda x: x[1],
           reverse=True)
# Indices of the MAX_LEN most important features.
# Fixed: the indices were cast to uint8, which silently wraps any feature
# index >= 256; they are now kept as native integer indices.
a = np.array([feature_idx for feature_idx, _ in b[:MAX_LEN]], dtype=np.intp)
[0.82122905 0.83240223 0.81460674 0.8258427 0.85875706] 0.8305675570530682 ''' bagging_clf = BaggingRegressor(lr, n_estimators=10, max_samples=0.8, max_features=1.0, n_jobs=-1) # here we can even set bootstrap=false to get duplicate samples evaluate_model(bagging_clf) from sklearn.feature_selection import SelectFromModel lr = LogisticRegression(C=20, penalty='l2', tol=1e-8) selector = SelectFromModel(lr, threshold='1.25*median') selector.fit(train_x, train_y) train_x2 = selector.transform(train_x) print(train_x.columns[selector.get_support()]) lr.fit(train_x2, train_y) print(lr.score(train_x2, train_y)) print(lr.score(selector.transform(test_x),test_y)) cvs = cross_val_score(lr, selector.transform(train_X), train_Y, cv=5) print(cvs) print(np.mean(cvs), np.std(cvs)) ''' 0.8475120385232745
# Rescale the numeric features.
scaler = MinMaxScaler()
data1 = scaler.fit_transform(data[numeric_feature])

# One-hot encode the categorical features into a dense array.
encoder = OneHotEncoder(categories='auto', sparse=False)
data2 = encoder.fit_transform(data[categorical_feature])

# Merge the preprocessed features column-wise.
x = np.concatenate((data1, data2), axis=1)
print('number of features after preprocessing: %d' % len(x[0]))
print("")

# Use extra trees for feature selection.
clf = ExtraTreesClassifier(n_jobs=-1, random_state=0)
clf = clf.fit(x, y)
model = SelectFromModel(clf, prefit=True)
x = model.transform(x)
print('number of features after feature selection: %d' % len(x[0]))
print("")

# Pair every pre-selection feature name with its importance and report the
# 30 most important ones.
categorical_name = encoder.get_feature_names(categorical_feature)
feature_name = np.append(numeric_feature, categorical_name)
feature_importance = clf.feature_importances_
print('average feature importance: %f' % feature_importance.mean())
print("")
importance = dict(zip(feature_name, feature_importance))
importance_sorted = sorted(importance.items(),
                           key=lambda pair: pair[1],
                           reverse=True)
print("top 30 features with high importance:")
print(importance_sorted[:30])
print("")
# Chi-squared selector: names of the columns it kept.
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()
print(len(chi_feature), 'selected features')

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination around a logistic regression, dropping
# 10 features per step.
rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000),
                   n_features_to_select=num_feats,
                   step=10,
                   verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
print(len(rfe_feature), 'selected features')

from sklearn.feature_selection import SelectFromModel

# Embedded selection: L1-penalized logistic regression.
embeded_lr_selector = SelectFromModel(
    LogisticRegression(solver='saga', penalty="l1", max_iter=1000),
    max_features=num_feats,
)
embeded_lr_selector.fit(X_norm, y)
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.columns[embeded_lr_support].tolist()
print(len(embeded_lr_feature), 'selected features')

from sklearn.ensemble import RandomForestClassifier

# Embedded selection: random forest importances (fit on the raw X).
embeded_rf_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100),
    max_features=num_feats,
)
embeded_rf_selector.fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.columns[embeded_rf_support].tolist()
print(len(embeded_rf_feature), 'selected features')
optimizers = ['rmsprop', 'adam', 'adadelta'] return { 'deep__batch_size': batches, 'deep__epochs': epochs, 'deep__act': activation, 'deep__drop': dropout, 'deep__optimizer': optimizers } for i in range(len(multi_XGB.estimators_)): threshold = np.sort(multi_XGB.estimators_[i].feature_importances_) for thres in threshold: selection = SelectFromModel(multi_XGB.estimators_[i], threshold=thres, prefit=True) select_x_train = selection.transform(x_train) select_x_test = selection.transform(x_test) select_x_pred = selection.transform(x_pred) def build_model(drop=0.5, optimizer='adam', act='relu'): if act == 'leaky': act = leaky inputs = Input(shape=(select_x_train.shape[1], )) x = Dense(51, activation=act)(inputs) x = Dropout(drop)(x) x = Dense(150, activation=act)(x) x = Dropout(drop)(x) x = Dense(300, activation=act)(x)
@简介:对特征进行嵌入式选择 @author: Jian """ import time import pickle from sklearn.feature_selection import SelectFromModel from sklearn.svm import LinearSVC t_start = time.time() """读取特征""" features_path = './data_tfidf_100000.pkl' #tfidf特征的路径 fp = open(features_path, 'rb') x_train, y_train, x_test = pickle.load(fp) fp.close() """进行特征选择""" alo_name = 'lsvc_l2' lsvc = LinearSVC(penalty='l2', C=1.0, dual=True).fit(x_train, y_train) slt = SelectFromModel(lsvc, prefit=True) x_train_s = slt.transform(x_train) x_test_s = slt.transform(x_test) """保存选择后的特征至本地""" num_features = x_train_s.shape[1] data_path = './' + features_path.split( '.')[-2] + '_select_' + alo_name + '_' + str(num_features) + '.pkl' data_f = open(data_path, 'wb') pickle.dump((x_train_s, y_train, x_test_s), data_f) data_f.close() t_end = time.time() print("特征选择完成,选择{}个特征,共耗时{}min".format(num_features, (t_end - t_start) / 60))
def run_grid_pipeline(self, features, labels, standardization_colms,
                      parameters, estimator,
                      feature_selection_threshold_type):
    """Fit a scale -> select -> grid-search pipeline and report the results.

    Args:
        features: DataFrame of input features (its ``columns`` are used to
            name the selected features).
        labels: Target values passed to ``Pipeline.fit``.
        standardization_colms: Columns handed to the ``StandardScaler``
            step; every other column is passed through unchanged.
        parameters: ``param_grid`` for ``GridSearchCV``.
        estimator: Estimator tuned by ``GridSearchCV``.
        feature_selection_threshold_type: ``threshold`` argument of
            ``SelectFromModel``.

    Returns:
        The fitted ``Pipeline``. The names of the selected columns are
        stored in ``self.selected_columns``.
    """
    # Scale the requested columns; pass the rest through untouched.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), standardization_colms),
        ],
        n_jobs=self.n_jobs,
        remainder='passthrough')

    feature_selection_clf = RandomForestClassifier(
        random_state=self.random_state, n_jobs=self.n_jobs)
    feature_selection_model = SelectFromModel(
        feature_selection_clf, threshold=feature_selection_threshold_type)

    grid = GridSearchCV(estimator=estimator,
                        param_grid=parameters,
                        cv=5,
                        scoring='accuracy',
                        refit=True,
                        n_jobs=-1)

    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('feature_selection', feature_selection_model),
                               ('grid_search', grid)])
    pipeline.fit(features, labels)

    # Report the best parameters and the mean/2*std score of every candidate.
    results = pipeline['grid_search']
    print('BEST PARAMS: {}\n'.format(results.best_params_))
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3),
                                         params))

    # Recover the selected column names from the selector's boolean mask.
    # The mask is aligned with the ColumnTransformer output, which places the
    # scaled columns first and the passthrough remainder after them, so the
    # names are put in that order before masking. (Previously the mask was
    # applied to the raw column order via an inverse_transform round trip.)
    support = pipeline['feature_selection'].get_support()
    original_order = list(features.columns)
    try:
        scaled = list(standardization_colms)
    except TypeError:
        scaled = []
    if scaled and all(col in original_order for col in scaled):
        transformed_order = scaled + [
            col for col in original_order if col not in scaled
        ]
    else:
        # Columns were not given as names; keep the original order.
        transformed_order = original_order
    self.selected_columns = pd.Index(transformed_order)[support]

    print(
        '\nColumns selected for {0} threshold'.format(
            feature_selection_threshold_type), self.selected_columns)

    return pipeline
y, test_size=0.10) vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 2)) X_train_rf = vectorizer.fit_transform(X_train) rf = RandomForestClassifier(n_estimators=400, verbose=100, n_jobs=-1, random_state=0, max_samples=5000) rf.fit(X_train_rf, y_train) feature_selector = SelectFromModel(rf, prefit=True, max_features=100000) svc_set = pd.concat([X_train, y_train], axis=1) svc_set = svc_set.sample(100000, random_state=0) svc_X = svc_set['review'] svc_y = svc_set['label'] svc_X = vectorizer.transform(svc_X) svc_X = feature_selector.transform(svc_X) svc = SVC(cache_size=1000, random_state=0) svc.fit(svc_X, svc_y) final_pipe = make_pipeline(vectorizer, feature_selector, svc)
def feature_selection(self):
    """Select features with a random forest under two thresholds.

    One-hot encodes ``self.onehot_colms``, draws one stratified training
    split, fits a ``RandomForestClassifier`` on it and stores the columns
    kept by ``SelectFromModel`` with the ``'mean'`` and ``'median'``
    thresholds in ``self.selected_columns_mean`` and
    ``self.selected_columns_median``. Both results are also printed.
    """
    onehot_labels = self.original_labels

    # Replace the categorical columns with their one-hot expansion.
    onehot_encoder = OneHotEncoder(handle_unknown='error', sparse=False)
    categorical = self.original_features[self.onehot_colms]
    onehot_encoder.fit(categorical)
    encoded = pd.DataFrame(
        onehot_encoder.transform(categorical),
        index=self.original_features.index,
        columns=onehot_encoder.get_feature_names(self.onehot_colms))
    onehot_features = self.original_features.join(encoded, how='inner')
    onehot_features = onehot_features.drop(columns=self.onehot_colms)

    def kept_columns(fitted_clf, threshold, frame):
        # Round-trip through the selector: dropped columns come back as
        # zeros, so the kept ones are those with non-zero variance.
        selector = SelectFromModel(fitted_clf, prefit=True,
                                   threshold=threshold)
        reduced = selector.transform(frame)
        restored = pd.DataFrame(selector.inverse_transform(reduced),
                                index=frame.index,
                                columns=frame.columns)
        return restored.columns[restored.var() != 0]

    splitter = StratifiedShuffleSplit(n_splits=1,
                                      train_size=self.train_ratio,
                                      random_state=self.random_state)
    for train_indx, _ in splitter.split(onehot_features, onehot_labels):
        train_frame = onehot_features.iloc[train_indx]

        # A random forest gives a non-linear decision boundary.
        clf = RandomForestClassifier(random_state=self.random_state,
                                     n_jobs=self.n_jobs)
        clf.fit(train_frame, onehot_labels.iloc[train_indx])

        self.selected_columns_mean = kept_columns(clf, 'mean', train_frame)
        print('Mean threshold:', self.selected_columns_mean)

        self.selected_columns_median = kept_columns(clf, 'median',
                                                    train_frame)
        print('Median threshold', self.selected_columns_median)
def gbdt_feature_selection(fe_name, matrix_x_temp, label_y, th):
    """Select features with a gradient-boosting model and report the result.

    Fits a ``GradientBoostingClassifier`` on ``matrix_x_temp``/``label_y``,
    keeps the columns ``SelectFromModel`` retains for threshold ``th``, then
    prints the importances and writes them to
    ``../eda/A_gbdt_feature_importance.txt``, writes the chosen names to
    ``../eda/A_gbdt_feature_chose.txt`` and dumps a '0'/'1' chromosome string
    to ``../config/chromosome.pkl``.

    Args:
        fe_name: Feature names, paired positionally with the importances.
            The last entry is left out of the chromosome.
        matrix_x_temp: Input feature matrix.
        label_y: Target labels.
        th: Threshold forwarded to ``SelectFromModel``.

    Returns:
        ``(matrix_x, feature_not_used_name, n_used)``: the reduced matrix,
        the names in ``fe_name`` that were not chosen, and the number of
        chosen names.
    """
    # SelectFromModel on a fitted GBDT
    clf = GradientBoostingClassifier(n_estimators=200, random_state=100)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Report how many features have a non-zero importance.
    feature_score_dict = dict(zip(fe_name, clf.feature_importances_))
    zero_count = sum(1 for score in feature_score_dict.values()
                     if score == 0.0)
    print('number of not-zero features:' +
          str(len(feature_score_dict) - zero_count))

    # Print the importances, most important first.
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1],
                                       reverse=True)
    print('feature_importance:')
    for name, score in feature_score_dict_sorted:
        print('%s %s' % (name, score))
    print('\n')

    # `with` closes the file even on error; str(th) makes a numeric
    # threshold writable (write() only accepts text).
    with open('../eda/A_gbdt_feature_importance.txt', 'w') as f:
        f.write(str(th))
        f.write('\nRank\tFeature Name\tFeature Importance\n')
        for rank, (name, score) in enumerate(feature_score_dict_sorted):
            f.write(str(rank) + '\t' + str(name) + '\t' + str(score) + '\n')

    # The chosen features are the top `how_long` entries of the ranking,
    # where how_long is the column count of the reduced matrix.
    how_long = matrix_x.shape[1]
    feature_used_name = [
        name for name, _ in feature_score_dict_sorted[:how_long]
    ]
    print('feature_chooesed:')
    for name in feature_used_name:
        print(name)
    print('\n')

    with open('../eda/A_gbdt_feature_chose.txt', 'w') as f:
        f.write('Feature Chose Name :\n')
        for name in feature_used_name:
            f.write(str(name) + '\n')

    # Names that were not chosen (set lookup instead of scanning the list).
    used = set(feature_used_name)
    feature_not_used_name = [name for name in fe_name if name not in used]

    # Build a chromosome such as '01011100': one bit per feature, skipping
    # the last entry of fe_name.
    chromosome_temp = ''.join('1' if name in used else '0'
                              for name in fe_name[:-1])
    print('Chromosome:')
    print(chromosome_temp)
    joblib.dump(chromosome_temp, '../config/chromosome.pkl')
    print('\n')

    return matrix_x, feature_not_used_name, len(feature_used_name)
""" # #加载库 from sklearn.ensemble import RandomForestClassifier from sklearn import datasets from sklearn.feature_selection import SelectFromModel import numpy as np #加载数据 iris = datasets.load_iris() features = iris.data target = iris.target #创建随机森林分类器对象 randomforest = RandomForestClassifier(random_state=0, n_jobs=-1) #创建对象,选择重要性大于或等于阈值的特征 selector = SelectFromModel(randomforest, threshold=0.3) #使用选择器创建新的特征矩阵 features_important = selector.fit_transform(features, target) #使用重要特征训练随机森林模型 model = randomforest.fit(features_important, target) #计算特征的重要性 importances = model.feature_importances_ #查看模型中每个特征的重要程度 print(importances) #将特征的重要性按降序排列 indices = np.argsort(importances)[::-1] #按照特征的重要性对特征名称重新排序 names = [iris.feature_names[i] for i in indices]
fit_mod = sel.fit(data_np, target_np) print(sel.ranking_) sel_idx = fit_mod.get_support() if fs_type == 2: #Wrapper Select via model if binning == 0: clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=3, min_samples_leaf=1, max_features=None, random_state=rand_st) sel = SelectFromModel( clf, prefit=False, threshold='mean', max_features=None ) #to select only based on max_features, set to integer value and set threshold=-np.inf print('Wrapper Select: ') if binning == 1: rgr = '''Unused in this homework''' sel = SelectFromModel(rgr, prefit=False, threshold='mean', max_features=None) print('Wrapper Select: ') fit_mod = sel.fit(data_np, target_np) sel_idx = fit_mod.get_support() if fs_type == 3: if binning == 1: ######Only work if the Target is binned###########
# Attach the status label to the traces and drop incomplete rows.
df = df.set_index('trace:id')
df_join = df2['status']
df_all = df.join(df_join, how='inner').dropna()

# Separate the target from the features; 'qr' is not used as a feature.
y = df_all.pop('status')
df_all = df_all.drop(columns='qr')
print(y)
print(len(df_all))

# plot high dim data
plot_data(df_all, y, 'TSNE', 2)
#print(df_all.columns)

# Forest-based feature selection feeding a logistic regression.
selection_forest = RandomForestClassifier(n_estimators=100,
                                          max_depth=2,
                                          random_state=0)
pipe = make_pipeline(
    SelectFromModel(estimator=selection_forest),
    LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=5000),
)

# Class-weighted baselines.
lr = LogisticRegression(solver='lbfgs',
                        multi_class='auto',
                        max_iter=5000,
                        class_weight='balanced')
#rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf = RandomForestClassifier(n_estimators=500,
                            max_depth=5,
                            random_state=0,
                            class_weight={'nok': 4, 'ok': 1})

# Mean 5-fold accuracy of each baseline.
for baseline in (lr, rf):
    print(cross_val_score(baseline, df_all, y, scoring='accuracy',
                          cv=5).mean())
# Mean squared error on the raw and on the standardized targets.
error_train = mean_squared_error(y_tr, y_tr_pred)
error_test = mean_squared_error(y_ts, y_ts_pred)
error_std_train = mean_squared_error(y_std_tr, y_std_tr_pred)
error_std_test = mean_squared_error(y_std_ts, y_std_ts_pred)
print("---------------------------------------")
print("# Mean Squared Error:")
print(regressor_name + " MSE train: %.3f, test: %.3f" %
      (error_train, error_test))
print(regressor_name + " STD MSE train: %.3f, test: %.3f" %
      (error_std_train, error_std_test))

# Performance improvement
print("\n\n\n======================")
print("PERFORMANCE IMPROVEMENT")

# Raise the selection threshold until at most 4 features remain.
clf = LassoCV(cv=5)
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(x, y)
# Fixed: x_new used to be assigned only inside the loop, so it was undefined
# whenever the initial threshold already left 4 or fewer features.
x_new = sfm.transform(x)
n_features = x_new.shape[1]
while n_features > 4:
    sfm.threshold += 0.1
    x_new = sfm.transform(x)
    n_features = x_new.shape[1]

# Standardizing
sc_x = StandardScaler()
x_std_new = sc_x.fit_transform(x_new)
sc_y = StandardScaler()
y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()

# Splitting train and test data
drop=False, nan=False), 'clipper': OutliersClipper(columns=['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']), 'combinations': FeatureProduct(columns=['LotFrontage', 'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GrLivArea']), 'dropper__drop': ['LotFrontage_nan', 'MasVnrArea_nan', 'GarageYrBlt_nan'], 'main_imputer': HotDeckFullImputer(col_k_pairs=[('LotFrontage', None), ('MasVnrArea', None), ('GarageYrBlt', None)], default_k=5), 'poly': PolynomialsAdder(powers_per_column={'LotFrontage': [2], 'LotArea': [2], 'MasVnrArea': [2], 'BsmtFinSF1': [2], 'BsmtFinSF2': [2], 'BsmtUnfSF': [2], 'TotalBsmtSF': [2], '1stFlrSF': [2], '2ndFlrSF': [2], 'LowQualFinSF': [2], 'GrLivArea': [2], 'GarageArea': [2], 'WoodDeckSF': [2], 'OpenPorchSF': [2], 'EnclosedPorch': [2], '3SsnPorch': [2], 'ScreenPorch': [2], 'PoolArea': [2], 'MiscVal': [2]}), 'predictor': DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best'), 'reduce_dim': SelectFromModel(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False), norm_order=1, prefit=False, threshold=None), 'simple_imputer': FillNaTransformer(from_dict={}, mean=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], median=[], nan_flag=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], zero=[])}, 'score': 37866.96889728187, 'std': 5359.25597193946}, 
'Lasso': {'params': {'binner': None, 'binner2': CustomBinaryBinner(configuration={'LotFrontage': {'values': [182.0]}, 'LotArea': {'values': [215245]}, 'MasVnrArea': {'values': [1378.0]}, 'BsmtFinSF1': {'values': [2188]}, 'BsmtFinSF2': {'values': [1120]}, 'BsmtUnfSF': {'values': [2336]}, 'TotalBsmtSF': {'values': [3206]}, '1stFlrSF': {'values': [3228]}, '2ndFlrSF': ... [2010.0]}, 'GarageCars': {'values': [4]}, 'MoSold': {'values': [12]}, 'YrSold': {'values': [2010]}}, drop=False, nan=False), 'clipper': None, 'combinations': FeatureProduct(columns=['LotFrontage', 'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GrLivArea']), 'dropper__drop': ['LotFrontage_nan', 'MasVnrArea_nan', 'GarageYrBlt_nan'], 'main_imputer': HotDeckFullImputer(col_k_pairs=[('LotFrontage', None), ('MasVnrArea', None), ('GarageYrBlt', None)],
train['Trap'] = lbl.transform(train['Trap'].values)

# Label-encode the two weather-code columns, refitting the encoder on each
# column's own (train-only) values.
for _code_col in ('CodeSum_x', 'CodeSum_y'):
    lbl.fit(list(train[_code_col].values))
    train[_code_col] = lbl.transform(train[_code_col].values)

########################################################################################################################
train = train.astype(float)
#train = train.loc[:,(train != -1).any(axis=0)]

# Separate the target from the features.
label = train['WnvPresent']
train = train.drop(columns='WnvPresent')

# L1-penalized linear SVC as the feature selector.
sfm = SelectFromModel(
    LinearSVC(penalty='l1', loss='squared_hinge', dual=False))
data = sfm.fit_transform(train, label)
data = preprocessing.scale(data)
#data = preprocessing.scale(train)

transformer = FunctionTransformer(np.log1p, validate=True)
# NOTE(review): the result of this call is discarded, so log1p is never
# applied to `data` - confirm whether that is intended.
transformer.transform(data)
data = preprocessing.normalize(data, norm='l2')

feature_cols = train.columns
databackup = data
# Map the reduced matrix back onto the original columns (dropped columns
# come back as zeros), then keep only the columns with non-zero variance.
data = pd.DataFrame(sfm.inverse_transform(data),
                    index=train.index,
                    columns=feature_cols)
selCols = data.columns[data.var() != 0]
data = data[selCols]
def main():
    """Train, evaluate and write a submission for the TARGET classification task.

    Steps, in order:
      1. Read the pipe-separated train and test files.
      2. Drop ``ID``, de-duplicate the train set and split it into a part that
         is oversampled with SMOTE and a held-out part used for evaluation.
      3. Run ``feature_engineering_df`` on the three feature matrices.
      4. Select features with an ExtraTrees-based ``SelectFromModel``.
      5. Fit a k-nearest-neighbours classifier and print evaluation metrics
         for the held-out part.
      6. Predict the test file and write ``submission.csv``.

    Relies on the module-level ``test_size_value`` and ``random_state_value``
    settings and on ``feature_engineering_df`` defined elsewhere in the file.
    Returns nothing; the outputs are the printed metrics and the CSV file.
    """
    # Input datasets as pandas dataframes
    df_original = pd.read_csv('Predictive Modelling Train.txt', decimal=",", sep="|")
    df_original_predict = pd.read_csv('Predictive Modelling Test.txt', decimal=",", sep="|")

    #---------------------------------------------------------------------------#
    # 3- DATA MANIPULATION                                                      #
    #---------------------------------------------------------------------------#

    # Train - Drop ID and remove duplicates in train set.
    # `axis` is passed by keyword: the positional form was removed in pandas 2.0.
    df_original = df_original.drop(['ID'], axis=1)
    df_original = df_original.drop_duplicates()

    # Predict - Save ID of dataframe to predict
    test_id = df_original_predict.ID

    # Predict - Drop ID and convert to numpy array
    test_preprocessed = df_original_predict.drop(["ID"], axis=1)
    X_predict = np.array(test_preprocessed)

    # Train - Separate class and features
    X = np.array(df_original.drop(['TARGET'], axis=1))
    y = np.array(df_original.TARGET.values)

    # Split train dataset
    X_to_balance, X_real_test, y_to_balance, y_real_test = train_test_split(
        X, y, test_size=test_size_value, random_state=random_state_value)

    # Oversample data (TARGET=1) to balance. Only the training part is
    # resampled, so the held-out part keeps the original class distribution.
    # NOTE(review): `kind=` and `fit_sample` are the older imbalanced-learn
    # spelling; newer releases use `fit_resample` - confirm the pinned version.
    sm = SMOTE(kind='regular')
    X_balanced, y_balanced = sm.fit_sample(X_to_balance, y_to_balance)

    # Create new features
    df_real = feature_engineering_df(X_real_test, df_original)
    df_balanced = feature_engineering_df(X_balanced, df_original)
    df_test_processed = feature_engineering_df(X_predict, df_original)

    # Convert pandas dataframes to numpy arrays
    X_balanced = np.array(df_balanced)
    X_real_test = np.array(df_real)
    X_predict = np.array(df_test_processed)

    #---------------------------------------------------------------------------#
    # 4- FEATURE SELECTION                                                      #
    #---------------------------------------------------------------------------#

    # Define classifier for feature importance and selection
    clf1 = ExtraTreesClassifier(n_jobs=-1, random_state=random_state_value)
    selector = clf1.fit(X_balanced, y_balanced)

    # Choose best features (the estimator is already fitted, hence prefit=True)
    fs = SelectFromModel(selector, prefit=True)

    # Discard non selected features
    X_real_test = fs.transform(X_real_test)
    X_balanced = fs.transform(X_balanced)
    X_predict_final = fs.transform(X_predict)

    #---------------------------------------------------------------------------#
    # 5- MODEL TRAIN + FIT                                                      #
    #---------------------------------------------------------------------------#

    # Define prediction classifier and fit
    clf2 = KNeighborsClassifier(n_jobs=-1, n_neighbors=9)
    clf2.fit(X_balanced, y_balanced)

    #---------------------------------------------------------------------------#
    # 6- MODEL EVALUATION                                                       #
    #---------------------------------------------------------------------------#
    # !!! IMPORTANT: COMMENT WHOLE BLOCK WHEN DOING REAL TRAINING AND PREDCTION (test_size_value = 0)

    # Print used classifiers and their parameters
    print("Feature selection classifier: ", clf1, "\n")
    print("Model classifier: ", clf2, "\n")

    # Calculate predictions: positive-class probabilities for the AUC,
    # hard labels for accuracy / F1 / confusion matrix.
    y_pred = clf2.predict_proba(X_real_test)[:, 1]
    y_pred_int = clf2.predict(X_real_test)

    # Evaluate model
    print("Roc AUC: ", roc_auc_score(y_real_test, y_pred, average='macro'))
    accuracy = clf2.score(X_real_test, y_real_test)
    print("Accuracy: ", accuracy)
    print("f1 Score: ", f1_score(y_real_test, y_pred_int, average='macro'))

    # Hardcoded benchmark of filling prediction with most common class (0)
    zeros_benchmark = 1 - 7477 / 459992
    print("Filling with 0's benchmark: ", zeros_benchmark)

    # Fixed accuracy with zeros_benchmark: improvement over the all-zeros
    # baseline, rescaled by the headroom that baseline leaves.
    print("Fixed accuracy with benchmark: ",
          (accuracy - zeros_benchmark) / (1 - zeros_benchmark))

    # Confusion matrix
    conf_matrix = confusion_matrix(y_real_test, y_pred_int)
    print("\n[Confusion Matrix]: \n", conf_matrix)

    print(
        "\n------------------------------------------------------------------\n\n"
    )

    #---------------------------------------------------------------------------#
    # 7- PREDICTION AND SUBMISSION                                              #
    #---------------------------------------------------------------------------#

    # Make prediction
    predict_submission = clf2.predict(X_predict_final)

    # Save in csv
    submission = pd.DataFrame({"ID": test_id, "TARGET": predict_submission})
    submission.to_csv("submission.csv", index=False, sep="|")
# Imports used by this section. GradientBoostingClassifier is taken from the
# public `sklearn.ensemble` package: the `sklearn.ensemble.gradient_boosting`
# module path was deprecated in scikit-learn 0.22 and later removed.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Fit a random forest on the training rows (the first `train_objs_num` rows of
# the combined `data` frame) to obtain per-feature importances.
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(data[:train_objs_num], y)

# Horizontal bar chart of the importances, sorted ascending.
features = pd.DataFrame()
features['feature'] = data.columns
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
features.plot(kind='barh', figsize=(25, 25))

# Keep only the features the already-fitted forest ranks above the selector's
# default threshold, applying the same selection to train and test rows.
model = SelectFromModel(clf, prefit=True)
train_reduced = model.transform(data[:train_objs_num])
test_reduced = model.transform(data[train_objs_num:])
print(train_reduced.shape,test_reduced.shape)

logreg = LogisticRegression()
idx.append(k) # 计算这一类特征的权值系数均值 mean = coef / len(idx) self.coef_[i][idx] = mean return self import scipy.io as sio image_path = r'./mat/simple.mat' image_D = sio.loadmat(image_path) X = image_D['dataset'] y = image_D['label2'] ''' iris = load_iris() X, y = iris.data, iris.target y.resize((150,1)) y = np.hstack((y,np.zeros((150,1)))) ''' from sklearn.ensemble import ExtraTreesClassifier from sklearn.feature_selection import SelectFromModel X2 = np.delete(X,range(16,32,1),1) model = ExtraTreesClassifier() model.fit(X2, y) print(model.feature_importances_) #带L1和L2惩罚项的逻辑回归作为基模型的特征选择 #参数threshold为权值系数之差的阈值 a = SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(X, y) lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y) a = SelectFromModel(lsvc,prefit=True) a = a.transform(X) print("X_new 共有 %s 个特征"%a.shape[1])
# Target vector for madelon: a detached copy of the 'Class' column as an array.
madelonY = madelon['Class'].copy().values

# Stratified 70/30 hold-out split of each dataset, with a fixed seed.
adult_trgX, adult_tstX, adult_trgY, adult_tstY = ms.train_test_split(
    adultX,
    adultY,
    test_size=0.3,
    random_state=0,
    stratify=adultY,
)
madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
    madelonX,
    madelonY,
    test_size=0.3,
    random_state=0,
    stratify=madelonY,
)

# Adult: standardise, then a neural network.
pipeA = Pipeline([
    ('Scale', StandardScaler()),
    ('MLP', MLPClassifier(max_iter=2000, early_stopping=True, random_state=55)),
])

# Madelon: standardise, then four successive random-forest culls (each keeps
# the features at or above the median importance), then the same network.
# The culling steps differ only in their name suffix and forest seed.
pipeM = Pipeline(
    [('Scale', StandardScaler())]
    + [
        (
            'Cull%d' % seed,
            SelectFromModel(RandomForestClassifier(random_state=seed),
                            threshold='median'),
        )
        for seed in (1, 2, 3, 4)
    ]
    + [('MLP', MLPClassifier(max_iter=2000, early_stopping=True, random_state=55))]
)

# Size of the second axis of adultX.
d = adultX.shape[1]