def predict_TestData(Food_df, People_df):
    """Train ExtraTrees and LinearSVC on a random split and save test predictions.

    Rows of ``People_df`` are labelled 1 and rows of ``Food_df`` 0. Each frame
    is split by an independent random boolean mask; the feature matrix is
    every column from position 2 onward. Ground truth and both models'
    predictions for the held-out rows are written to
    "prediction_for_TestData.csv".
    """
    # Independent random train/test masks for each frame.
    cTrainF = rand(len(Food_df)) > .5
    cTestF = ~cTrainF
    cTrainP = rand(len(People_df)) > .5
    cTestP = ~cTrainP

    TrainX_df = pd_concat([People_df[cTrainP], Food_df[cTrainF]], axis=0)
    TestX_df = pd_concat([People_df[cTestP], Food_df[cTestF]], axis=0)

    # NOTE(review): ``.ix`` is gone from recent pandas; ``.iloc[:, 2:]`` is the
    # positional equivalent if the column labels are not integers -- confirm.
    TrainX = TrainX_df.ix[:, 2:].values
    TestX = TestX_df.ix[:, 2:].values

    # Label order matches the concat order above: people first, then food.
    TrainY = concatenate([ones(len(People_df[cTrainP])),
                          zeros(len(Food_df[cTrainF]))])
    TestY = concatenate([ones(len(People_df[cTestP])),
                         zeros(len(Food_df[cTestF]))])

    # NOTE(review): recent scikit-learn rejects min_samples_split=1 -- confirm
    # the targeted version.
    ET_classifier = ExtraTreesClassifier(n_estimators=50, max_depth=None,
                                         min_samples_split=1, random_state=0)
    ET_classifier.fit(TrainX, TrainY)
    ET_prediction = ET_classifier.predict(TestX)

    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX, TrainY)
    LinSVC_predict = LinSVC_classifier.predict(TestX)

    a = DataFrame()
    a["url"] = TestX_df.urls.values
    a["answer"] = TestY
    a["ET_predict"] = ET_prediction
    a["LinSVC_predict"] = LinSVC_predict
    a.to_csv("prediction_for_TestData.csv")
def tree(train_data, train_labels, all_bigrams, task):
    """Rank features with an ExtraTrees forest, print the top ones, save a plot.

    Fits a 100-tree forest, prints up to 20 top-ranked features (names looked
    up in ``all_bigrams``), saves a bar chart of up to 2000 sorted importances
    to "<task>.pdf", and returns the feature indices sorted by decreasing
    importance.
    """
    forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
    forest.fit(train_data, train_labels)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("-" * 45)
    print(task)
    # Capped so data with fewer than 20 features does not raise IndexError.
    for f in range(min(20, len(indices))):
        print("%d. feature, name: %s, importance: %f"
              % (f + 1, all_bigrams[indices[f]], importances[indices[f]]))

    # Plot the feature importances of the forest
    pl.figure()
    # At most 2000 bars; capped at the real feature count so the x positions
    # and heights always have the same length.
    n = min(2000, train_data.shape[1])
    pl.title("Sorted feature importance for %s" % (task))
    pl.bar(range(n), importances[indices][:n], color="black", align="center")
    pl.xlim([0, (n)])
    pl.xticks([num for num in range(0, n + 1, 250)])
    pl.savefig(task + '.pdf', bbox_inches='tight')
    print("plot saved")
    return indices
def main():
    """Load (or build and cache) the training samples, fit the forest, evaluate it.

    Training data is read from "_cached_training.json" when that file can be
    opened; otherwise it is sampled via ``load_training_vector`` and the cache
    file is written for the next run.
    """
    # Define the known data points or "training" data
    explanatory_fields = ("d100 dd0 dd5 fday ffp gsdd5 gsp map mat_tenths "
                          "mmax_tenths mmindd0 mmin_tenths mtcm_tenths "
                          "mtwm_tenths sday").split()
    explanatory_rasters = [os.path.join(TRAINING_DIR, "current_" + r + ".img")
                           for r in explanatory_fields]
    response_shapes = os.path.join(TRAINING_DIR, "DF.shp")

    # Load the training rasters using the sampled subset
    try:
        # ``with`` closes the cache file; the original leaked the handle.
        with open("_cached_training.json") as fh:
            cached = json.load(fh)
        train_xs = np.array(cached['train_xs'])
        train_y = np.array(cached['train_y'])
    except IOError:
        train_xs, train_y = load_training_vector(
            response_shapes, explanatory_rasters, response_field='GRIDCODE')
        cache = {'train_xs': train_xs.tolist(), 'train_y': train_y.tolist()}
        with open("_cached_training.json", 'w') as fh:
            fh.write(json.dumps(cache))

    print(train_xs.shape, train_y.shape)

    # Train the classifier
    clf = ExtraTreesClassifier(n_estimators=120, n_jobs=3)
    clf.fit(train_xs, train_y)
    print(clf)

    evaluate_clf(clf, train_xs, train_y, feature_names=explanatory_fields)
def eval_param(params):
    """Evaluate one ExtraTreesClassifier parameter set with 3-fold stratified CV.

    For each fold a classifier is fitted, scored with negative log-loss on the
    held-out part, and used to predict class probabilities for the global
    ``test`` set. The fold-averaged test probabilities are saved under
    "hyperopt_preds/", and the score plus parameters are appended to the
    global ``df_results`` (also dumped to "hyperopt_results_sgd.csv").

    Returns:
        dict with 'loss' (the mean log-loss) and 'status' (``STATUS_OK``).
    """
    global df_results, train, target, test

    print("Training with params : ")
    print(params)

    random_state = 42
    avg_score = 0.
    n_folds = 3
    # Set on the first fold: predict_proba returns an (n_samples, n_classes)
    # matrix, which cannot be added in place to a 1-D zero vector.
    predict = None
    skf = StratifiedKFold(target, n_folds=n_folds, random_state=random_state)
    for train_index, cv_index in skf:
        # train
        x_train, x_cv = train[train_index], train[cv_index]
        y_train, y_cv = target[train_index], target[cv_index]
        clf = ExtraTreesClassifier(**params).fit(x_train, y_train)

        # test / score -- predict_proba accepts the features only.
        predict_cv = clf.predict_proba(x_cv)
        avg_score += -log_loss(y_cv, predict_cv)
        fold_predict = clf.predict_proba(test)
        predict = fold_predict if predict is None else predict + fold_predict
    predict /= n_folds
    avg_score /= n_folds

    # store
    new_row = pd.DataFrame(
        [np.append([avg_score], list(params.values()))],
        columns=np.append(['score'], list(params.keys())))
    df_results = pd.concat([df_results, new_row], ignore_index=True)
    np.savetxt('hyperopt_preds/pred' + str(df_results.index.max()) + '.txt',
               predict, fmt='%s')
    df_results.to_csv('hyperopt_results_sgd.csv')
    print("\tScore {0}\n\n".format(avg_score))
    return {'loss': - avg_score, 'status': STATUS_OK}
def calc_prob(df_features_driver, df_features_other):
    """Fit an ExtraTrees model on driver-vs-other trips and score the driver's trips.

    The two frames are stacked (driver rows first), the ``Driver`` column is
    used as the integer target, and every column from position 4 onward of
    the re-indexed frame is used as a feature. Probabilities are predicted for
    the first 200 stacked rows.

    Returns:
        DataFrame with 'driver_trip' (from ``create_first_column``) and 'prob'
        (the second column of ``predict_proba``).
    """
    df_train = pd.concat([df_features_driver, df_features_other])
    # reset_index() inserts the old index as a column, which shifts positions.
    df_train.reset_index(inplace=True)
    df_train.Driver = df_train.Driver.astype(int)

    model = ExtraTreesClassifier(n_estimators=100, max_features='auto',
                                 random_state=0, n_jobs=2,
                                 criterion='entropy', bootstrap=True)

    feature_columns = df_train.iloc[:, 4:]

    # Train the classifier
    model.fit(feature_columns, df_train.Driver)

    df_submission = pd.DataFrame()
    df_submission['driver_trip'] = create_first_column(df_features_driver)

    # Return array with the probability for every driver.
    # NOTE(review): the hard-coded 200 presumably equals the number of driver
    # trips -- confirm against the caller.
    probs_array = model.predict_proba(feature_columns[:200])
    probs_df = pd.DataFrame(probs_array)

    df_submission['prob'] = np.array(probs_df.iloc[:, 1])

    return df_submission
def learn(f):
    """Cross-validate and train an ExtraTrees classifier on labelled EXECUTE rows.

    Args:
        f: list of feature column names taken from the global ``raw_data``.

    Prints the 10-fold cross-validated accuracy, the accuracy on a held-out
    split, and the 20 most important features.

    Returns:
        The classifier fitted on the training part of the split.
    """
    global raw_data
    print('testing classifier')
    data = raw_data[raw_data['label'] != 'unknown']
    data = data[data['file type'] == 'EXECUTE']
    # NOTE(review): ``as_matrix`` is gone from recent pandas; ``data[f].values``
    # is presumably the equivalent -- confirm.
    X = data.as_matrix(f)
    y = np.array(data['label'].tolist())

    clf = ExtraTreesClassifier(n_estimators=100)
    scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10)
    print("predicted accuracy: %0.2f (+/- %0.2f)"
          % (scores.mean(), scores.std() * 2))

    seed = 3301
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=seed)
    clf.fit(X_train, y_train)
    scores = clf.score(X_test, y_test)
    print("actual accuracy: %0.2f" % scores)

    # sorted() instead of zip(...).sort(): zip is an iterator on Python 3.
    importances = sorted(zip(f, clf.feature_importances_),
                         key=lambda k: k[1], reverse=True)
    for im in importances[0:20]:
        print(im[0].ljust(30), im[1])

    return clf
def doTreeFeatureSelection(estimator, X, y):
    """Fit a default ExtraTreesClassifier on (X, y) and wrap it in a prefit SelectFromModel.

    ``estimator`` is accepted for interface compatibility but is not used.

    Returns:
        A ``SelectFromModel`` built from the fitted forest.
    """
    clf = ExtraTreesClassifier()
    clf = clf.fit(X, y)
    model = SelectFromModel(clf, prefit=True)
    return model
def ET_classif(features_df=None, labels_df=None):
    '''Scoring function to be used in SelectKBest feature selection class object.

    This scoring function assigns variable importances to the features passed
    in to it using the ExtraTreesClassifier. It then returns the importances
    as two identical arrays mimicking the scores and p-values arrays
    required by SelectKBest to pick the top K features.

    Args:
        features_df: Pandas dataframe of features to be used to predict
            using the ExtraTreesClassifier.
        labels_df: Pandas dataframe of the labels being predicted.

    Returns:
        Two identical arrays containing the feature importance scores
        returned for each feature by the ExtraTreesClassifier.
    '''
    reducer = ExtraTreesClassifier(n_estimators=500, bootstrap=False,
                                   oob_score=False, max_features=.10,
                                   min_samples_split=10, min_samples_leaf=2,
                                   criterion='gini', random_state=42)
    reducer.fit(features_df, labels_df)
    return reducer.feature_importances_, reducer.feature_importances_
def feature_engineering_common(Y, X, X1):
    """Scale features, run ExtraTrees-based feature selection, and print a report.

    The scaler is fitted on ``X`` and applied to both ``X`` and ``X1``. A
    default ExtraTreesClassifier is fitted on the scaled training set and
    used to reduce both sets. Only shapes, importances and the (up to) ten
    top-ranked features are printed; nothing is returned.
    """
    print("### Shape of training set (X)", X.shape)
    print("### Shape of labels (Y)", Y.shape)
    print("### Shape of Kaggle Test set (X1)", X1.shape)

    # Scale features
    scaler = preprocessing.StandardScaler()
    X_SCALED = scaler.fit_transform(X)
    X1_SCALED = scaler.transform(X1)
    print("### (After scaling) Shape of training set", X_SCALED.shape)
    print("### (After scaling ) Shape of Kaggle Test set", X1_SCALED.shape)

    # Find important features using the extra-trees forest.
    xtClf = ExtraTreesClassifier().fit(X_SCALED, Y)
    # NOTE(review): estimator.transform() only exists in old scikit-learn;
    # newer versions use SelectFromModel(xtClf, prefit=True) -- confirm the
    # targeted version.
    X_SCALED_SUBSET = xtClf.transform(X_SCALED)
    X1_SCALED_SUBSET = xtClf.transform(X1_SCALED)

    importances = xtClf.feature_importances_
    print(xtClf.feature_importances_)
    print("### (After scaling & feature selection using Random Forrest) Shape of training set", X_SCALED_SUBSET.shape)
    print("### (After scaling & feature selection using Random Forrest) Shape of Kaggle Test set", X1_SCALED_SUBSET.shape)

    indices = np.argsort(importances)[::-1]

    # Print the feature ranking (capped so fewer than 10 features cannot
    # raise IndexError).
    print("Feature ranking:")
    for f in range(min(10, len(indices))):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
def train_random_forest(X_train, y_train, **kwargs):
    """Train an ExtraTreesClassifier, optionally tuned with a grid search.

    Keyword Args:
        n_estimators: number of trees (default 300).
        max_features: forwarded to the classifier (default 'auto').
        n_jobs: parallel jobs for classifier and grid search (default -1).
        verbose: verbosity level (default 0).
        tuned_params: parameter grid for ``GridSearchCV``; when None the
            baseline classifier is fitted directly.

    Returns:
        The fitted classifier (the grid search's best estimator when tuning).
    """
    from sklearn.ensemble import ExtraTreesClassifier

    n_estimators = kwargs.pop('n_estimators', 300)
    max_features = kwargs.pop('max_features', 'auto')
    n_jobs = kwargs.pop('n_jobs', -1)
    verbose = kwargs.pop('verbose', 0)
    tuned_params = kwargs.pop('tuned_params', None)

    # initialize baseline classifier
    clf = ExtraTreesClassifier(n_estimators=n_estimators, random_state=42,
                               n_jobs=n_jobs, verbose=verbose,
                               criterion='gini', max_features=max_features,
                               oob_score=True, bootstrap=True)

    if tuned_params is not None:
        # optimize if desired: 5-fold grid search scored by ROC AUC
        from sklearn.grid_search import GridSearchCV
        cv = GridSearchCV(clf, tuned_params, cv=5, scoring='roc_auc',
                          n_jobs=n_jobs, verbose=verbose, refit=True)
        cv.fit(X_train, y_train)
        clf = cv.best_estimator_
    else:
        # otherwise train with the specified parameters (no tuning)
        clf.fit(X_train, y_train)

    return clf
def feature_important(filename):
    """Print and plot ExtraTrees feature importances for the records in a CSV file.

    Each record returned by ``read_csv(filename)`` contributes its
    ``decisions`` as a feature row and its ``objective`` as the label.
    """
    from sklearn.ensemble import ExtraTreesClassifier

    content = read_csv(filename)
    X = [c.decisions for c in content]
    y = [c.objective for c in content]

    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    # Spread of each feature's importance across the individual trees,
    # drawn as error bars below.
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]
    n_features = len(X[0])

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(n_features):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(n_features), indices)
    plt.xlim([-1, n_features])
def tree_based_selection(self, data_set, data_target, feature_names):
    """Select features with a prefit ExtraTrees ``SelectFromModel``.

    :param data_set: 2-D feature array (indexed as ``data_set[:, col]``).
    :param data_target: labels used to fit the forest.
    :param feature_names: names indexable by column position.
    :return: ``(feature_set, fea_index)`` -- the reduced array and the
        original column indices whose values equal a selected column.
    """
    clf = ExtraTreesClassifier()
    clf = clf.fit(data_set, data_target)
    print(clf.feature_importances_)
    model = SelectFromModel(clf, prefit=True)
    feature_set = model.transform(data_set)

    # Recover the original column indices by comparing column contents;
    # identical columns will each be reported.
    fea_index = []
    for A_col in range(data_set.shape[1]):
        for B_col in range(feature_set.shape[1]):
            if (data_set[:, A_col] == feature_set[:, B_col]).all():
                fea_index.append(A_col)

    # Debug output: selected feature name -> its value in the first row.
    check = {}
    for i in fea_index:
        check[feature_names[i]] = data_set[0][i]
    print(np.array(check))
    return feature_set, fea_index
def top_importances(features_df=None, labels_df=None, top_N=10):
    ''' Finds the top N importances using the ExtraTreesClassifier.

    Finds the top N importances of a dataframe of features and a dataframe
    of labels using the ExtraTreesClassifier.

    Args:
        features_df: Pandas dataframe of features used to predict.
        labels_df: Pandas dataframe of labels to be predicted.
        top_N: integer number of the most important features to return.

    Returns:
        Pandas dataframe containing the top N importances and their
        importance scores, indexed by feature name.
    '''
    reducer = ExtraTreesClassifier(n_estimators=2000, bootstrap=False,
                                   oob_score=False, max_features=.10,
                                   min_samples_split=10, min_samples_leaf=2,
                                   criterion='gini')
    reducer.fit(features_df, labels_df)

    scores = pd.DataFrame(reducer.feature_importances_,
                          index=features_df.columns)
    scores.columns = ['Importances']
    # sort_values replaces the removed DataFrame.sort and matches the rest
    # of this file.
    scores = scores.sort_values(by='Importances', ascending=False)
    return scores[0:top_N]
def crossVal(positions, X, y, missedYFile):
    """Run 4 stratified shuffle splits and log false positives to a file.

    For every split a fresh ExtraTreesClassifier is fitted and a
    classification report is printed. Each test sample whose true label is
    '0' but was predicted '1' is written to ``missedYFile`` as
    "<position>\\t<round>".
    """
    posArray = np.array(positions)
    # Split into training and test
    sss = StratifiedShuffleSplit(y, 4, test_size=0.1, random_state=442)
    # ``with`` guarantees the file is closed even if a fold raises.
    with open(missedYFile, 'w') as outF:
        for cvRound, (train_index, test_index) in enumerate(sss):
            # NOTE(review): recent scikit-learn rejects min_samples_split=1
            # -- confirm the targeted version.
            clf = ExtraTreesClassifier(n_estimators=300,
                                       random_state=13,
                                       bootstrap=True,
                                       max_features=20,
                                       min_samples_split=1,
                                       max_depth=8,
                                       min_samples_leaf=13,
                                       n_jobs=4)
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            pos_test = posArray[test_index]

            clf = clf.fit(X_train, y_train)
            preds = clf.predict(X_test)
            print(metrics.classification_report(y_test, preds))

            for loc, t, p in zip(pos_test, y_test, preds):
                if t == '0' and p == '1':
                    outF.write(loc + '\t' + str(cvRound) + '\n')
class FeaturesSelectionRandomForests(object):
    """Keep the features whose ExtraTrees importance exceeds a threshold."""

    def __init__(self, n_estimators=100, feature_importance_th=0.005):
        """Store the forest size and the importance threshold."""
        self.n_estimators = n_estimators
        self.feature_importance_th = feature_importance_th

    def fit(self, X, y, n_estimators=None, feature_importance_th=None):
        """Fit the forest and compute the mask of retained feature indices.

        Args:
            X, y: training features and labels.
            n_estimators: optional override of the stored forest size.
            feature_importance_th: optional override of the stored threshold.

        Returns:
            self, so calls can be chained.
        """
        if n_estimators is not None:
            assert isinstance(n_estimators, (int, float))
            self.n_estimators = n_estimators
        if feature_importance_th is not None:
            assert isinstance(feature_importance_th, (int, float))
            self.feature_importance_th = feature_importance_th

        # Filter features by forest model. The configured values are used
        # here; the original hard-coded 100 and 0.005 and ignored them.
        # NOTE(review): ``compute_importances`` only exists in old
        # scikit-learn releases -- confirm the targeted version.
        self.trees = ExtraTreesClassifier(n_estimators=self.n_estimators,
                                          compute_importances=True)
        self.trees.fit(X, y)
        self.features_mask = np.where(
            self.trees.feature_importances_ > self.feature_importance_th)[0]
        return self

    def plot_features_importance(self):
        """Bar-plot the fitted forest's feature importances."""
        pd.DataFrame(self.trees.feature_importances_).plot(kind='bar')

    def transform(self, X):
        """Return the columns of ``X`` selected by the last ``fit``."""
        assert hasattr(self, "features_mask")
        return X[:, self.features_mask]
def tree_based_feature_selection(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Rank features with an ExtraTrees forest and return ``x`` with columns in rank order.

    Prints the importances and a ranking that looks names up in
    ``self.features``. ``n`` is the number of entries in ``self.features``,
    so the final slice keeps every ranked column; lowering ``n`` would
    truncate the selection.
    """
    n = len(self.features)
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(x, y)
    importances = forest.feature_importances_
    print(importances)
    indices = np.argsort(importances)[::-1]

    print("Feature ranking:")
    for f in range(n):
        print("%d. feature %d: %s (%f)"
              % (f + 1, indices[f], self.features[indices[f]], importances[indices[f]]))

    selected = indices[0:n + 1]
    print(selected)
    # Fancy indexing: assumes self.features is a numpy array -- TODO confirm.
    print(self.features[selected])
    new_x = x[:, selected]
    return new_x
def kfold_cv(X_train, y_train, idx, k):
    """Evaluate an ExtraTrees model on the first fold of a stratified k-fold split.

    Only the first fold is processed (the loop breaks after it).

    Returns:
        ``(ypred, yreal, idx)`` -- predicted probabilities (second
        ``predict_proba`` column), true labels, and the entries of ``idx``
        belonging to that fold's test part.
    """
    kf = StratifiedKFold(y_train, n_folds=k)
    xx = []
    count = 0
    for train_index, test_index in kf:
        count += 1
        X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        y_pred = np.zeros(X_test_cv.shape[0])

        # xgboost bagging is disabled while m == 0; the loop body never runs.
        m = 0
        for j in range(m):
            clf = xgb_classifier(eta=0.1, min_child_weight=20, col=0.5,
                                 subsample=0.7, depth=5, num_round=200,
                                 seed=j * 77, gamma=0.1)
            y_pred += clf.train_predict(X_train_cv, (y_train_cv), X_test_cv,
                                        y_test=(y_test_cv))

        clf = ExtraTreesClassifier(n_estimators=700, max_features=50,
                                   criterion='entropy', min_samples_split=3,
                                   max_depth=60, min_samples_leaf=4,
                                   verbose=1, n_jobs=-1)
        clf.fit(X_train_cv, (y_train_cv))
        y_pred = clf.predict_proba(X_test_cv).T[1]
        print(y_pred.shape)
        xx.append(llfun(y_test_cv, (y_pred)))
        ypred = y_pred
        yreal = y_test_cv
        idx = idx[test_index]
        print(xx[-1])
        break

    print(xx, 'average:', np.mean(xx), 'std', np.std(xx))
    return ypred, yreal, idx
def plotImportance(X, y):
    """Plot ExtraTrees feature importances with error bars and save to 'featuresel.pdf'.

    ``X`` must expose ``.columns`` (used for the tick labels) and
    ``.shape``.
    """
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(X, y)
    importances = forest.feature_importances_
    # Per-feature spread of importance across the individual trees.
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]
    n = X.shape[1]

    # Plot the feature importances of the forest
    plt.figure(figsize=(20, 15))
    plt.title("Feature importances")
    plt.bar(range(n), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(n), X.columns[indices], rotation=90)
    plt.xlim([-1, n])
    plt.savefig('featuresel.pdf')
def train_classifiers(X_data, y_data):
    """Fit seven classifiers on the same data and return them.

    The number in each comment is the score recorded by the original author.

    Returns:
        Tuple ``(clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf,
        clf_GBC)`` of fitted estimators.
    """
    # Linear SVM: 0.908
    clf_LSVM = svm.SVC(kernel='linear')
    clf_LSVM.fit(X_data, y_data)

    # MultinomialNB: 0.875
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)

    # Random Forest: 0.910
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)

    # Extra Tree: 0.915
    # NOTE(review): recent scikit-learn rejects min_samples_split=1 --
    # confirm the targeted version.
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None,
                                   min_samples_split=1, random_state=0)
    clf_ETC.fit(X_data, y_data)

    # AdaBoost: 0.88
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)

    # rbf SVM: 0.895
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)

    # GradientBoosting: 0.88
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)

    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC
def get_important_features(Xtrain, Ytrain, n=250, threshold=0.01, verbose=False):
    """ Use entirety of provided X, Y to train an extra-trees ensemble

    Arguments
    Xtrain -- Training data
    Ytrain -- Training prediction

    Optional Arguments
    n -- number of ensemble members
    threshold -- threshold of importance above which a feature is relevant
    verbose -- if true, prints results of ranking

    Returns
    ranking -- list of [feature index, importance] pairs, ordered by
               decreasing importance, for features at or above the threshold
    """
    # Train and fit tree classifier ensemble
    classifier = ExtraTreesClassifier(n_estimators=n, random_state=0)
    classifier.fit(Xtrain, Ytrain)

    # Compute important features
    importances = classifier.feature_importances_
    indices = np.argsort(importances)[::-1]
    # A real list (not a lazy filter object) so len()/indexing work on
    # Python 3 and callers get the documented list.
    ranking = [[indices[f], importances[indices[f]]]
               for f in range(Xtrain.shape[1])
               if importances[indices[f]] >= threshold]

    if verbose:
        for position, (feature, importance) in enumerate(ranking, start=1):
            print(str(position) + ". ", feature, importance)

    return ranking
def get_most_important_features(train):
    """Plot the 20 most important features according to three tree ensembles.

    ``train`` must contain an 'ID' column (dropped) and a 'TARGET' column
    (the label). One horizontal bar chart is drawn per model: random forest,
    gradient boosting and extra trees.
    """
    train = train.drop('ID', 1)
    train_y = train['TARGET']
    train_X = train.drop('TARGET', 1)

    random_forest = RandomForestClassifier(n_estimators=100)
    random_forest.fit(train_X, train_y)
    feater_importance = pd.Series(random_forest.feature_importances_,
                                  index=train_X.columns)
    # Ascending sort, so tail(20) holds the 20 largest importances.
    feater_importance.sort_values(inplace=True)
    feater_importance.tail(20).plot(kind='barh', figsize=(15, 7),
                                    title='Feature importance by random forest')

    grad_boosting = GradientBoostingClassifier()
    grad_boosting.fit(train_X, train_y)
    feater_importance = pd.Series(grad_boosting.feature_importances_,
                                  index=train_X.columns)
    feater_importance.sort_values(inplace=True)
    feater_importance.tail(20).plot(kind='barh', figsize=(10, 7),
                                    title='Feature importance by gradient boosting')

    extra_trees = ExtraTreesClassifier()
    extra_trees.fit(train_X, train_y)
    feater_importance = pd.Series(extra_trees.feature_importances_,
                                  index=train_X.columns)
    feater_importance.sort_values(inplace=True)
    feater_importance.tail(20).plot(kind='barh', figsize=(20, 7),
                                    title='Feature importance by extra trees classifier')
def remove_feature_tree_based(train_X, train_Y):
    '''
    Removes features based on trees - see sklearn:
    Actually removes based on "importance"

    Fits a 1000-tree extra-trees forest, prints the full feature ranking and
    returns ``(forest, x_new)`` where ``x_new`` is ``train_X`` reduced by
    the forest's own ``transform``.
    '''
    # NOTE(review): ``compute_importances`` and ``forest.transform`` only
    # exist in old scikit-learn releases -- confirm the targeted version.
    forest = ExtraTreesClassifier(n_estimators=1000,
                                  compute_importances=True,
                                  random_state=0)
    forest.fit(train_X, train_Y)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    x_labels = ['rc1', 'rc2', 'dca1', 'dca2', 'dcm1', 'dcm2', 'ace1', 'ace2',
                'acsc1', 'acsc2', 'acsv1', 'acsv2', 'acss1', 'acss2',
                'acsk1', 'acsk2', 'taca1', 'taca2', 'tdc1', 'tdc2',
                'gmin', 'gmean', 'trd',
                'ep111', 'ep112', 'ep211', 'ep212', 'ep311', 'ep312',
                'ep411', 'ep412', 'ep511', 'ep512', 'ep611', 'ep612',
                'ep121', 'ep122', 'ep221', 'ep222', 'ep321', 'ep322',
                'ep421', 'ep422', 'ep521', 'ep522', 'ep621', 'ep622']

    # Print the feature ranking. Iterate over the real number of features
    # rather than a fixed 46: the label list has 47 entries, and a fixed
    # bound either drops the last feature or raises on smaller inputs.
    print("Feature ranking:")
    for f, feature in enumerate(indices):
        # Fall back to the numeric index if there are more features than labels.
        label = x_labels[feature] if feature < len(x_labels) else str(feature)
        print("%d. feature %s (%f)" % (f + 1, label, importances[feature]))

    # Transform the data to have only the features that are important
    x_new = forest.transform(train_X)

    return (forest, x_new)
def FeaturesImportance(trainData, trainLabels):
    """Print the ranking and plot the importances of the first 16 features.

    The ranking is printed in decreasing-importance order, while the bar
    chart shows features 0..15 in their original order (x_1 .. x_16).

    Returns:
        The forest's ``feature_importances_`` array.
    """
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(trainData, trainLabels)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(16):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(16), importances[range(16)], color="r", align="center")
    plt.xticks(range(16),
               [r'$x_1$', r'$x_2$', r'$x_3$', r'$x_4$', r'$x_5$', r'$x_6$',
                r'$x_7$', r'$x_8$', r'$x_9$', r'$x_{10}$', r'$x_{11}$',
                r'$x_{12}$', r'$x_{13}$', r'$x_{14}$', r'$x_{15}$',
                r'$x_{16}$'])
    plt.yticks([0.0, 0.05, 0.10, 0.15, 0.20, 0.25],
               [r'$0.00$', r'$0.05$', r'$0.10$', r'$0.15$', r'$0.20$',
                r'$0.25$'])
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.xlim([-1, 16])
    return importances
def reduceRF(label):
    """Keep the most important features until their cumulative weight exceeds ``RFThreshold``.

    Fits a default ExtraTreesClassifier against column ``label`` of the
    global ``y_data`` and updates the globals ``importantFeatureLocs``
    (selected column indices) and ``x_data_rf_reduced`` (the matching
    columns of ``x_data``).
    """
    global x_data_rf_reduced, importantFeatureLocs

    # NOTE(review): the training matrix is taken to be the global x_data,
    # the same array that is reduced below -- confirm.
    model = ExtraTreesClassifier()
    model.fit(x_data, y_data[:, label])

    # the relative importance of each attribute
    importance = model.feature_importances_

    weight = float(0)
    del importantFeatureLocs[:]  # reset

    # Walk importances from largest to smallest, accumulating weight.
    for ele in np.sort(importance)[::-1]:
        weight += float(ele)
        # Tied importances match several columns and are added repeatedly;
        # the set() below removes those duplicates.
        featureIndex = np.where(importance == ele)
        for loc in featureIndex[0]:
            importantFeatureLocs.append(loc)

        if weight > RFThreshold:
            break

    # remove duplications
    importantFeatureLocs = list(set(importantFeatureLocs))

    # extracting relevant columns from input data. Note that
    # importantFeatureLocs may be unsorted (since python 'set' is unsorted),
    # so features are extracted in unordered fashion.
    x_data_rf_reduced = x_data[:, importantFeatureLocs]
def fit(self, X, Y, sample_weight=None):
    """Fit an ExtraTrees estimator and store a prefit ``SelectFromModel`` on it.

    ``max_features`` is derived from ``self.max_features`` scaled by
    ``log(n_features) + 1`` and clamped to between 1 and half the number of
    features. The selector (threshold "mean") is stored in
    ``self.preprocessor``.

    Returns:
        self
    """
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel

    num_features = X.shape[1]
    max_features = int(
        float(self.max_features) * (np.log(num_features) + 1))
    # Use at most half of the features
    max_features = max(1, min(int(X.shape[1] / 2), max_features))

    estimator = ExtraTreesClassifier(
        n_estimators=self.n_estimators,
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        bootstrap=self.bootstrap,
        max_features=max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        random_state=self.random_state,
        class_weight=self.class_weight,
    )

    estimator.fit(X, Y, sample_weight=sample_weight)
    self.preprocessor = SelectFromModel(estimator=estimator,
                                        threshold="mean",
                                        prefit=True)

    return self
def extratreeclassifier(input_file, Output, test_size):
    """Train/test an ExtraTrees classifier on a CSV file and write a metrics report.

    The first CSV column is the label and the remaining loaded columns are
    features (the file's last column is not loaded). Metrics are printed and
    written to "<Output>_Extremely_Random_Forest_metrics_test.txt", followed
    by one line per test sample; a confusion-matrix image is saved via
    ``plot_confusion_matrix``.
    """
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size)
    print(X_train.shape, X_test.shape)

    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Compute each metric once and reuse it for console and file output.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)

    print("Extremely Randomized Trees")
    print("classification accuracy:", accuracy)
    print("precision:", precision)
    print("recall:", recall)
    print("f1 score:", f1)
    print("\n")

    results = Output + "_Extremely_Random_Forest_metrics_test.txt"
    # ``with`` closes the report even if a write fails; the name no longer
    # shadows the ``file`` builtin.
    with open(results, "w") as report:
        report.write("Extremely Random Forest Classifier estimator accuracy\n")
        report.write("Classification Accuracy Score: %f\n" % accuracy)
        report.write("Precision Score: %f\n" % precision)
        report.write("Recall Score: %f\n" % recall)
        report.write("F1 Score: %f\n" % f1)
        report.write("\n")
        report.write("True Value, Predicted Value, Iteration\n")
        for n in range(len(y_test)):
            report.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n + 1)))

    title = "Extremely Randomized Trees %f" % test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
def _cascade_layer(self, X, y=None, layer=0):
    """Train (``y`` given) or apply (``y`` is None) one cascade layer.

    Training fits a random forest and an extra-trees forest
    ``n_cascadeRF`` times, stores them on ``self`` under
    "_casprf<n_layer>_<i>" / "_cascrf<n_layer>_<i>", and collects the sum
    of their out-of-bag decision functions. Prediction loads the forests
    stored for ``layer`` and collects the sum of their ``predict_proba``.

    Returns:
        List with one summed probability array per forest pair.
    """
    n_cascadeRF = getattr(self, 'n_cascadeRF')

    prf = RandomForestClassifier(
        n_estimators=100, max_features=8,
        bootstrap=True, criterion="entropy", min_samples_split=20,
        max_depth=None, class_weight='balanced', oob_score=True)
    crf = ExtraTreesClassifier(
        n_estimators=100, max_depth=None,
        bootstrap=True, oob_score=True)

    prf_pred = []
    if y is not None:
        for irf in range(n_cascadeRF):
            prf.fit(X, y)
            crf.fit(X, y)
            # NOTE(review): the same two estimator objects are refitted and
            # stored on every iteration, so all stored names end up
            # referring to the last fit -- confirm this is intended.
            setattr(self, '_casprf{}_{}'.format(self.n_layer, irf), prf)
            setattr(self, '_cascrf{}_{}'.format(self.n_layer, irf), crf)
            # Out-of-place sum: ``+=`` would overwrite the forest's own
            # oob_decision_function_ attribute.
            probas = prf.oob_decision_function_ + crf.oob_decision_function_
            prf_pred.append(probas)
    else:
        for irf in range(n_cascadeRF):
            prf = getattr(self, '_casprf{}_{}'.format(layer, irf))
            crf = getattr(self, '_cascrf{}_{}'.format(layer, irf))
            probas = prf.predict_proba(X)
            probas += crf.predict_proba(X)
            prf_pred.append(probas)

    return prf_pred
class MyExtraTree(MyClassifier):
    """Thin wrapper around ``ExtraTreesClassifier`` with importance reporting helpers."""

    def __init__(self, params=None):
        """Build the classifier from ``params`` (keyword arguments dict)."""
        # A fresh dict per instance: a shared ``dict()`` default would be
        # mutated by update_params() and leak settings across instances.
        self._params = {} if params is None else params
        self._extree = ExtraTreesClassifier(**(self._params))

    def update_params(self, updates):
        """Merge ``updates`` into the parameters and rebuild the (unfitted) classifier."""
        self._params.update(updates)
        self._extree = ExtraTreesClassifier(**(self._params))

    def fit(self, Xtrain, ytrain):
        """Fit the wrapped classifier."""
        self._extree.fit(Xtrain, ytrain)

    def predict_proba(self, Xtest, option=None):
        """Return the probability of the second class for each sample."""
        return self._extree.predict_proba(Xtest)[:, 1]

    def predict_proba_multi(self, Xtest, option=None):
        """Return the full class-probability matrix."""
        return self._extree.predict_proba(Xtest)

    def plt_feature_importance(self, fname_list, f_range=None):
        """Draw a horizontal bar chart of ranked importances with per-tree std error bars.

        ``f_range`` selects rank positions; all ranks are shown when empty.
        """
        importances = self._extree.feature_importances_
        std = np.std([tree.feature_importances_
                      for tree in self._extree.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]
        fname_array = np.array(fname_list)
        if not f_range:
            f_range = range(indices.shape[0])
        n_f = len(f_range)
        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[indices[f_range]],
                 color="b", xerr=std[indices[f_range]], ecolor='k',
                 align="center")
        plt.yticks(range(n_f), fname_array[indices[f_range]])
        plt.ylim([-1, n_f])

    def list_feature_importance(self, fname_list, f_range=None, return_list=False):
        """Print the ranked importances; optionally return the ranked feature indices.

        ``f_range`` selects rank positions; all ranks are listed when empty.
        """
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]
        print('Extra tree feature ranking:')

        if not f_range:
            f_range = range(indices.shape[0])

        n_f = len(f_range)
        for i in range(n_f):
            f = f_range[i]
            print('{0:d}. feature[{1:d}]  {2:s}  ({3:f})'.format(
                f + 1, indices[f], fname_list[indices[f]],
                importances[indices[f]]))

        if return_list:
            return [indices[f_range[i]] for i in range(n_f)]
def select_with_forest(X, y, n_trees=10, treshold=0.01):
    """Rank the columns of ``X`` by ExtraTrees importance after label-encoding them.

    Every column of a copy of ``X`` is label-encoded, a forest with
    ``n_trees`` trees (entropy criterion) is fitted, and the number of
    features with importance >= ``treshold`` is printed.

    Returns:
        DataFrame with 'predictor name' and 'importance', sorted by
        decreasing importance.
    """
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import ExtraTreesClassifier
    import pandas as pd
    import numpy as np

    # encode labels (str -> int); work on a copy so the caller's frame is
    # left untouched.
    le = LabelEncoder()
    X = X.copy()
    for col in X.columns:
        le.fit(X[col].unique())
        X[col] = le.transform(X[col])

    # train the classifier:
    forest = ExtraTreesClassifier(criterion="entropy", n_estimators=n_trees)
    forest.fit(X, y)
    print('number of selected features: ',
          np.sum(forest.feature_importances_ >= treshold))

    # select important features:
    importances = pd.DataFrame()
    importances['predictor name'] = X.columns.tolist()
    importances['importance'] = forest.feature_importances_
    importances = importances.sort_values(by='importance', ascending=False)

    return importances
def algo_fit_cross_validated(training_matrix, target):
    """Print an ExtraTrees feature ranking, cross-validate a random forest, and return it fitted.

    ``training_matrix`` must expose ``.columns`` for the feature names.

    Returns:
        The ``RandomForestClassifier`` fitted on the full training matrix.
    """
    # Build a forest and compute the feature importances
    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(training_matrix, target)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    l = list(training_matrix.columns.values)
    for f in range(training_matrix.shape[1]):
        print("%d. feature %d(%s) (%f)"
              % (f + 1, indices[f], l[indices[f]], importances[indices[f]]))

    # Random Forest: cross-validate first, then fit on all of the data.
    rf = RandomForestClassifier(n_estimators=1500, max_depth=2,
                                max_features=4)
    scores_rf = cross_validation.cross_val_score(rf, training_matrix, target,
                                                 cv=5)
    print("(Random Forest) Accuracy: %0.5f (+/- %0.2f)"
          % (scores_rf.mean(), scores_rf.std() * 2))
    rf.fit(training_matrix, target)

    return rf
test_positions = np.take(selected_positions, test_index, axis=0) img_train, img_test = input.train_test_images( train_positions, test_positions) save_rgb(fold_dir + "train.png", img_train, format='png') save_rgb(fold_dir + "test.png", img_test, format='png') if rotation_oversampling: X_train, y_train = input.rotation_oversampling( X_train, y_train) X_train = X_train.reshape(len(X_train), -1) X_test = X_test.reshape(len(X_test), -1) if feature_selection: fs = ExtraTreesClassifier(n_estimators=200) fs =, y_train) model = SelectFromModel(fs, prefit=True) X_train, X_test = model.transform(X_train), model.transform( X_test) print(X_train.shape) else: model = None print("Size training set", len(X_train)) print("Size test set", len(X_test)) if fold_num == 1: file.write("Size training set: %d\n" % len(X_train)) file.write("Size test set: %d\n" % len(X_test)) file.write("Class distribution:\n")
def classify(algorithm, fname, input_data, label_name, n_cores, random_state):
    """Run 10-fold cross-validation for one classifier and report pooled metrics.

    The out-of-fold predictions of all folds are pooled, scored once, and
    written to ``cv_<algorithm>_predited_data.csv`` inside the module-level
    ``output_result_dir``.

    Args:
        algorithm: one of ``'GB'``, ``'RF'``, ``'M5P'``, ``'KNN'``,
            ``'NEURAL'``.
        fname: not used by this function.
        input_data: DataFrame holding an ``'ID'`` column, the label column
            and the feature columns.
        label_name: name of the label column in ``input_data``.
        n_cores: ``n_jobs`` value for the estimators that accept it.
        random_state: seed for the shuffled ``KFold`` split.

    Returns:
        One-row DataFrame with the columns ``type, matthew, f1, bacc,
        roc_auc, TP, TN, FP, FN``.

    Raises:
        ValueError: if ``algorithm`` is not one of the supported codes.
    """
    train_y = np.array(input_data[label_name])
    input_data = input_data.drop('ID', axis=1)
    training_x = input_data.drop(label_name, axis=1)

    le = preprocessing.LabelEncoder()
    le.fit(train_y)
    train_y = le.transform(train_y)

    cv_metrics = pd.DataFrame()

    # 10-fold cross validation
    predicted_n_actual_pd = pd.DataFrame(
        columns=['ID', 'predicted', 'actual', 'fold'])
    kf = KFold(n_splits=10, shuffle=True, random_state=random_state)

    # number of train and test instances is based on training_x.
    for fold, (train, test) in enumerate(kf.split(training_x), start=1):
        train_cv_features = training_x.iloc[train]
        test_cv_features = training_x.iloc[test]
        train_cv_label = train_y[train]
        test_cv_label = train_y[test]

        if algorithm == 'GB':
            temp_classifier = GradientBoostingClassifier(n_estimators=300,
                                                         random_state=1)
        elif algorithm == 'RF':
            temp_classifier = RandomForestClassifier(n_estimators=300,
                                                     random_state=1,
                                                     n_jobs=n_cores)
        elif algorithm == 'M5P':
            # The 'M5P' code is served by an extra-trees ensemble.
            temp_classifier = ExtraTreesClassifier(n_estimators=300,
                                                   random_state=1,
                                                   n_jobs=n_cores)
        elif algorithm == 'KNN':
            temp_classifier = KNeighborsClassifier(n_neighbors=3,
                                                   n_jobs=n_cores)
        elif algorithm == 'NEURAL':
            temp_classifier = MLPClassifier(random_state=1)
        else:
            raise ValueError('unknown algorithm: {!r}'.format(algorithm))

        temp_classifier.fit(train_cv_features, train_cv_label)
        temp_prediction = temp_classifier.predict(test_cv_features)

        # 'ID' holds the positional row index of each test instance.
        fold_frame = pd.DataFrame({
            'ID': test,
            'actual': test_cv_label,
            'predicted': temp_prediction,
            'fold': fold
        })
        # pd.concat is what DataFrame.append delegated to; append itself no
        # longer exists in pandas >= 2.0.
        predicted_n_actual_pd = pd.concat([predicted_n_actual_pd, fold_frame],
                                          ignore_index=True, sort=True)

    actual = predicted_n_actual_pd['actual'].to_list()
    predicted = predicted_n_actual_pd['predicted'].to_list()

    try:
        roc_auc = round(roc_auc_score(actual, predicted), 3)
    except ValueError:
        # Same fallback as before when roc_auc_score rejects the labels.
        roc_auc = 0.0
    matthews = round(matthews_corrcoef(actual, predicted), 3)
    balanced_accuracy = round(balanced_accuracy_score(actual, predicted), 3)
    f1 = round(f1_score(actual, predicted), 3)
    try:
        tn, fp, fn, tp = confusion_matrix(actual, predicted).ravel()
    except ValueError:
        # The matrix is not 2x2 (or the inputs were rejected): report zeros.
        tn, fp, fn, tp = 0, 0, 0, 0

    metrics_row = pd.DataFrame(
        np.column_stack(['cv', roc_auc, matthews, balanced_accuracy, f1,
                         tn, fp, fn, tp]),
        columns=['type', 'roc_auc', 'matthew', 'bacc', 'f1',
                 'TN', 'FP', 'FN', 'TP'])
    cv_metrics = pd.concat([cv_metrics, metrics_row],
                           ignore_index=True, sort=True)
    cv_metrics = cv_metrics.round(3)
    cv_metrics = cv_metrics.astype({
        'TP': 'int64',
        'TN': 'int64',
        'FP': 'int64',
        'FN': 'int64'
    })
    cv_metrics = cv_metrics[[
        'type', 'matthew', 'f1', 'bacc', 'roc_auc', 'TP', 'TN', 'FP', 'FN'
    ]]

    # Map the encoded classes back to the original labels before saving.
    predicted_n_actual_pd['predicted'] = le.inverse_transform(
        predicted_n_actual_pd['predicted'].to_list())
    predicted_n_actual_pd['actual'] = le.inverse_transform(
        predicted_n_actual_pd['actual'].to_list())

    fname_predicted_n_actual_pd = os.path.join(
        output_result_dir, 'cv_{}_predited_data.csv'.format(algorithm))
    # Row positions are written 1-based.
    predicted_n_actual_pd['ID'] = predicted_n_actual_pd['ID'] + 1
    predicted_n_actual_pd = predicted_n_actual_pd.sort_values(by=['ID'])
    predicted_n_actual_pd.to_csv(fname_predicted_n_actual_pd, index=False)
    return cv_metrics
def test_importances_asymptotic():
    # Check whether variable importances of totally randomized trees
    # converge towards their theoretical values (See Louppe et al,
    # Understanding variable importances in forests of randomized trees, 2013).

    def binomial(k, n):
        """Return C(n, k), or 0 when k lies outside [0, n]."""
        return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True)

    def entropy(samples):
        """Return the Shannon entropy (base 2) of an integer label array."""
        n_samples = len(samples)
        entropy = 0.

        for count in np.bincount(samples):
            p = 1. * count / n_samples
            if p > 0:
                entropy -= p * np.log2(p)

        return entropy

    def mdi_importance(X_m, X, y):
        """Return the importance of feature ``X_m`` by exhaustive enumeration.

        Sums, over every subset B of the other features and every value
        assignment B=b, the weighted entropy reduction obtained by splitting
        on ``X_m``.
        """
        n_samples, n_features = X.shape

        features = list(range(n_features))
        features.pop(X_m)
        values = [np.unique(X[:, i]) for i in range(n_features)]

        imp = 0.

        for k in range(n_features):
            # Weight of each B of size k
            coef = 1. / (binomial(k, n_features) * (n_features - k))

            # For all B of size k
            for B in combinations(features, k):
                # For all values B=b
                for b in product(*[values[B[j]] for j in range(k)]):
                    # The builtin bool replaces the np.bool alias, which
                    # NumPy deprecated in 1.20 and removed in 1.24.
                    mask_b = np.ones(n_samples, dtype=bool)

                    for j in range(k):
                        mask_b &= X[:, B[j]] == b[j]

                    X_, y_ = X[mask_b, :], y[mask_b]
                    n_samples_b = len(X_)

                    if n_samples_b > 0:
                        children = []

                        for xi in values[X_m]:
                            mask_xi = X_[:, X_m] == xi
                            children.append(y_[mask_xi])

                        imp += (coef
                                * (1. * n_samples_b / n_samples)  # P(B=b)
                                * (entropy(y_) -
                                   sum([entropy(c) * len(c) / n_samples_b
                                        for c in children])))

        return imp

    data = np.array([[0, 0, 1, 0, 0, 1, 0, 1],
                     [1, 0, 1, 1, 1, 0, 1, 2],
                     [1, 0, 1, 1, 0, 1, 1, 3],
                     [0, 1, 1, 1, 0, 1, 0, 4],
                     [1, 1, 0, 1, 0, 1, 1, 5],
                     [1, 1, 0, 1, 1, 1, 1, 6],
                     [1, 0, 1, 0, 0, 1, 0, 7],
                     [1, 1, 1, 1, 1, 1, 1, 8],
                     [1, 1, 1, 1, 0, 1, 1, 9],
                     [1, 1, 1, 0, 1, 1, 1, 0]])

    # First seven columns are boolean features, the last one is the label.
    X, y = np.array(data[:, :7], dtype=bool), data[:, 7]
    n_features = X.shape[1]

    # Compute true importances
    true_importances = np.zeros(n_features)

    for i in range(n_features):
        true_importances[i] = mdi_importance(i, X, y)

    # Estimate importances with totally randomized trees
    clf = ExtraTreesClassifier(n_estimators=500,
                               max_features=1,
                               criterion="entropy",
                               random_state=0).fit(X, y)

    importances = sum(tree.tree_.compute_feature_importances(normalize=False)
                      for tree in clf.estimators_) / clf.n_estimators

    # Check correctness
    assert_almost_equal(entropy(y), sum(importances))
    assert_less(np.abs(true_importances - importances).mean(), 0.01)
max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) #knn = KNeighborsClassifier(algorithm='brute',n_neighbors=3,metric='mahalanobis') nn = MLPClassifier(alpha=0.0001, hidden_layer_sizes=(500, ), random_state=None, max_iter=500, activation='logistic', solver='adam') grad_boost = GradientBoostingClassifier(n_estimators=500, learning_rate=1) extrat = ExtraTreesClassifier(n_estimators=50, max_depth=None, class_weight='balanced') clf_array = [rf, dtree, nn, svml, extrat, grad_boost] eclf = VotingClassifier(estimators=[('Random Forest', rf), ('Decision Tree', dtree), ('NN', nn), ('GRADIENT', grad_boost), ('EXTRAT', extrat)]) #('NN',nn), for clf_array, label in zip([rf, dtree, svml, nn, grad_boost, extrat, eclf], [ 'Random Forest', 'Decision Tree', 'SVML', 'NN', 'GRADIENT', 'EXTRAT', 'Ensemble' ]): #'NN', scores = cross_val_score(clf_array, training_samples,
# Generate a list of all combinations of categories, up to a max length category_subsets = [] max_classes = 5 for L in range(1, max_classes + 1): for subset in itertools.combinations(categories, L): category_subsets.append(subset) # Now make a look-up table for the index corresponding to a tuple of categories subset_index = {} for i, category_subset in enumerate(category_subsets): subset_index[category_subset] = i if do_train_coarse: # Coarse classifier coarse_classifier = Pipeline([ ('features', CountVectorizer(ngram_range=(1,2))), ('classifier', ExtraTreesClassifier(max_depth=150, random_state=88, n_estimators=200, n_jobs=cpu_count()-1)), ]) # Fit coarse classifier print 'Fitting coarse classifier', train.coarse_label) if do_train_fine: # Fine classifiers fine_classifiers = [] for _ in range(len(category_subsets)): fine_classifier = Pipeline([ ('features', CountVectorizer(ngram_range=(1,2))), ('classifier', ExtraTreesClassifier(max_depth=150, random_state=88*2, n_estimators=400, n_jobs=cpu_count()-1)), ])
from sklearn.model_selection import train_test_split

# NOTE(review): the two data arguments were missing from this call; `mnist`
# (a dataset object with `.data` / `.target`) is presumed to be loaded earlier
# in the file - confirm.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

# Exercise: Then train various classifiers, such as a Random Forest classifier,
# an Extra-Trees classifier, and an SVM.
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

random_forest_clf = RandomForestClassifier(random_state=42)
extra_trees_clf = ExtraTreesClassifier(random_state=42)
svm_clf = LinearSVC(random_state=42)
mlp_clf = MLPClassifier(random_state=42)

estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)

# Validation accuracy of each individual classifier (notebook cell output).
[estimator.score(X_val, y_val) for estimator in estimators]
# Out[3]: [0.9467, 0.9512, 0.8327, 0.9592]

# The linear SVM is far outperformed by the other classifiers.
# However, let's keep it for now since it may improve the voting classifier's performance.

# Exercise: Next, try to combine them into an ensemble that outperforms them all on the validation set,
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
import pandas as pd
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split

# The file is read without a header row; the first row is peeled off below.
data = pd.read_csv('Xtrain.csv', sep=',', header=None)
dataset = data.values
header = dataset[0, 1:dataset.shape[1]]
dataset = dataset[1:dataset.shape[0], :]

# Split data into training and testing: column 0 is the target, the
# remaining columns are the features.
X = dataset[:, 1:dataset.shape[1]]
y = dataset[:, 0]
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    random_state=seed)

# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(X_train, y_train)

# display the relative importance of each attribute
print(model.feature_importances_)
LogisticRegression(C=0.1, penalty='l2', solver='lbfgs', n_jobs=-1))), ("nb", OneVsRestClassifier(BernoulliNB(alpha=5.0))), ("rf", OneVsRestClassifier( RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, n_jobs=-1))), ("xgb", OneVsRestClassifier( XGBClassifier(n_estimators=150, max_depth=8, n_jobs=8))), ("et", OneVsRestClassifier( ExtraTreesClassifier(n_estimators=300, max_depth=10, min_samples_split=10, n_jobs=-1))), ("ensemble", OneVsRestClassifier(ensemble)), #("svm", SVC(C=100, gamma=0.0001, probability=True)), ] results = {} X_train = feature_extractor.fit_transform(Xr_train, y_train['label_pa']) X_test = feature_extractor.transform(Xr_test) for name, classifier in models: print(name) results[name] = {} cv = StratifiedKFold(n_splits=5, random_state=42)
# randomised_search(self): hyper-parameter search for an ExtraTreesClassifier.
#
# What the visible code does, in order:
#   1. Builds ExtraTreesClassifier(random_state=20, class_weight='balanced',
#      max_features=self.numf, max_depth=1).
#   2. Defines the sampled parameters: criterion, n_estimators (100..10000),
#      min_samples_split, min_samples_leaf and max_leaf_nodes.
#   3. Runs RandomizedSearchCV with n_iter=self.numc, scoring='accuracy',
#      n_jobs=-1 and keeps best_params_ / best_score_.
#   4. Stores best_estimator_ in self.model and dumps it with joblib to a
#      file named 'best_predictor_<YYYYmmdd_HHMM>.pkl'.
#   5. Calls get_confidence_interval(...) with the label 'uncalibrated'.
#   6. Sorts self.model.feature_importances_ against
#      self.X_train_scaled.columns, averages the per-tree importances, and
#      calls the two plotting helpers.
#
# NOTE(review): this block does not parse as written - several expressions
# look truncated and must be restored from the original source:
#   - `max_depth=1)'Initialised classifier \n')` and the other bare
#     `f'...'` fragments: the call that consumed these messages (presumably
#     a logging call) is missing.
#   - `random_state=5,, n_iter=...`: an argument is missing between the
#     commas (possibly the cv setting - confirm).
#   - `rand_search_fitted =, self.y_train)`: the fit call and its first
#     argument are missing.
#   - `datetime.strftime(, ...)`, `os.path.join(, ...)`,
#     `self.model,, self.bootiter` and
#     `feature_importances_best_estimator(best_clf_feat_import_sorted,`:
#     an argument is missing in each.
# NOTE(review): all_clf_feat_import_mean_sorted is computed but not used in
# the visible code.
def randomised_search(self): print_to_consol('Running randomized search to find best classifier') #create the decision forest clf1 = ExtraTreesClassifier(random_state=20, class_weight='balanced', max_features=self.numf, max_depth=1)'Initialised classifier \n') #set up randomized search param_dict = { 'criterion': ['gini', 'entropy'], 'n_estimators': randint(100, 10000), #number of base estimators to use 'min_samples_split': randint(2, 20), 'min_samples_leaf': randint(1, 20), 'max_leaf_nodes': randint(10, 20) } f'Following parameters will be explored in randomized search \n' f'{param_dict} \n') #building and running the randomized search rand_search = RandomizedSearchCV(clf1, param_dict, random_state=5,, n_iter=self.numc, scoring='accuracy', n_jobs=-1) rand_search_fitted =, self.y_train) best_parameters = rand_search_fitted.best_params_ best_scores = rand_search_fitted.best_score_ f'Running randomised search for best patameters of classifier \n' f'Best parameters found: {best_parameters} \n' f'Best accuracy scores found: {best_scores} \n') self.model = rand_search_fitted.best_estimator_ datestring = datetime.strftime(, '%Y%m%d_%H%M') joblib.dump( self.model, os.path.join(, 'best_predictor_' + datestring + '.pkl'))'Writing best classifier to disk in {} \n') print_to_consol( 'Getting 95% confidence interval for uncalibrated classifier') alpha, upper, lower = get_confidence_interval( self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test, self.model,, self.bootiter, 'uncalibrated')'{alpha}% confidence interval {upper}% and {lower}% \n' f'for uncalibrated classifier. 
\n') print_to_consol('Getting feature importances for best classifier') best_clf_feat_import = self.model.feature_importances_ best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import, self.X_train_scaled.columns), reverse=True) f'Feature importances for best classifier {best_clf_feat_import_sorted} \n' ) all_clf_feat_import_mean = np.mean( [tree.feature_importances_ for tree in self.model.estimators_], axis=0) all_clf_feat_import_mean_sorted = sorted(zip( all_clf_feat_import_mean, self.X_train_scaled.columns), reverse=True) print_to_consol('Plotting feature importances for best classifier') feature_importances_best_estimator(best_clf_feat_import_sorted, f'Plotting feature importances for best classifier in decreasing order \n' ) feature_importances_error_bars(self.model, self.X_train_scaled.columns, f'Plotting feature importances for best classifier with errorbars \n' )
'name': 'Ridge Classifier' }, 'GradientBoostingClassifier': { 'model': GradientBoostingClassifier(max_features=2), 'name': 'Gradient Boost' }, 'SVC': { 'model': SVC(), 'name': 'SVC' }, 'BaggingClassifier': { 'model': BaggingClassifier(), #base_estimator = LinearRegression()), 'name': 'Bagging Classifier' }, 'ExtraTreesClassifier': { 'model': ExtraTreesClassifier(), 'name': 'Extra Trees Classifier' }, 'KNeighborsClassifier': { 'model': KNeighborsClassifier(), 'name': 'K Neighbors Classifier' }, 'DecisionTreeClassifier': { 'model': DecisionTreeClassifier(), 'name': 'Decision Tree Classifier' }, 'AdaBoostClassifier': { 'model': AdaBoostClassifier(), #base_estimator = LinearRegression()), 'name': 'AdaBoost' }, 'LogisticRegression': {
train_x = enc.fit_transform(df) test_y = data_2015['delay'] >= 15 df = data_2015.drop('delay', axis=1) df['carrier'] = pd.factorize(df['carrier'])[0] df['dest'] = pd.factorize(df['dest'])[0] test_x = enc.transform(df) print train_x.shape from sklearn.ensemble import ExtraTreesClassifier # Create Random Forest classifier with 50 trees clf_etc = ExtraTreesClassifier(n_estimators=50, max_depth=None, min_samples_split=1, random_state=0, n_jobs=-1), train_y) # Evaluate on test set pr = clf_etc.predict(test_x.toarray()) # print results cm = confusion_matrix(test_y, pr) print "<------- ExtraTreesClassifier -------->" print "Confusion matrix:" print pd.DataFrame(cm) report_svm = precision_recall_fscore_support(list(test_y), list(pr), average='binary')
columns=["probability(0)", "probability(1)"]) adjusted = pandas.concat((adjusted, adjusted_proba), axis=1) store_csv(adjusted, name + ".csv") build_audit(DecisionTreeClassifier(random_state=13, min_samples_leaf=2), "DecisionTreeAudit", compact=False) build_audit( BaggingClassifier(DecisionTreeClassifier(random_state=13, min_samples_leaf=5), random_state=13, n_estimators=3, max_features=0.5), "DecisionTreeEnsembleAudit") build_audit(DummyClassifier(strategy="most_frequent"), "DummyAudit") build_audit(ExtraTreesClassifier(random_state=13, min_samples_leaf=5), "ExtraTreesAudit") build_audit( GradientBoostingClassifier(random_state=13, loss="exponential", init=None), "GradientBoostingAudit") build_audit( OptimalLGBMClassifier(objective="binary", n_estimators=37, num_iteration=17), "LGBMAudit") build_audit(LinearDiscriminantAnalysis(solver="lsqr"), "LinearDiscriminantAnalysisAudit") build_audit( LogisticRegression(multi_class="multinomial", solver="newton-cg", max_iter=500), "MultinomialLogisticRegressionAudit") build_audit(LogisticRegressionCV(multi_class="ovr"),
def get_model_from_name(model_name, training_params=None):
    """Return a new estimator for ``model_name`` with its parameters applied.

    The stock parameters for the model are looked up, optionally overridden
    by ``training_params``, and applied with ``set_params``. Only the
    requested estimator is instantiated.

    Args:
        model_name: key such as ``'ExtraTreesClassifier'`` or
            ``'LGBMRegressor'``. XGBoost, LightGBM, CatBoost and Keras
            models are only available when the matching ``*_installed``
            module flag is true.
        training_params: optional dict of parameters that overrides the
            stock values.

    Returns:
        The configured estimator instance.

    Raises:
        KeyError: if ``model_name`` is unknown or its library is unavailable.
    """
    # For Keras
    epochs = 250
    if (os.environ.get('is_test_suite', 0) == 'True'
            and model_name.startswith('DeepLearning')):
        print(
            'Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy'
        )
        epochs = 30

    all_model_params = {
        'LogisticRegression': {
            'n_jobs': -2
        },
        'RandomForestClassifier': {
            'n_jobs': -2
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {
            'n_estimators': 10
        },
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearSVC': {
            'dual': False
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2
        },
        'LinearSVR': {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'presort': False,
            'learning_rate': 0.05,
            'warm_start': True
        },
        'GradientBoostingClassifier': {
            'presort': False,
            'learning_rate': 0.05,
            'warm_start': True
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {
            'n_estimators': 10
        },
        'XGBRegressor': {
            'nthread': -1,
            'n_estimators': 200
        },
        'XGBClassifier': {
            'nthread': -1,
            'n_estimators': 200
        },
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.05,
            'num_leaves': 8,
            'lambda_l2': 0.001
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.05,
            'num_leaves': 8,
            'lambda_l2': 0.001
        },
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    model_params = all_model_params.get(model_name, {})

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if
        # the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will be used to initialize the model:'
        )
        print(model_params)

    # Zero-argument factories: nothing is constructed until one is called, so
    # a lookup no longer builds every estimator in the table.
    model_factories = {
        # Classifiers
        'LogisticRegression': LogisticRegression,
        'RandomForestClassifier': RandomForestClassifier,
        'RidgeClassifier': RidgeClassifier,
        'GradientBoostingClassifier': GradientBoostingClassifier,
        'ExtraTreesClassifier': ExtraTreesClassifier,
        'AdaBoostClassifier': AdaBoostClassifier,
        'SGDClassifier': SGDClassifier,
        'Perceptron': Perceptron,
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier,
        'LinearSVC': LinearSVC,

        # Regressors
        'LinearRegression': LinearRegression,
        'RandomForestRegressor': RandomForestRegressor,
        'Ridge': Ridge,
        'LinearSVR': LinearSVR,
        'ExtraTreesRegressor': ExtraTreesRegressor,
        'AdaBoostRegressor': AdaBoostRegressor,
        'RANSACRegressor': RANSACRegressor,
        'GradientBoostingRegressor': GradientBoostingRegressor,
        'Lasso': Lasso,
        'ElasticNet': ElasticNet,
        'LassoLars': LassoLars,
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit,
        'BayesianRidge': BayesianRidge,
        'ARDRegression': ARDRegression,
        'SGDRegressor': SGDRegressor,
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor,

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans
    }

    # Optional libraries are only referenced behind their availability flag.
    if xgb_installed:
        model_factories['XGBClassifier'] = XGBClassifier
        model_factories['XGBRegressor'] = XGBRegressor

    if lgb_installed:
        model_factories['LGBMRegressor'] = LGBMRegressor
        model_factories['LGBMClassifier'] = LGBMClassifier

    if catboost_installed:
        model_factories['CatBoostRegressor'] = (
            lambda: CatBoostRegressor(calc_feature_importance=True))
        model_factories['CatBoostClassifier'] = (
            lambda: CatBoostClassifier(calc_feature_importance=True))

    if keras_installed:
        model_factories['DeepLearningClassifier'] = (
            lambda: KerasClassifier(build_fn=make_deep_learning_classifier))
        model_factories['DeepLearningRegressor'] = (
            lambda: KerasRegressor(build_fn=make_deep_learning_model))

    try:
        model_factory = model_factories[model_name]
    except KeyError:
        print(
            'It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize'
        )
        raise

    return model_factory().set_params(**model_params)
from sklearn.ensemble import ExtraTreesClassifier from sklearn.ensemble import BaggingClassifier import pickle #%% comment_start = 0 comment_end = 50000 matrix_size = 5000 #%% Diğer sınıflandırma metodlarıda karşılaştırılarak en yüksek başarılı sınıf seçilir. models=[GaussianNB(), RandomForestClassifier(n_estimators=100), KNeighborsClassifier(n_neighbors=5), DecisionTreeClassifier(), SVC(gamma='scale'), GradientBoostingClassifier(), LogisticRegression(multi_class="auto", solver="liblinear"), ExtraTreesClassifier(n_estimators=100), BaggingClassifier()] def best_model(models, show_metrics=False): print("INFO: Finding Accuracy Best Classifier...", end="\n\n") best_clf=None best_acc=0 for clf in models:, y_train) y_pred=clf.predict(x_test) acc=metrics.accuracy_score(y_test, y_pred) print(clf.__class__.__name__, end=" ") print("Accuracy: {:.3f}".format(acc)) if best_acc<acc: best_acc=acc
# compute_score(X_train, y_train, X_test, y_test, cat_indicator, n_jobs,
#               orig_timeout)
#
# Scores a pool of scikit-learn classifiers on the training data under a
# time budget, picks the highest-scoring one and returns
# (accuracy_score on the test split, len(X_train.columns), label), where
# label is "pca+rf" or the class name of the chosen estimator.
#
# Visible steps, in order:
#   1. trees / max_iter drop from 100 / 1000 to 10 / 100 when X_train has at
#      least 100000 rows.
#   2. reduce_dimensionality() is applied to X_train, X_test, cat_indicator.
#   3. Categorical columns whose number of unique values is at most
#      log2(rows) (a quarter of that above 100000 rows) are one-hot encoded.
#   4. SimpleImputer(strategy='median') and RobustScaler are fit on X_train.
#   5. KNeighborsClassifier is appended for small inputs (<= 50 attributes
#      and <= 10000 rows); max_features / n_estimators of some ensembles are
#      adjusted by index.
#   6. Every classifier is submitted to a Pool of n_jobs workers through
#      score_solution(); results are collected with a shrinking timeout and a
#      failed solution gets the score -1.
#   7. If at least 10 time units remain and there are fewer than 100000 rows,
#      a PCA (at most 10 components) + RandomForest variant is scored too.
#   8. While more than 10% of orig_timeout remains (and trees <= 1000),
#      larger ExtraTrees / RandomForest / Bagging ensembles are appended and
#      scored.
#   9. The argmax-score model is selected, the same encoding and model_steps
#      are applied to X_test, and accuracy is computed.
#
# NOTE(review): the two refit statements near the end read
# `best_model = RFpca, y_train)` and `print(best_model), y_train)`; the
# `.fit(...)` calls look truncated and the block does not parse as written.
# NOTE(review): `bestindex == 10` treats index 10 as the PCA model, but when
# KNeighborsClassifier was appended in step 5 index 10 is the KNN model.
# NOTE(review): the TimeoutError handlers append no score, so `scores` can
# fall out of step with `classifiers` before the argsort lookup.
# NOTE(review): the while loop always submits `classifiers[10:13]`, not the
# three estimators appended in the current iteration.
def compute_score(X_train, y_train, X_test, y_test, cat_indicator, n_jobs, orig_timeout): trees = 100 max_iter = 1000 if len(X_train) >= 100000: trees = 10 max_iter = 100 print("Start!") timeout = orig_timeout start = timer() (X_train, X_test, cat_indicator) = reduce_dimensionality(X_train, X_test, cat_indicator) classifiers = [BernoulliNB(), LinearDiscriminantAnalysis(), LogisticRegression(random_state=1), AdaBoostClassifier(random_state=1), LinearSVC(max_iter=max_iter, random_state=1), ExtraTreesClassifier(random_state=1, n_estimators=trees), RandomForestClassifier(random_state=1, n_estimators=trees), BaggingClassifier(random_state=1, n_estimators=10), MLPClassifier(random_state=1,early_stopping=True), GradientBoostingClassifier(max_features=5, random_state=1, n_estimators=10)] model_steps = [SimpleImputer(strategy='median'), RobustScaler()] ohe = OneHotEncoder(handle_unknown='ignore', sparse=False) cats = [] rows = len(X_train) # use rule of thumb to exclude categorical atts with high cardinality for one-hot-encoding max_num_cols = math.log(rows, 2) if rows > 100000: max_num_cols = max_num_cols/4 # Iterate over all categorical attributes for i in range(len(cat_indicator)): if cat_indicator[i] is True: arity = len(X_train.iloc[:,i].unique()) if arity <= max_num_cols: cats.append(i) if len(cats) > 0: start1=timer() X_train.reset_index(drop=True,inplace=True) X_object = X_train.iloc[:,cats] codes = ohe.fit_transform(X_object) X_train = pd.concat([X_train.drop(X_train.columns[cats],axis=1), pd.DataFrame(codes).astype(int)], axis=1) end1=timer() for m in model_steps: X_train = m.fit_transform(X_train) y_train = pd.DataFrame(y_train) X_train = pd.DataFrame(X_train) num_atts = X_train.shape[1] if num_atts <= 50 and rows <= 10000: classifiers.append(KNeighborsClassifier(n_neighbors=10)) # For ensembles if num_atts >= 500: classifiers[5].max_features="log2" classifiers[6].max_features="log2" classifiers[7].max_features=0.8 classifiers[9].max_features=min(5, num_atts) #
For bagging if num_atts < 100: if rows <= 10000: classifiers[7].n_estimators = 100 elif rows <= 50000: classifiers[7].n_estimators = 50 else: classifiers[7].n_estimators = 10 async_message_thread = Pool((int)(n_jobs)) results = [async_message_thread.apply_async(score_solution, (X_train, y_train, c)) for c in classifiers] index = 0 scores = [] end = timer() time_used = end - start timeout = timeout - time_used print("time remaining = ", timeout) for r in results: try: start_solution = timer() score = r.get(timeout=timeout) scores.append(score) end_solution = timer() time_used = end_solution - start_solution timeout = timeout - time_used if timeout <= 0: timeout = 3 except TimeoutError: timeout = 1 except: print(sys.exc_info()[0]) print("Solution terminated: ", classifiers[index]) print(X_train.shape) scores.append(-1) end_solution = timer() time_used = end_solution - start_solution timeout = timeout - time_used if timeout <= 0: timeout = 1 index = index + 1 pca = None RFpca = None print("time remaining = ", timeout) if timeout >= 10 and len(X_train) < 100000: from sklearn.decomposition import PCA start_solution = timer() n_comp = min(10, X_train.shape[1]) pca = PCA(n_components=n_comp) Xpca = pca.fit_transform(X_train) end_solution = timer() time_used = end_solution - start_solution print("PCA = ", time_used) RFpca = RandomForestClassifier(random_state=1) score = score_solution(pd.DataFrame(Xpca), y_train, RFpca) scores.append(score) classifiers.append(RFpca) else: classifiers.append(None) scores.append(-1) timeout = timeout - time_used bagged_trees = classifiers[7].n_estimators while timeout > 0.1 * orig_timeout: trees = trees + 100 print("Trying trees = ", trees) classifiers.append(ExtraTreesClassifier(random_state=1, n_estimators=trees)) classifiers.append(RandomForestClassifier(random_state=1, n_estimators=trees)) bagged_trees = bagged_trees + 10 classifiers.append(BaggingClassifier(random_state=1, max_features=classifiers[7].max_features,
n_estimators=bagged_trees)) results = [async_message_thread.apply_async(score_solution, (X_train, y_train, c)) for c in classifiers[10:13]] for r in results: try: start_solution = timer() score = r.get(timeout=timeout) scores.append(score) end_solution = timer() time_used = end_solution - start_solution timeout = timeout - time_used if timeout <= 0: timeout = 1 except TimeoutError: timeout = 1 except: print(sys.exc_info()[0]) print("Solution terminated: ") scores.append(-1) end_solution = timer() time_used = end_solution - start_solution timeout = timeout - time_used if timeout <= 0: timeout = 1 if trees > 1000: break print(scores) # Sort solutions by their scores and rank them sorted_x = np.argsort(scores) best_model = None bestindex = sorted_x[len(scores)-1] if bestindex == 10: # Best is PCA-RF model best_model = RFpca, y_train) model_steps.append(pca) cl = "pca+rf" else: best_model = classifiers[bestindex] print(best_model), y_train) cl = type(best_model).__name__ if len(cats) > 0: # OHE X_test.reset_index(drop=True,inplace=True) X_object = X_test.iloc[:,cats] codes = ohe.transform(X_object) X_test = pd.concat([X_test.drop(X_test.columns[cats],axis=1), pd.DataFrame(codes).astype(int)], axis=1) for m in model_steps: X_test = m.transform(X_test) y_hat = best_model.predict(X_test) best = accuracy_score(y_test, y_hat) #for c in classifiers: #, y_train) # y_hat = c.predict(X_test) # best1 = accuracy_score(y_test, y_hat) # print(c) # print(best1) return (best, len(X_train.columns), cl)
learning_rate=0.75), train_y) model3 = RandomForestClassifier(n_jobs=-1, n_estimators=500, warm_start=True, #'max_features': 0.2, max_depth=6, min_samples_leaf=2, max_features='sqrt', verbose=0 ), train_y) model4 = ExtraTreesClassifier(n_jobs=-1, n_estimators=500, #max_features=0.5, max_depth=8, min_samples_leaf=2, verbose=0), train_y) model5 = SVC(kernel='linear', C=0.025), train_y) train_X1 = model1.predict(train_X) train_X2 = model2.predict(train_X) train_X3 = model3.predict(train_X) train_X4 = model4.predict(train_X) train_X5 = model5.predict(train_X) train_X1 = train_X1[:, np.newaxis] train_X2 = train_X2[:, np.newaxis]
def _rank_and_score(importance, y_pred, y_test, index):
    """Record the top-``top_num`` feature positions and print test metrics."""
    # Rank the features, strongest first, and keep the leading positions.
    top = pd.Series(importance).sort_values(ascending=False)
    top_features = list(top.index)[:top_num]
    print(top_features)
    index.extend(top_features)

    # Round the raw predictions to class labels before scoring.
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    precision = precision_score(y_test, predictions)
    print("precision: %.2f%%" % (precision * 100.0))


def fun(in_road):
    """Vote for the most important feature columns of a CSV data set.

    For each of six train/test splits an XGBoost, a LightGBM and an
    extra-trees model are fitted; the top-``top_num`` feature positions of
    every model are collected and the most frequent ones are returned.

    Args:
        in_road: path of a comma-separated file with one header row; the
            last column is the label, all other columns are features.

    Returns:
        The result of ``most_common(top_num)`` on the counter built by
        ``get_count_by_counter`` over all collected feature positions.
    """
    start = time.time()
    index = []

    # Number of columns in the csv file; the last one is the label.
    col_num = get_col.getCol(in_road)
    data_dimension = col_num - 1

    # Load the data set.
    dataset = loadtxt(in_road, delimiter=",", skiprows=1)
    print(type(dataset))

    # Split the data into x and y (slice end is exclusive).
    x = dataset[:, 0:data_dimension]
    y = dataset[:, data_dimension]

    # Run once per seed; the per-run rankings are voted on afterwards.
    random_s = [8, 20, 40, 100, 200, 1000]
    for rs in random_s:
        # Split the data set into a training and a test set.
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.33, random_state=rs)

        print("-----------------XGBoost-----------------")
        # Fit the XGBoost model.
        model1 = XGBClassifier(
            learning_rate=0.1,
            n_estimators=1000,  # number of trees
            max_depth=5,  # tree depth
            min_child_weight=1,  # minimum weight of a leaf node
            gamma=0.,  # penalty coefficient on the number of leaves
            subsample=0.8,  # build each tree on a random 80% of the samples
            colsample_bytree=0.8,  # ... and a random 80% of the features
            objective='reg:logistic',  # loss function
            scale_pos_weight=1,  # class-imbalance weighting
            random_state=27  # random seed
        )
        model1.fit(x_train, y_train)
        _rank_and_score(model1.feature_importances_, model1.predict(x_test),
                        y_test, index)

        print("-----------------LightGBM-----------------")
        params = {
            'task': 'train',
            'boosting_type': 'gbdt',  # GBDT as the base algorithm
            'objective': 'binary',
            'metric': 'auc',  # evaluation metric
            'max_bin': 255,  # larger is more accurate but slower
            'learning_rate': 0.1,  # learning rate
            'num_leaves': 64,  # larger is more accurate but may overfit
            # 'max_depth': -1,  limiting the depth helps against overfitting
            # on small data sets; a value below 0 means no limit
            'feature_fraction': 0.8,  # against overfitting
            'bagging_freq': 5,  # against overfitting
            'bagging_fraction': 0.8,  # against overfitting
            'min_data_in_leaf': 10,  # against overfitting
            'min_sum_hessian_in_leaf': 3.0,  # against overfitting
            # 'header': True  whether the data set has a header row
            'verbose': -1  # silence "No further splits with positive gain"
        }
        lgb_train = lgb.Dataset(x_train, label=y_train)
        model2 = lgb.train(params, train_set=lgb_train)
        _rank_and_score(model2.feature_importance(), model2.predict(x_test),
                        y_test, index)

        print("-----------------ExtraTree是随机森林的一个变种-----------------")
        model4 = ExtraTreesClassifier(n_estimators=10, max_depth=None,
                                      min_samples_split=2, random_state=0)
        model4.fit(x_train, y_train)
        _rank_and_score(model4.feature_importances_, model4.predict(x_test),
                        y_test, index)

    end = time.time()
    running_time = end - start
    print('-----------time--------')
    print(running_time)
    print(index)

    # Count how often each feature position was selected and keep the
    # most frequent ones.
    counts = get_count_by_counter(index)
    top_index = counts.most_common(top_num)
    return top_index
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9270935960591131
exported_pipeline = make_pipeline(
    # The LinearSVC is wrapped in a StackingEstimator ahead of the forest.
    StackingEstimator(estimator=LinearSVC(
        C=0.01, dual=True, loss="hinge", penalty="l2", tol=1e-05)),
    ExtraTreesClassifier(bootstrap=True,
                         criterion="entropy",
                         max_features=0.7500000000000001,
                         min_samples_leaf=5,
                         min_samples_split=2,
                         n_estimators=100))

# The pipeline has to be fitted before it can predict.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
}, 'lr': { 'cv_param': { 'C': [.01, .05, .1, .5, 1.0, 5.0, 10.0], 'penalty': ['l1', 'l2'] }, 'estimator': LogisticRegression(random_state=Repeat * 10 + 2) }, 'et': { 'cv_param': { 'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7, None], 'n_estimators': [10, 20, 30, 50, 100] }, 'estimator': ExtraTreesClassifier(n_jobs=-1, random_state=Repeat * 10 + 2) }, 'rf': { 'cv_param': { 'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7, None], 'n_estimators': [10, 20, 30, 50, 100] }, 'estimator': RandomForestClassifier(n_jobs=-1, random_state=Repeat * 10 + 2) } } n_clf = len(CLF) Fscore_trn = np.zeros(n_clf) Fscore_tst = np.zeros(n_clf) prob_trn = np.zeros([n_clf, n_cases_trn, 10])
def classifier():
    """Train four tree ensembles with stratified K-fold and blend their outputs.

    For every base classifier the out-of-fold class-1 probabilities fill one
    column of ``dataset_blend_train``; the per-fold predictions on the
    submission set are averaged into the matching column of
    ``dataset_blend_test``.  The plain average of the base models is written
    to ``test_ans.csv`` and scored, then a logistic regression is fitted on
    the out-of-fold columns ("blending") and scored as well.

    Data comes from ``load()`` and scoring uses ``logloss()``; both are
    defined outside this function.  Nothing is returned.

    Every ``print`` uses the single-argument parenthesised form, which emits
    the same text under Python 2 and Python 3.
    """
    np.random.seed(0)  # seed to shuffle the train set

    n_folds = 5
    shuffle = False
    expected_n_features = 1776  # row width the sanity check below expects

    X, y, X_submission, soln = load()

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = list(StratifiedKFold(y, n_folds))

    clfs = [
        RandomForestClassifier(n_estimators=10, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
    ]

    print("Creating train and test sets for blending.")

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print("%s %s" % (j, clf))
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print("Fold %s" % i)
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]

            # Sanity check: report any training row with an unexpected width.
            for item in X_train:
                if len(item) != expected_n_features:
                    print(len(item))

            # Fit on this fold's training part; the held-out part provides the
            # out-of-fold column used later by the blender.
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print(len(dataset_blend_test[0]))

    print("Without Blending")
    y_submission = dataset_blend_test.mean(1)
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print("Saving Results.")
    np.savetxt(fname='test_ans.csv', X=y_submission, fmt='%0.9f')

    print("LogLoss.")
    print(logloss(y_submission, soln))

    print("Blending.")
    # The blender learns how to weight the base models from their
    # out-of-fold predictions.
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print("Linear stretch of predictions to [0,1]")
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())

    print("LogLoss.")
    print(logloss(y_submission, soln))
# # 传统决策树,随机森林算法 极端随机数的区别 DT = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0) RF = RandomForestClassifier(n_estimators=10, max_features=math.sqrt(n_features), max_depth=None, min_samples_split=2, bootstrap=True) EC = ExtraTreesClassifier(n_estimators=10, max_features=math.sqrt(n_features), max_depth=None, min_samples_split=2, bootstrap=False) # 训练, y_train), y_train), y_train) #区域预测 # 第0列的范围 x1_min, x1_max = x[:, 0].min(), x[:, 0].max() # 第1列的范围 x2_min, x2_max = x[:, 1].min(), x[:, 1].max() # 生成网格采样点行列均为200点 x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j]
tmp_len = len(train[train_series.isnull()]) if tmp_len > 0: #print "mean", train_series.mean() train.loc[train_series.isnull(), train_name] = -999 #and Test tmp_len = len(test[test_series.isnull()]) if tmp_len > 0: test.loc[test_series.isnull(), test_name] = -999 X_train = train X_test = test print('Training...') extc = ExtraTreesClassifier(n_estimators=750, max_features=60, criterion='entropy', min_samples_split=4, max_depth=40, min_samples_leaf=2, n_jobs=-1), target) print('Predict...') y_pred = extc.predict_proba(X_test) #print y_pred pd.DataFrame({ "ID": id_test, "PredictedProb": y_pred[:, 1] }).to_csv('extra_trees.csv', index=False)
def TestPerformance2(X, Y, nF=3, testTimes=10, bScaled=1, _test_size=0.1):
    """Time five feature-ranking methods over repeated random splits.

    On each of ``testTimes`` iterations a fresh train/test split is drawn and
    the following are fitted, with their feature scores passed to
    ``HitFeatures`` together with ``nF``:

    0. ``IWKNN`` (fitted on the training split; the value returned by ``fit``
       is used as the weight vector)
    1. ``ExtraTreesClassifier`` (``feature_importances_``)
    2. ``RFE`` around ``LogisticRegression``
    3. ``RFE`` around a linear-kernel ``SVC``
    4. ``Ridge`` (``coef_``)

    Methods 1-4 are fitted on the full ``X``/``Y``.  The accumulated
    wall-clock time per method is printed at the end; nothing is returned.

    :param X: feature matrix
    :param Y: target vector
    :param nF: number of features to select / report
    :param testTimes: number of repetitions
    :param bScaled: when equal to 1, ``X`` is standardised with
        ``preprocessing.scale`` first
    :param _test_size: ``test_size`` passed to ``train_test_split``
    """
    # Local import so the function does not depend on how (or whether) the
    # module imports datetime.
    from datetime import datetime

    # NOTE(review): the timestamp calls and the first operand of every fit
    # call were missing from this function; they are reconstructed here from
    # the surviving second operands (y_train pairs with X_train, Y with X).
    # Confirm against the original implementation.

    if bScaled == 1:
        X = preprocessing.scale(X)
    print('--------------------START-----------')

    hitResult0 = []
    hitResult1 = []
    hitResult2 = []
    hitResult3 = []
    hitResult4 = []
    times = np.zeros(5, )

    for _ in range(testTimes):
        X_train, _X_test, y_train, _y_test = train_test_split(
            X, Y, test_size=_test_size)

        starttime = datetime.now()
        model = IWKNN()
        w = model.fit(X_train, y_train)
        endtime = datetime.now()
        # total_seconds() keeps the sub-second part; .seconds would truncate
        # every fast fit to 0.
        times[0] = times[0] + (endtime - starttime).total_seconds()
        print('------------IWKNN------------')
        HitFeatures(hitResult0, w, nF)

        # fit an Extra Trees model to the data
        starttime = datetime.now()
        model = ExtraTreesClassifier()
        model.fit(X, Y)
        endtime = datetime.now()
        times[1] = times[1] + (endtime - starttime).total_seconds()
        print('------------ExtraTreesClassifier------------')
        # the relative importance of each attribute
        HitFeatures(hitResult1, model.feature_importances_, nF)

        starttime = datetime.now()
        model = LogisticRegression()
        # create the RFE model and select nF attributes; the keyword form is
        # accepted by every scikit-learn version, the positional one is not
        rfe = RFE(model, n_features_to_select=nF)
        rfe = rfe.fit(X, Y)
        endtime = datetime.now()
        times[2] = times[2] + (endtime - starttime).total_seconds()
        # summarize the selection of the attributes
        print('------------rfe logistic regression------------')
        HitFeatures(hitResult2, support2value(rfe.support_), nF)

        starttime = datetime.now()
        model = svm.SVC(kernel='linear')
        rfe = RFE(model, n_features_to_select=nF)
        rfe = rfe.fit(X, Y)
        endtime = datetime.now()
        times[3] = times[3] + (endtime - starttime).total_seconds()
        print('------------rfe svm linear------------')
        HitFeatures(hitResult3, support2value(rfe.support_), nF)

        starttime = datetime.now()
        ridge = Ridge(alpha=1)
        ridge.fit(X, Y)
        endtime = datetime.now()
        times[4] = times[4] + (endtime - starttime).total_seconds()
        print('------------ridge------------')
        HitFeatures(hitResult4, ridge.coef_, nF)

    print('time=')
    print(times)
"QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(), "SupportVectorMachine": SVC(kernel="poly", degree=5), "LogisticRegression": LogisticRegression(solver="saga", n_jobs=-1), "ArtificalNeuralNetwork": MLPClassifier(hidden_layer_sizes=30, max_iter=2000, solver="lbfgs"), "DecisionTree": DecisionTreeClassifier(random_state=42), "ExtraTree": ExtraTreeClassifier(random_state=42), "RandomForest": RandomForestClassifier(n_jobs=-1, random_state=42), "ExtraTrees": ExtraTreesClassifier(n_jobs=-1, random_state=42), "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="error", n_jobs=-1, random_state=42), "LightGBM": LGBMClassifier(n_estimators=128, n_jobs=-1, random_state=42), "AdaBoost": AdaBoostClassifier(n_estimators=128, learning_rate=1.0, random_state=42), "Bagging": BaggingClassifier(n_estimators=128, n_jobs=-1, random_state=42), "GradientBoosting": GradientBoostingClassifier(n_estimators=128, learning_rate=1.0, random_state=42),
num_round, watchlist, obj=logregobj, feval=evalerror) # scikit-learn ExtraTreesClassifier.................................. import gc from time import time from sklearn.pipeline import make_pipeline from sklearn.ensemble import ExtraTreesClassifier from sklearn.model_selection import StratifiedKFold, GridSearchCV from sklearn.model_selection import cross_val_score from sklearn.metrics import roc_auc_score pipe_ext = make_pipeline(ExtraTreesClassifier( random_state=SEED, n_jobs=CPU, )) param_grid_ext = { 'extratreesclassifier__n_estimators': [1000], 'extratreesclassifier__max_depth': [4, 6, 8], 'extratreesclassifier__min_samples_split': [10], 'extratreesclassifier__min_samples_leaf': [10], 'extratreesclassifier__max_features': ['sqrt'], 'extratreesclassifier__n_jobs': [CPU] } gridcv_ext = GridSearchCV(pipe_ext, param_grid=param_grid_ext, scoring='roc_auc', n_jobs=1, cv=StratifiedKFold(n_splits=5, shuffle=True,
# ## Selecting the best Features for our Model

# In[42]:

# Split the frame into the feature columns and the "quality" target column.
x = df4.drop("quality", axis=1)
y = df4["quality"]

# In[43]:

from sklearn.ensemble import ExtraTreesClassifier

# Fit an extra-trees ensemble; its impurity-based importances rank the features.
model = ExtraTreesClassifier()
model.fit(x, y)

# In[44]:

print(model.feature_importances_)

# In[45]:

# Label each importance with its column name and plot the nine largest.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(9).plot(kind="barh")
class SentimentAnalysis:
    """Load phrases, normalise them and tune tree ensembles by cross-validation.

    The methods read ``self.stopWords``, ``self.word2id``, ``self.X`` and
    ``self.y``.  None of these is assigned inside this class, so each must be
    set on the instance before the method that uses it is called.
    """

    def readFile(self, filePath):
        """Read a tab-separated file, skipping its first (header) row.

        :param filePath: path of the file to read
        :return: ``(data, y)`` where ``data`` holds column 2 of every row and
            ``y`` holds column 3 of the rows that have more than three columns
        """
        data = []
        y = []
        # newline='' is the mode the csv module asks for on text files.
        with open(filePath, 'r', newline='') as file:
            csvreader = csv.reader(file, delimiter='\t')
            # Skip the header; the default keeps an empty file from raising
            # StopIteration.
            next(csvreader, None)
            for row in csvreader:
                data.append(row[2])
                if len(row) > 3:
                    y.append(row[3])
        return data, y

    def preprocess(self, data):
        """Lower-case, tokenise, drop stop words, stem and drop numeric tokens.

        :param data: iterable of phrase strings
        :return: list with one space-joined token string per phrase
        """
        # Both helpers hold no per-phrase state, so build them once.
        tokenizer = RegexpTokenizer(r'\w+')
        stemmer = SnowballStemmer("english")

        preprocessedCorpus = []
        for phrase in data:
            tokens = tokenizer.tokenize(phrase.lower())
            # Stop words are filtered before stemming, as in the original flow.
            stemmedTokens = [stemmer.stem(token) for token in tokens
                             if token not in self.stopWords]
            # Remove numbers.  The filtered list is the one that must be
            # joined; joining the unfiltered list left the numbers in place.
            finalTokens = [token for token in stemmedTokens
                           if not token.isnumeric()]
            preprocessedCorpus.append(" ".join(finalTokens))
        return preprocessedCorpus

    def extractFeatures(self, corpus):
        """Map every phrase of *corpus* to the list of its word ids.

        :param corpus: iterable of space-joined token strings
        :return: list of lists of ids looked up in ``self.word2id``
        :raises KeyError: if a word is missing from ``self.word2id``
        """
        # NOTE(review): a discarded ``CountVectorizer(...),
        # ClassifierOvOAsFeatures()`` expression used to sit here.  It
        # referenced names that are not defined in this method (binary,
        # min_df, stopwords) and its value was never used, so it was removed.
        #
        # split() without an argument yields no words for an empty phrase,
        # where split(" ") would yield a single empty string.
        return [[self.word2id[word] for word in phrase.split()]
                for phrase in corpus]

    def _sweep(self, makeClassifier, paramRange, optimLabel, xlabel, title,
               announce=False, echoScores=False):
        """Cross-validate one classifier per value of *paramRange*, report and plot."""
        scoreCrossVal = []
        for value in paramRange:
            if announce:
                print("Running model " + str(value) + "...")
            self.classifier = makeClassifier(value)
            scores = cross_val_score(self.classifier, self.X, self.y, cv=10)
            scoreCrossVal.append(scores.mean())
            if echoScores:
                print(scores.mean())

        index, val = max(enumerate(scoreCrossVal), key=operator.itemgetter(1))
        print("Max cross validation score: " + str(val))
        print(optimLabel + str(paramRange[index]))

        plt.figure()
        plt.plot(paramRange, scoreCrossVal)
        plt.xlabel(xlabel)
        plt.ylabel('Cross validation score')
        plt.title(title)

    def classify(self):
        """Run three 10-fold cross-validation sweeps and plot each score curve.

        1. Random forest over ``min_samples_leaf`` in 1..99 (prints each mean).
        2. Random forest over ``max_depth`` in 30, 35, ..., 95.
        3. Extra-trees over ``min_samples_leaf`` in 1..99 (announces each run).

        ``self.classifier`` is left holding the last estimator constructed.
        """
        self._sweep(
            lambda leafSize: RandomForestClassifier(
                n_estimators=200, criterion='gini',
                min_samples_leaf=leafSize, n_jobs=-1),
            range(1, 100),
            "Optimal min leaf node size: ",
            'Minimum samples in leaf node',
            'Random Forest',
            echoScores=True)

        self._sweep(
            lambda depth: RandomForestClassifier(
                n_estimators=200, criterion='gini',
                max_depth=depth, n_jobs=-1),
            range(30, 100, 5),
            "Optimal max tree depth: ",
            'Maximum tree depth',
            'Random Forest')

        # Try an extremely randomized forest.
        self._sweep(
            lambda leafSize: ExtraTreesClassifier(
                n_estimators=200, criterion='gini',
                min_samples_leaf=leafSize),
            range(1, 100),
            "Optimal min leaf node size: ",
            'Minimum samples in leaf node',
            'Extremely Randomized Forest',
            announce=True)
class BaseSkModel(object):
    """Hold model-level settings: model name, folder paths, target columns, etc.

    The data work itself is delegated to the ``proc`` object created in
    ``__init__``; this class mostly routes frames to it and post-processes
    the predictions.
    """

    # Version name of the model.
    version_str = 'base'
    # Name of the learning model (e.g. XGBoost); set from the constructor argument.
    model_name = ''
    # Parent folder in which model data is stored.
    model_path = ""
    # Classification axes; these columns are dropped from the frames before
    # learning (see get_filter_df / get_all_learning_df_for_save).
    class_list = ['競走種別コード', '場コード']
    # Target flag columns; learning and prediction loop over each entry.
    obj_column_list = ['WIN_FLAG', 'JIKU_FLAG', 'ANA_FLAG']
    # Folder for ensemble intermediate data (assigned in _set_folder_path).
    ens_folder_path = ""
    # Path of the dictionary folder.
    dict_folder = ""
    # Primary-key columns of the target data.
    # (Original note: use RACE when the mode is Race.)
    index_list = ["RACE_KEY", "UMABAN", "NENGAPPI"]
    # Estimators used for ensemble learning.  The list lives on the class, so
    # the same estimator instances are shared by all BaseSkModel objects.
    clfs = [
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                   max_depth=6, n_estimators=50),
        KNeighborsClassifier(n_neighbors=10, n_jobs=-1),
        GaussianNB(),
        XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5,
                      min_child_weight=1, gamma=0, subsample=0.8,
                      colsample_bytree=0.5, objective='binary:logistic',
                      scale_pos_weight=1, seed=0),
    ]
    # Placeholder until create_learning_data() stores the learning frame.
    learning_df = ""

    def __init__(self, model_name, version_str, start_date, end_date,
                 mock_flag, test_flag, mode):
        self.model_name = model_name
        self.version_str = version_str
        self.start_date = start_date
        self.end_date = end_date
        self.dict_path = mc.return_base_path(test_flag)
        self._set_folder_path(mode)
        self.model_folder = self.model_path + model_name + '/'
        self.proc = self._get_skproc_object(version_str, start_date, end_date,
                                            model_name, mock_flag, test_flag)

    def _set_folder_path(self, mode):
        """Derive the model, dictionary and ensemble folders from ``dict_path``."""
        self.model_path = self.dict_path + 'model/' + self.version_str + '/'
        self.dict_folder = self.dict_path + 'dict/' + self.version_str + '/'
        self.ens_folder_path = (self.dict_path + 'intermediate/'
                                + self.version_str + '_' + mode + '/')

    def _get_skproc_object(self, version_str, start_date, end_date,
                           model_name, mock_flag, test_flag):
        """Build the ``BaseSkProc`` that performs the actual data processing."""
        print("-- check! this is BaseSkModel class: " + sys._getframe().f_code.co_name)
        proc = BaseSkProc(version_str, start_date, end_date, model_name,
                          mock_flag, test_flag, self.obj_column_list)
        return proc

    def create_learning_data(self):
        """Create the learning data; the work is delegated to ``proc``."""
        self.learning_df = self.proc.proc_create_learning_data()

    def get_all_learning_df_for_save(self):
        """Return the learning frame without the ``class_list`` columns."""
        save_learning_df = self.learning_df.drop(self.class_list, axis=1)
        return save_learning_df

    def get_val_list(self, df, cls_val):
        """Return the distinct values of column *cls_val* as strings."""
        val_list = df[cls_val].drop_duplicates().astype(str)
        return val_list

    def get_filter_df(self, df, cls_val, val):
        """Return the rows where *cls_val* equals *val*, without the class columns.

        For the ``コース`` column the value is quoted in the query; for any
        other column *val* is concatenated as-is and must already be a string.
        """
        if cls_val == "コース":
            query_str = cls_val + " == '" + str(val) + "'"
        else:
            query_str = cls_val + " == " + val
        print(query_str)
        # Drop the classification columns.  A non-inplace drop avoids
        # modifying the query result in place.
        return df.query(query_str).drop(self.class_list, axis=1)

    def create_featrue_select_data(self, learning_df):
        """Run the per-target feature-creation step (e.g. target encoding).

        :param dataframe learning_df: dataframe
        """
        self.proc.proc_create_featrue_select_data(learning_df)

    def proc_learning_sk_model(self, df, cls_val, val):
        """Train one model per target column for the given class value.

        Training is skipped when ``df.dropna()`` is empty or when *df* has
        fewer than 30 rows.

        :param dataframe df: dataframe
        :param str cls_val: classification column name
        :param str val: value of that column being trained
        """
        if df.dropna().empty:
            print("---- NaNデータが含まれているため学習をスキップ")
            return
        if len(df.index) < 30:
            print("---- 少数レコードのため学習スキップ -- " + str(len(df.index)))
            return

        print("----- アンサンブル学習用のクラスをセット -----")
        self.proc.set_ensemble_params(self.clfs, self.index_list,
                                      self.ens_folder_path)
        print("proc_learning_sk_model: df", df.shape)
        for target in self.obj_column_list:
            print(target)
            self.proc.learning_sk_model(df, cls_val, val, target)

    def create_predict_data(self):
        """Create the prediction data; the work is delegated to ``proc``."""
        predict_df = self.proc.proc_create_predict_data()
        return predict_df

    def proc_predict_sk_model(self, df, cls_val, val):
        """Predict every target column and return the results stacked in one frame.

        Each non-empty per-target prediction gets ``target``, ``target_date``
        (``NENGAPPI`` formatted as ``%Y/%m/%d``) and ``model_name`` columns.

        :param dataframe df: dataframe
        :param str val: str
        :return: dataframe rounded to 3 decimals; empty when nothing was predicted
        """
        frames = []
        if not df.empty:
            for target in self.obj_column_list:
                pred_df = self.proc._predict_sk_model(df, cls_val, val, target)
                if pred_df.empty:
                    continue
                pred_df["target"] = target
                pred_df["target_date"] = pred_df["NENGAPPI"].dt.strftime('%Y/%m/%d')
                pred_df["model_name"] = self.model_name
                frames.append(pred_df)
        if not frames:
            return pd.DataFrame()
        # One concat at the end; rounding once gives the same values as
        # rounding after every append.
        return pd.concat(frames).round(3)

    def create_import_data(self, all_df):
        """Average the predictions per horse and target (vote) and rank them.

        Rows containing NaN are dropped from *all_df* in place.

        :param dataframe all_df: stacked predictions
        :return: dataframe with the import columns, rounded to 3 decimals
        """
        all_df.dropna(inplace=True)
        # numeric_only=True states explicitly that text/date columns are left
        # out of the average; target_date is merged back in below.
        grouped_all_df = all_df.groupby(
            ["RACE_KEY", "UMABAN", "target"],
            as_index=False).mean(numeric_only=True)
        date_df = all_df[["RACE_KEY", "target_date"]].drop_duplicates()
        temp_grouped_df = pd.merge(grouped_all_df, date_df, on="RACE_KEY")
        grouped_df = self._calc_grouped_data(temp_grouped_df)
        import_df = grouped_df[[
            "RACE_KEY", "UMABAN", "pred", "prob", "predict_std",
            "predict_rank", "target", "target_date"
        ]].round(3)
        print(import_df)
        return import_df

    def eval_pred_data(self, df):
        """Print, per target, the hit rate of the rank-1 predictions."""
        check_df = self.proc.create_eval_prd_data(df)
        for target in self.obj_column_list:
            print(target)
            target_df = check_df[check_df["target"] == target]
            # Copy before adding a column so the assignment never writes into
            # a view of check_df.
            target_df = target_df.query("predict_rank == 1").copy()
            target_df["的中"] = (target_df[target] == 1).astype(int)
            print(target_df)
            avg_rate = target_df["的中"].mean()
            print(round(avg_rate * 100, 1))

    def import_data(self, df):
        """Placeholder that only prints a trace line."""
        print("-- check! this is BaseSkModel class: " + sys._getframe().f_code.co_name)

    @classmethod
    def get_recent_day(cls, start_date):
        """Placeholder that only prints a trace line."""
        print("-- check! this is BaseSkModel class: " + sys._getframe().f_code.co_name)

    def set_target_date(self, start_date, end_date):
        """Set the date range used when creating learning and other data.

        :param str start_date: start date (string)
        :param str end_date: end date (string)
        """
        self.start_date = start_date
        self.end_date = end_date

    def set_test_table(self, table_name):
        """Set the table used for testing."""
        self.table_name = table_name

    def _calc_grouped_data(self, df):
        """Add a deviation score and a dense rank of ``prob`` within each race/target.

        ``predict_std`` is ``(prob - group mean) / group std * 10 + 50``;
        ``predict_rank`` ranks ``prob`` in descending order.  A
        ``predict_rank`` column is also written onto *df* itself.

        :param dataframe df: dataframe
        :return: dataframe
        """
        group_keys = ["RACE_KEY", "target"]
        grouped = df.groupby(group_keys)
        # Only the mean and std of prob are needed, so aggregate just those.
        stats_df = grouped['prob'].agg(['mean', 'std']).reset_index()
        merge_df = pd.merge(df, stats_df, on=group_keys)
        merge_df['predict_std'] = (
            merge_df['prob'] - merge_df['mean']) / merge_df['std'] * 10 + 50
        df['predict_rank'] = grouped['prob'].rank(method="dense", ascending=False)
        merge_df = pd.merge(
            merge_df,
            df[["RACE_KEY", "UMABAN", "predict_rank", "target"]],
            on=["RACE_KEY", "UMABAN", "target"])
        return_df = merge_df[[
            'RACE_KEY', 'UMABAN', 'pred', 'prob', 'predict_std',
            'predict_rank', "target", "target_date"
        ]]
        return return_df
y_train = np.loadtxt('y_train_clas.csv', delimiter=',', skiprows=1)[:, 1] X_data_test = np.loadtxt('X_test_clas.csv', delimiter=',', skiprows=1) ''' Optional Hyperparameter tuning: pipeline = make_pipeline(ExtraTreesClassifier()) # Declare hyperparameters to tune hyperparameters = {'extratreesclassifier__random_state': range(0,50,1), 'extratreesclassifier__n_estimators' : range(60,70,1), 'extratreesclassifier__max_features' : [None, 'sqrt', 'log2'], 'extratreesclassifier__max_depth' : [None, 4, 5, 6, 7, 8, 10]} # Tune model using cross-validation #clextr = RandomizedSearchCV(pipeline, hyperparameters, n_iter=1000) ''' ## fitting the model clextr = ExtraTreesClassifier(random_state=22) # Fit the model for the data, y_train) y_predict = clextr.predict(X_data_test) # store data into the csv file test_header = "Id,EpiOrStroma" n_points = X_data_test.shape[0] y_predict_pp = np.ones((n_points, 2)) y_predict_pp[:, 0] = range(n_points) y_predict_pp[:, 1] = y_predict np.savetxt('clas_et_submission.csv', y_predict_pp, fmt='%d', delimiter=",", header=test_header,
feature_set_test.append(feature_extraction(Xtest[i][j])) feature_sets_train = np.array(feature_set_train) feature_sets_test = np.array(feature_set_test) print("Loading Feature Set Matrix...") print("FeatureSet Train: ", feature_sets_train.shape) print("FeatureSet Test: ", feature_sets_test.shape) # In[8]: ytrain = ytrain.reshape(-1, ) ytest = ytest.reshape(-1, ) # print ("ytrain Reshaped!") # In[9]: Emodel = ExtraTreesClassifier(n_estimators=150, random_state=5047), ytrain) # In[10]: t1 = time() pred = Emodel.predict(feature_sets_test[0].reshape(1, -1)) print("Running the Classifier, Sony Dependent mode... ") print("Predicted Label: ", pred[0]) t2 = time() print("Time taken per prediction (in sec): ", t2 - t1) # In[ ]: