print(le_sex.classes_)
titanic_train['Sex'] = le_sex.transform(titanic_train['Sex'])

le_pclass = preprocessing.LabelEncoder()
le_pclass.fit(titanic_train['Pclass'])
print(le_pclass.classes_)
titanic_train['Pclass'] = le_pclass.transform(titanic_train['Pclass'])

features = ['Pclass', 'Parch', 'SibSp', 'Age', 'Fare', 'Embarked', 'Sex']
X_train = titanic_train[features]
y_train = titanic_train['Survived']

dt_estimator = tree.DecisionTreeClassifier()
ada_estimator = ensemble.AdaBoostClassifier(base_estimator=dt_estimator)
ada_grid = {'n_estimators': [10, 50, 100, 200],
            'base_estimator__max_depth': [3, 4, 5, 6, 7],
            'learning_rate': [0.1, 0.5, 1]}
ada_grid_estimator = model_selection.GridSearchCV(ada_estimator, ada_grid, cv=10,
                                                  return_train_score=True)
ada_grid_estimator.fit(X_train, y_train)
print(ada_grid_estimator.best_score_)
print(ada_grid_estimator.best_params_)

final_estimator = ada_grid_estimator.best_estimator_
final_estimator.score(X_train, y_train)
print(final_estimator.estimators_)

# try AdaBoost with a KNN base model

# read test data
titanic_test = pd.read_csv("Titanic_test.csv")
print(titanic_test.info())
titanic_test[imputable_cont_features] = cont_imputer.transform(titanic_test[imputable_cont_features])
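# A note on the "AdaBoost with a KNN base" idea above: AdaBoostClassifier
# reweights training samples each round, so the base estimator must accept a
# sample_weight argument in fit(). KNeighborsClassifier does not, so
# AdaBoostClassifier(base_estimator=KNeighborsClassifier()) raises a ValueError.
# A minimal sketch of one common alternative, bagging the KNN instead (my
# addition, not the original author's code):
from sklearn import ensemble, neighbors

knn_bag = ensemble.BaggingClassifier(
    base_estimator=neighbors.KNeighborsClassifier(n_neighbors=5),
    n_estimators=50, random_state=2017)
# knn_bag.fit(X_train, y_train); knn_bag.score(X_train, y_train)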
train_updtd, test_updtd = fileSplit(all_houses_onehot, train.shape[0])
y_train = train_updtd['SalePrice']
filterFeatures(train_updtd, ['SalePrice', 'log_sale_price'])
X_train = train_updtd
X_train.info()

def rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_error(y_orig, y_pred))

gbm_estimator = ensemble.GradientBoostingRegressor(random_state=2017)
gbm_grid = {'learning_rate': [0.1, 0.3, 0.5, 0.7, 0.9],
            'n_estimators': [50, 100],
            'max_features': [12, 13, 14, 15, 16, 17, 18, 19, 20]}
# RMSE is an error measure, so tell GridSearchCV that lower is better;
# without greater_is_better=False the search would pick the *worst* model
grid_gbm_estimator = model_selection.GridSearchCV(
    gbm_estimator, gbm_grid,
    scoring=metrics.make_scorer(rmse, greater_is_better=False),
    cv=10, n_jobs=1)
grid_gbm_estimator.fit(X_train, y_train)
print(grid_gbm_estimator.cv_results_)   # grid_scores_ was removed in scikit-learn 0.20
print(grid_gbm_estimator.best_params_)
print(grid_gbm_estimator.best_score_)
print(grid_gbm_estimator.score(X_train, y_train))
estimator = grid_gbm_estimator.best_estimator_

################## Final Predictions Preparation
#total_missing_test = test_updtd.isnull().sum()
#n_test = test.shape[0]
#to_delete_test = total_missing_test[(total_missing_test/n_test) > 0]
#missingDataFeatures_test = list(to_delete_test.index)
#test_updtd.info()
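# Equivalent and arguably simpler than the hand-rolled rmse scorer: scikit-learn
# ships a built-in negated-MSE scorer (scores are negated because GridSearchCV
# always maximizes). A sketch reusing the estimator and grid above:
grid_gbm_neg = model_selection.GridSearchCV(
    gbm_estimator, gbm_grid, scoring='neg_mean_squared_error', cv=10)
# grid_gbm_neg.fit(X_train, y_train)
# math.sqrt(-grid_gbm_neg.best_score_)   # recover RMSE from the negated MSE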
axis=1, inplace=False)

# See how many columns remain after the 3 additional columns, one-hot encoding and dropping
titanic2.shape

X_train = titanic2[0:titanic_train.shape[0]]
X_train.shape
X_train.info()
y_train = titanic_train['Survived']

# Let's build the model.
# Without the random_state parameter the system can pick a different seed each
# time, so accuracy may differ slightly between runs.
tree_estimator = tree.DecisionTreeClassifier(random_state=2017)
dt_grid = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(3, 10))}
grid_tree_estimator = model_selection.GridSearchCV(tree_estimator, dt_grid, cv=10)
grid_tree_estimator.fit(X_train, y_train)
print(grid_tree_estimator.best_score_)    # best CV score
print(grid_tree_estimator.best_params_)
print(grid_tree_estimator.score(X_train, y_train))

dt_grid2 = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(6, 10))}
grid_tree_estimator2 = model_selection.GridSearchCV(tree_estimator, dt_grid2, cv=8)
grid_tree_estimator2.fit(X_train, y_train)
print(grid_tree_estimator2.best_score_)   # best CV score
print(grid_tree_estimator2.best_params_)
print(grid_tree_estimator2.score(X_train, y_train))
ohe.fit(titanic_train[ohe_features])
print(ohe.categories_)   # n_values_ was removed from OneHotEncoder in scikit-learn 0.24
tmp1 = ohe.transform(titanic_train[ohe_features]).toarray()

features = ['Age', 'Fare', 'Parch', 'SibSp', 'FamilySize']
tmp2 = titanic_train[features].values
X_train = np.concatenate((tmp1, tmp2), axis=1)
y_train = titanic_train['Survived']

# create an estimator
dt_estimator = tree.DecisionTreeClassifier(random_state=100)
dt_grid = {'max_depth': [3, 4, 5, 6, 7], 'criterion': ['entropy', 'gini']}
dt_grid_estimator = model_selection.GridSearchCV(dt_estimator, dt_grid,
                                                 scoring='accuracy', cv=10, refit=True)
dt_grid_estimator.fit(X_train, y_train)

# explore the results of the grid-search estimator
print(dt_grid_estimator.cv_results_)
print(dt_grid_estimator.best_estimator_)
print(dt_grid_estimator.best_score_)
print(dt_grid_estimator.best_params_)

# visualize the final model built with the best parameters in the grid
best_dt_estimator = dt_grid_estimator.best_estimator_
print(best_dt_estimator.score(X_train, y_train))

# read test data
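# Before transforming the test data: on newer scikit-learn, OneHotEncoder can
# be told to ignore categories it never saw during fit, which keeps the train
# and test column counts aligned. A minimal sketch (my addition, reusing the
# same ohe_features list):
from sklearn import preprocessing

ohe_new = preprocessing.OneHotEncoder(handle_unknown='ignore')
ohe_new.fit(titanic_train[ohe_features])
print(ohe_new.categories_)   # one array of levels per input column
tmp1_new = ohe_new.transform(titanic_train[ohe_features]).toarray()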
        'nthread': [-1]
    }
})
# param_grids.update({
#     'mlp': {'solver': ['lbfgs'], 'alpha': [1e-5],
#             'hidden_layer_sizes': [(15,)], 'random_state': [1]}
# })

# models wrapped with hyperparameter grid search
model_grids = {}
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
for name, param in param_grids.items():
    model_grids[name] = model_selection.GridSearchCV(models[name], param, n_jobs=-1,
                                                     cv=kfold, verbose=1, scoring='f1')
    # model_grids[name] = models[name]


def read_data():
    data1 = pd.read_csv(os.path.join(data_root, '基础数据.csv'), encoding='GB2312')
    data2 = pd.read_csv(os.path.join(data_root, '年数据.csv'), encoding='GB2312')
    # print(data2)
    # reader3 = pd.read_table(
    #     os.path.join(data_root, '日数据.csv'),
    #     encoding='GB2312',
    #     sep=',',
    #     iterator=True)
    # chunks = []
voting_estimator = ensemble.VotingClassifier(
    estimators=[('dt', dt_estimator), ('rf', rf_estimator), ('ada', ada_estimator)],
    voting='soft', weights=[4, 4, 5])
voting_grid = {'dt__max_depth': [3, 5, 7],
               'rf__n_estimators': [20, 30],
               'rf__max_features': [7, 8],
               'rf__max_depth': [7, 8, 9],
               'ada__n_estimators': [50]}
voting_grid_estimator = model_selection.GridSearchCV(voting_estimator, voting_grid,
                                                     cv=10, n_jobs=5)
voting_grid_estimator.fit(X_train, y_train)
print(voting_grid_estimator.cv_results_)   # grid_scores_ was removed in scikit-learn 0.20
print(voting_grid_estimator.best_score_)
print(voting_grid_estimator.best_params_)
print(voting_grid_estimator.score(X_train, y_train))

x_test = train3[train.shape[0]:]
x_test.shape
test['type'] = voting_grid_estimator.predict(x_test)
test.to_csv("Submission.csv", columns=['id', 'type'], index=False)
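# The 'dt__', 'rf__' and 'ada__' prefixes in voting_grid come from the names
# given in the estimators list. A quick sketch (my addition) for discovering
# the legal parameter keys of any composite estimator:
for key in sorted(voting_estimator.get_params().keys()):
    print(key)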
    #svm.SVC(kernel='rbf', gamma=0.7),
    #svm.SVC(kernel='poly', degree=3)
]

param_grid = [{
    'reduce_dim__n_components': N_FEATURES_OPTIONS,
    'classify__C': C_OPTIONS
    #'classify__gamma': gamma_range
}]
reducer_labels = ['PCA', 'NMF']
classifier_labels = ['SVClinear']
#classifier_labels = ['SVClinear', 'SVCrbf', 'SVCpoly']
#classifier_labels = ['SVClinear', 'LinearSVC', 'SVCrbf', 'SVCpoly']

grid = model_selection.GridSearchCV(cached_pipe, cv=5, n_jobs=1, param_grid=param_grid)
grid.fit(X_train, y_train)
joblib.dump(grid, 'grid.set3.pkl')
grid = joblib.load('grid.set3.pkl')

y_predictions = grid.predict(X_test)
report = metrics.classification_report(y_test, y_predictions)
print(report)

print("Best parameters set found on development set:")
print()
print(grid.best_params_)
print("With a Best Score of:")
# I use a Power Tuned Random Forest Classifier where the tuned parameter is 'n_estimators'.
# For this, I create a vector "x" of candidate values for 'n_estimators'.

# In[ ]:

x = list(range(1, 101))

# I set up the model and the parameters to optimize with GridSearchCV

# In[ ]:

model = RandomForestClassifier(oob_score=True)
parameters = {'n_estimators': x}
power_tuning = model_selection.GridSearchCV(model, parameters)
# Note: GridSearchCV fits clones internally, so fitting `model` itself is not
# required for the search; the original call is kept here for reference.
model.fit(data_train, label_train)

# In[ ]:

model_tuned = power_tuning.fit(data_train, label_train.Survived)

# In[ ]:

model_tuned.best_estimator_

# In[ ]:

print("The best parameter for 'n_estimators' is:", model_tuned.best_estimator_.n_estimators)
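# In[ ]:

# Instead of a full grid over 100 values of n_estimators, the forest's own
# out-of-bag score can be tracked while trees are added. A sketch using
# warm_start (my assumption, not the notebook author's approach):
oob_curve = []
rf = RandomForestClassifier(oob_score=True, warm_start=True, random_state=0)
for n in range(10, 101, 10):
    rf.set_params(n_estimators=n)   # warm_start keeps previously grown trees
    rf.fit(data_train, label_train.Survived)
    oob_curve.append((n, rf.oob_score_))
print(oob_curve)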
os.chdir("E:/") titanic_train = pd.read_csv("train.csv") #EDA titanic_train.shape titanic_train.info() titanic_train1 = pd.get_dummies(titanic_train, columns=['Pclass', 'Sex', 'Embarked']) titanic_train1.shape titanic_train1.info() titanic_train1.head(6) X_train = titanic_train1.drop(['PassengerId','Age','Cabin','Ticket', 'Name','Survived'], 1) y_train = titanic_train['Survived'] #automate model tuning process. use grid search method dt = tree.DecisionTreeClassifier() param_grid = {'criterion':['entropy'],'max_depth':[3,4,5,6,7,8,9,10], 'min_samples_split':[7,8,9,10,11,12]} dt_grid = model_selection.GridSearchCV(dt, param_grid, cv=10, n_jobs=5) dt_grid.fit(X_train, y_train) dt_grid.grid_scores_ final_model = dt_grid.best_estimator_ dt_grid.best_score_ dt_grid.score(X_train, y_train) dot_data = io.StringIO() tree.export_graphviz(final_model, out_file = dot_data, feature_names = X_train.columns) graph = pydot.graph_from_dot_data(dot_data.getvalue())[0] graph.write_pdf("decisiont-tree-tuned1.pdf")
def basic_results(clf, classes, training_x, training_y, test_x, test_y, params,
                  clf_type=None, dataset=None, dataset_readable_name=None,
                  balanced_dataset=False, best_params=None, seed=55, threads=1):
    logger.info("Computing basic results for {} ({} thread(s))".format(clf_type, threads))
    if clf_type is None or dataset is None:
        raise Exception('clf_type and dataset are required')
    if seed is not None:
        np.random.seed(seed)

    curr_scorer = scorer
    if not balanced_dataset:
        curr_scorer = f1_scorer

    if best_params:
        clf.fit(training_x, training_y)
        test_score = clf.score(test_x, test_y)
        cv = clf
        final_params = best_params   # params actually used, for the results log below
    else:
        cv = ms.GridSearchCV(clf, n_jobs=threads, param_grid=params, refit=True,
                             verbose=10, cv=5, scoring=curr_scorer)
        cv.fit(training_x, training_y)
        reg_table = pd.DataFrame(cv.cv_results_)
        reg_table.to_csv('{}/{}_{}_reg.csv'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                         index=False)
        test_score = cv.score(test_x, test_y)
        # TODO: Ensure this is an estimator that can handle this?
        best_estimator = cv.best_estimator_.fit(training_x, training_y)
        final_estimator = best_estimator._final_estimator
        grid_best_params = pd.DataFrame([final_estimator.get_params()])
        grid_best_params.to_csv('{}/{}_{}_best_params.csv'.format(
            OUTPUT_DIRECTORY, clf_type, dataset), index=False)
        logger.info(" - Grid search complete")

        final_estimator.write_visualization('{}/images/{}_{}_LC'.format(
            OUTPUT_DIRECTORY, clf_type, dataset))
        final_params = cv.best_params_

    test_y_predicted = cv.predict(test_x)
    cnf_matrix = confusion_matrix(test_y, test_y_predicted)
    np.set_printoptions(precision=2)
    plt = plot_confusion_matrix(cnf_matrix, classes,
                                title='Confusion Matrix: {} - {}'.format(
                                    clf_type, dataset_readable_name))
    plt.savefig('{}/images/{}_{}_CM.png'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                format='png', dpi=150, bbox_inches='tight')
    plt = plot_confusion_matrix(cnf_matrix, classes, normalize=True,
                                title='Normalized Confusion Matrix: {} - {}'.format(
                                    clf_type, dataset_readable_name))
    plt.savefig('{}/images/{}_{}_NCM.png'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                format='png', dpi=150, bbox_inches='tight')
    logger.info(" - Visualization complete")

    with open('{}/test results.csv'.format(OUTPUT_DIRECTORY), 'a') as f:
        ts = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
        f.write('"{}",{},{},{},"{}"\n'.format(ts, clf_type, dataset, test_score,
                                              final_params))

    n = training_y.shape[0]
    train_sizes = np.append(np.linspace(0.05, 0.1, 20, endpoint=False),
                            np.linspace(0.1, 1, 20, endpoint=True))
    logger.info(" - n: {}, train_sizes: {}".format(n, train_sizes))
    train_sizes, train_scores, test_scores = ms.learning_curve(
        clf if best_params is not None else cv.best_estimator_,
        training_x, training_y, cv=5, train_sizes=train_sizes, verbose=10,
        scoring=curr_scorer, n_jobs=threads, random_state=seed)
    logger.info(" - n: {}, train_sizes: {}".format(n, train_sizes))
    curve_train_scores = pd.DataFrame(index=train_sizes, data=train_scores)
    curve_test_scores = pd.DataFrame(index=train_sizes, data=test_scores)
    curve_train_scores.to_csv('{}/{}_{}_LC_train.csv'.format(OUTPUT_DIRECTORY, clf_type, dataset))
    curve_test_scores.to_csv('{}/{}_{}_LC_test.csv'.format(OUTPUT_DIRECTORY, clf_type, dataset))
    plt = plot_learning_curve('Learning Curve: {} - {}'.format(clf_type, dataset_readable_name),
                              train_sizes, train_scores, test_scores)
    plt.savefig('{}/images/{}_{}_LC.png'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                format='png', dpi=150)
    logger.info(" - Learning curve complete")
    return cv
ddf1.sample(10)

X = ddf1[["SEX", "p1", "p2", "p3", "agecat"]]
y = ddf1["dpay"]

##### grid search
X.info()
y.info()

depth = list(range(3, 11))   # unused; kept from the original
grid = {'max_depth': np.arange(3, 10),
        'criterion': ['gini', 'entropy'],
        'max_leaf_nodes': [5, 10, 20, 100],
        'min_samples_split': [2, 5, 10, 20]}
model = tree.DecisionTreeClassifier()
gmodel = model_selection.GridSearchCV(model, grid, scoring="recall")
gmodel.fit(X, y)
best = gmodel.best_estimator_
best.fit(X, y)
print(dict(zip(X.columns, best.feature_importances_)))

X = ddf1[["SEX", "edu", "mar", "agecat", "p1", "p2", "p3", "p4", "p5", "p6"]]
y = ddf1["dpay"]
model = linear_model.LogisticRegression()
rfe = feature_selection.RFE(best, 15)   # 15 exceeds the 10 columns in X, so all features are kept
fit = rfe.fit(X, y)
print("Num Features: %d" % fit.n_features_)
def iteration_lc(clf, training_x, training_y, test_x, test_y, params,
                 clf_type=None, dataset=None, dataset_readable_name=None,
                 balanced_dataset=False, x_scale='linear', seed=55, threads=1):
    logger.info("Building iteration learning curve for params {} ({} threads)".format(
        params, threads))
    if clf_type is None or dataset is None:
        raise Exception('clf_type and dataset are required')
    if seed is not None:
        np.random.seed(seed)

    curr_scorer = scorer
    acc_method = balanced_accuracy
    if not balanced_dataset:
        curr_scorer = f1_scorer
        acc_method = f1_accuracy

    cv = ms.GridSearchCV(clf, n_jobs=threads, param_grid=params, refit=True,
                         verbose=10, cv=5, scoring=curr_scorer)
    cv.fit(training_x, training_y)
    reg_table = pd.DataFrame(cv.cv_results_)
    reg_table.to_csv('{}/ITER_base_{}_{}.csv'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                     index=False)

    d = defaultdict(list)
    name = list(params.keys())[0]
    for value in list(params.values())[0]:
        d['param_{}'.format(name)].append(value)
        clf.set_params(**{name: value})
        clf.fit(training_x, training_y)
        pred = clf.predict(training_x)
        d['train acc'].append(acc_method(training_y, pred))
        clf.fit(training_x, training_y)
        pred = clf.predict(test_x)
        d['test acc'].append(acc_method(test_y, pred))
        logger.info(' - {}'.format(value))
    d = pd.DataFrame(d)
    d.to_csv('{}/ITERtestSET_{}_{}.csv'.format(OUTPUT_DIRECTORY, clf_type, dataset),
             index=False)

    plt = plot_learning_curve('{} - {} ({})'.format(clf_type, dataset_readable_name, name),
                              d['param_{}'.format(name)], d['train acc'], d['test acc'],
                              multiple_runs=False, x_scale=x_scale, x_label='Value')
    plt.savefig('{}/images/{}_{}_ITER_LC.png'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                format='png', dpi=150)
    logger.info(" - Iteration learning curve complete")
    return cv
    'feature': X_train.columns,
    'importance': rf.feature_importances_
})
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
# How to display the plot in Spyder:
# Menu Bar --> Tools --> Preferences --> IPython Console --> Graphics tab -->
# Graphics backend --> change the backend to Automatic
features.plot(kind='barh', figsize=(30, 30))

# threshold controls how many features are selected;
# prefit means the model was already fit
X_train.shape
fs = feature_selection.SelectFromModel(rf, threshold='median', prefit=True)
X_train1 = fs.transform(X_train)
X_train1.shape
type(X_train1)

# build a model using the selected features
bagged_tree_estimator = ensemble.RandomForestClassifier(random_state=100, oob_score=True)
bagged_tree_grid = {'n_estimators': list(range(10, 11, 10))}   # i.e. just [10]
grid_bagged_tree_estimator = model_selection.GridSearchCV(
    bagged_tree_estimator, bagged_tree_grid, cv=10)
grid_bagged_tree_estimator.fit(X_train1, y_train)
final_model = grid_bagged_tree_estimator.best_estimator_
#print(final_model.oob_score_)
print(grid_bagged_tree_estimator.best_score_)
print(grid_bagged_tree_estimator.score(X_train1, y_train))   # may be an over-fitted score
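# SelectFromModel returns a bare ndarray, so the column names are lost. A small
# sketch (my addition) for recovering which features survived the
# median-importance threshold:
mask = fs.get_support()          # boolean mask over X_train's columns
selected = X_train.columns[mask]
print(list(selected))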
def basic_results(clf, classes, training_x, training_y, test_x, test_y, params,
                  clf_type=None, dataset=None, dataset_readable_name=None,
                  seed=55, threads=1):
    print("Computing basic results ({} thread(s))".format(threads))
    if clf_type is None or dataset is None:
        raise Exception('clf_type and dataset are required')
    if seed is not None:
        np.random.seed(seed)

    cv = ms.GridSearchCV(clf, n_jobs=threads, param_grid=params, refit=True,
                         verbose=10, cv=5, scoring=scorer)
    cv.fit(training_x, training_y)
    reg_table = pd.DataFrame(cv.cv_results_)
    reg_table.to_csv('{}/{}_{}_reg.csv'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                     index=False)
    test_score = cv.score(test_x, test_y)

    # TODO: Ensure this is an estimator that can handle this?
    best_estimator = cv.best_estimator_.fit(training_x, training_y)
    final_estimator = best_estimator._final_estimator
    best_params = pd.DataFrame([final_estimator.get_params()])
    best_params.to_csv('{}/{}_{}_best_params.csv'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                       index=False)
    print(" - Grid search complete")

    final_estimator.write_visualization('{}/images/{}_{}_LC'.format(
        OUTPUT_DIRECTORY, clf_type, dataset))

    test_y_predicted = cv.predict(test_x)
    cnf_matrix = confusion_matrix(test_y, test_y_predicted)
    np.set_printoptions(precision=2)
    plt = plot_confusion_matrix(cnf_matrix, classes,
                                title='Confusion Matrix: {} - {}'.format(
                                    clf_type, dataset_readable_name))
    plt.savefig('{}/images/{}_{}_CM.png'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                format='png', dpi=150, bbox_inches='tight')
    plt = plot_confusion_matrix(cnf_matrix, classes, normalize=True,
                                title='Normalized Confusion Matrix: {} - {}'.format(
                                    clf_type, dataset_readable_name))
    plt.savefig('{}/images/{}_{}_NCM.png'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                format='png', dpi=150, bbox_inches='tight')
    print(" - Visualization complete")

    with open('{}/test results.csv'.format(OUTPUT_DIRECTORY), 'a') as f:
        f.write('{},{},{},{}\n'.format(clf_type, dataset, test_score, cv.best_params_))

    n = training_y.shape[0]
    # TODO: Is the range here dependent on the dataset?
    train_sizes = list(map(int, np.geomspace(max(n * 0.05, 50), n * 0.79,
                                             num=20, endpoint=True)))
    train_sizes, train_scores, test_scores = ms.learning_curve(
        cv.best_estimator_, training_x, training_y, cv=5, train_sizes=train_sizes,
        verbose=10, scoring=scorer, n_jobs=threads)
    curve_train_scores = pd.DataFrame(index=train_sizes, data=train_scores)
    curve_test_scores = pd.DataFrame(index=train_sizes, data=test_scores)
    curve_train_scores.to_csv('{}/{}_{}_LC_train.csv'.format(OUTPUT_DIRECTORY, clf_type, dataset))
    curve_test_scores.to_csv('{}/{}_{}_LC_test.csv'.format(OUTPUT_DIRECTORY, clf_type, dataset))
    plt = plot_learning_curve('Learning Curve: {} - {}'.format(clf_type, dataset_readable_name),
                              train_sizes, train_scores, test_scores)
    plt.savefig('{}/images/{}_{}_LC.png'.format(OUTPUT_DIRECTORY, clf_type, dataset),
                format='png', dpi=150)
    print(" - Learning curve complete")
    return cv
    event = spec.flatten()
    data.append(event)
    target.append(datafile[key].attrs['label'])

data = np.array(data)
target = np.array(target)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    data, target, test_size=.3)

begin = time.time()
print('Training...')
spectrogram = int(sys.argv[3])
if grid_search:
    param_grid = {'C': [.1, 1, 10, 100], 'gamma': [1, .1, .01, .0001]}
    # iid=False silenced an old deprecation warning; the parameter was removed
    # entirely in scikit-learn 0.24, so drop it on newer versions
    grid = model_selection.GridSearchCV(svm.SVC(), param_grid, refit=True, cv=5, iid=False)
    grid.fit(X_train, y_train)
    print(grid.best_estimator_)
else:
    clf = svm.SVC(kernel='linear')
    clf.fit(X_train, y_train)
end = time.time()
print('Training completed in {}'.format(end - begin))

if grid_search:
    y_pred = grid.predict(X_test)
else:
    y_pred = clf.predict(X_test)
# initiate the neural network (the original comment said "knn", but this is an MLP)
nn = neural_network.MLPClassifier(random_state=123)

# save the parameter features to tune as a dictionary
params = {
    'hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,)],
    'activation': ['logistic', 'tanh', 'relu'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'max_iter': [500]
}

# initiate the tuning procedure, optimising on accuracy
tunenn = model_selection.GridSearchCV(estimator=nn, param_grid=params, scoring='accuracy')

# tune the model
tunenn.fit(X_train, y_train)

# extract the best score
tunenn.best_score_

# extract the best estimator
tunenn.best_estimator_

# extract the best parameters
tunenn.best_params_

# explicitly initiate the tuned model
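# One way to "explicitly initiate the tuned model" as the comment above
# intends (a sketch; note that tunenn.best_estimator_ is already refit on the
# full training data when refit=True, so this only builds a fresh copy):
tuned_nn = neural_network.MLPClassifier(random_state=123, **tunenn.best_params_)
tuned_nn.fit(X_train, y_train)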
ycols = ['class']
xcols = list(set(df.columns) - set(ycols))
X = df.loc[:, xcols].values
y = np.ravel(df.loc[:, ycols].values)

# specify cross-validation
k = 10
cvsplitter = sm.KFold(n_splits=k, shuffle=True, random_state=0)

# array of hyperparameter values to test
alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
param_grid = {'alpha': alphas}

# create the model and execute grid search
model = sl.Ridge()
search = sm.GridSearchCV(estimator=model, param_grid=param_grid)
search.fit(X, y)
print('Grid search score of best hyperparameter value: {0:.4f}'.format(search.best_score_))
print('Grid search best hyperparameter value: {0:.4f}'.format(search.best_estimator_.alpha))
print('Grid search best hyperparameter values:')
for tpl in search.best_params_.items():
    print('  {0:<10}: {1:.6f}'.format(tpl[0], tpl[1]))
print('')

# use randomized search by selecting hyperparameter values randomly from a
# uniform distribution 100 times
param_grid = {'alpha': stats.uniform()}
search = sm.RandomizedSearchCV(estimator=model, param_distributions=param_grid,
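# The randomized-search call above is cut off mid-statement; a hedged
# completion (n_iter=100 matches the "100 times" comment; the cv splitter and
# random_state are my assumptions, not recovered from the source):
search = sm.RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                               n_iter=100, cv=cvsplitter, random_state=0)
search.fit(X, y)
print('Randomized search best hyperparameter value: {0:.4f}'.format(
    search.best_estimator_.alpha))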
def model(X, Y):
    # MLA = [
    #     # ensemble models
    #     ensemble.AdaBoostRegressor(),
    #     ensemble.GradientBoostingRegressor(),
    #     ensemble.ExtraTreesRegressor(),
    #     # GLM
    #     linear_model.SGDRegressor(),
    #     # SVM
    #     svm.NuSVR(),
    #     svm.SVR(),
    #     # xgboost
    #     XGBRegressor()
    # ]

    # data split
    split = model_selection.ShuffleSplit(n_splits=10, random_state=0)

    grid_n_estimator = [50, 100, 300, 500, 800, 1000]
    grid_ratio = [.1, .25, .5, .75, 1.0]
    grid_learn = [.01, .03, .05, .1, .25]
    grid_max_depth = [2, 4, 6, 8, 10, None]
    #grid_criterion = ['gini', 'entropy']
    grid_seed = [0]
    n_jobs = 2

    vote_est = [
        # Ensemble methods: http://scikit-learn.org/stable/modules/ensemble.html
        ('ada', ensemble.AdaBoostRegressor()),
        ('bc', ensemble.BaggingRegressor(n_jobs=n_jobs)),
        ('gbc', ensemble.GradientBoostingRegressor()),
        ('rfc', ensemble.RandomForestRegressor(n_jobs=n_jobs)),
        ('etc', ensemble.ExtraTreesRegressor(n_jobs=n_jobs)),
        # SVM: http://scikit-learn.org/stable/modules/svm.html
        ('svc', svm.SVR()),
        # MLPRegressor
        ('mlp', neural_network.MLPRegressor(hidden_layer_sizes=(50, 100, 20),
                                            max_iter=1000)),
        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        ('xgb', XGBRegressor())
    ]

    grid_param = [
        [{  # AdaBoostRegressor - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
            'n_estimators': grid_n_estimator,  # default=50
            'learning_rate': grid_learn,       # default=1
            #'algorithm': ['SAMME', 'SAMME.R'],  # default='SAMME.R'
            'random_state': grid_seed
        }],
        [{  # BaggingRegressor - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
            'n_estimators': grid_n_estimator,  # default=10
            'max_samples': grid_ratio,         # default=1.0
            'random_state': grid_seed
        }],
        [{  # GradientBoostingRegressor - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
            #'loss': ['deviance', 'exponential'],  # default='deviance'
            # 12/31/17: learning_rate and n_estimators pinned to reduce runtime --
            # the best parameters were {'learning_rate': 0.05, 'max_depth': 2,
            # 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds
            'learning_rate': [.05],   # default=0.1
            'n_estimators': [300],    # default=100
            #'criterion': ['friedman_mse', 'mse', 'mae'],  # default='friedman_mse'
            'max_depth': grid_max_depth,  # default=3
            'random_state': grid_seed
        }],
        [{  # RandomForestRegressor - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
            'n_estimators': grid_n_estimator,  # default=10
            #'criterion': grid_criterion,      # default='gini'
            'max_depth': grid_max_depth,       # default=None
            # 12/31/17: oob_score pinned to reduce runtime -- the best parameters were
            # {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100,
            # 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds
            'oob_score': [True],  # default=False
            'random_state': grid_seed
        }],
        [{  # ExtraTreesRegressor - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
            'n_estimators': grid_n_estimator,  # default=10
            #'criterion': grid_criterion,      # default='gini'
            'max_depth': grid_max_depth,       # default=None
            'random_state': grid_seed
        }],
        [{  # SVR - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
            # http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
            #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'C': [1, 2, 3, 4, 5],   # default=1.0
            'gamma': grid_ratio,    # default: auto
        }],
        [{  # MLPRegressor
            'activation': ['logistic', 'relu'],
            'random_state': grid_seed,
        }],
        [{  # XGBRegressor - http://xgboost.readthedocs.io/en/latest/parameter.html
            'learning_rate': grid_learn,        # default: .3
            'max_depth': [1, 2, 4, 6, 8, 10],   # default: 2
            'n_estimators': grid_n_estimator,
            'seed': grid_seed
        }]
    ]

    for rlf, param in zip(vote_est, grid_param):
        best_search = model_selection.GridSearchCV(
            estimator=rlf[1], param_grid=param, cv=split,
            scoring='neg_mean_squared_error', n_jobs=n_jobs,
            return_train_score=True)   # needed for mean_train_score below
        best_search.fit(X, Y)
        best_param = best_search.best_params_
        best_index = best_search.best_index_
        train_best_score = best_search.cv_results_['mean_train_score'][best_index]
        test_best_score = best_search.best_score_
        print('The best parameter for {} is {}.'.format(rlf[1].__class__.__name__, best_param))
        print('The train best score {:.3f}'.format(train_best_score))
        print('The test best score {:.3f}'.format(test_best_score))
        rlf[1].set_params(**best_param)

    return vote_est
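# vote_est comes back with each estimator already set to its tuned parameters.
# A sketch of combining the tuned list into a single ensemble -- my assumption
# about the intended next step, not code from the source (VotingRegressor
# requires scikit-learn >= 0.21; X, Y are the training matrices used above):
vote_est = model(X, Y)
voter = ensemble.VotingRegressor(estimators=vote_est)
voter.fit(X, Y)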
], axis=1, inplace=True)

# split the titanic data into train and test
X_train = titanic_all1.iloc[0:891]
X_train.shape
X_train.info()
y_train = titanic_all['Survived'].iloc[0:891]

parameter_grid = dict(n_estimators=[300, 400],
                      criterion=['gini', 'entropy'],
                      max_features=[3, 4, 5, 6, 7, 8])
rf_estimator = ensemble.RandomForestClassifier(random_state=100)
rf_grid_estimator = model_selection.GridSearchCV(estimator=rf_estimator,
                                                 param_grid=parameter_grid,
                                                 cv=10, verbose=1, n_jobs=10, refit=True)
rf_grid_estimator.fit(X_train, y_train)
rf_grid_estimator.cv_results_   # grid_scores_ was removed in scikit-learn 0.20

X_test = titanic_all1.iloc[891:1309]
# the number of features in the test data mismatches the train data
titanic_test['Survived'] = rf_grid_estimator.predict(X_test)
titanic_test['Survived'] = titanic_test['Survived'].map(lambda x: int(x))
titanic_test.to_csv("submission.csv", columns=['PassengerId', 'Survived'], index=False)
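# Regarding the feature-mismatch comment above: when train and test frames are
# dummy-encoded separately, a common fix (a sketch, not the original author's
# code) is to reindex the test frame against the training columns so the dummy
# columns line up, filling absent ones with zero:
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)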
def get_top_n_features(X, Y, top_n_features, col):
    # random forest
    rf_est = RandomForestClassifier(random_state=0)
    rf_param_grid = {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param_grid, n_jobs=25, cv=10,
                                           verbose=1, scoring="recall")
    rf_grid.fit(X, Y)
    print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
    print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
    print('Top N Features RF Train Score:' + str(rf_grid.score(X, Y)))
    feature_imp_sorted_rf = pd.DataFrame({
        'feature': col,
        'importance': rf_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    rf_1 = feature_imp_sorted_rf[:10]
    rf_2 = 100 * feature_imp_sorted_rf[:10]['importance']
    #print(rf_2)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
    print('Sample 10 Features from RF Classifier')
    print(str(features_top_n_rf[:10]))
    '''
    pos = np.arange(rf_2.shape[0]) + 0.5
    plt.figure(1, figsize=(18, 8))
    plt.subplot(121)
    plt.barh(pos, rf_1['importance'][::-1])
    plt.yticks(pos, rf_1['feature'][::-1])
    plt.xlabel('Relative Importance')
    plt.title('RandomForest Feature Importance')
    '''

    # AdaBoost
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param_grid, n_jobs=25, cv=10,
                                            verbose=1, scoring="recall")
    ada_grid.fit(X, Y)
    print('Top N Features Best Ada Params:' + str(ada_grid.best_params_))
    print('Top N Features Best Ada Score:' + str(ada_grid.best_score_))
    print('Top N Features Ada Train Score:' + str(ada_grid.score(X, Y)))
    feature_imp_sorted_ada = pd.DataFrame({
        'feature': col,
        'importance': ada_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    '''
    ada_1 = feature_imp_sorted_ada[:10]
    ada_2 = 100 * feature_imp_sorted_ada[:10]['importance']
    #plt.figure(1, figsize=(18, 8))
    plt.subplot(122)
    plt.barh(pos, ada_1['importance'][::-1])
    plt.yticks(pos, ada_1['feature'][::-1])
    plt.xlabel('Relative Importance')
    plt.title('Adaboost Feature Importance')
    plt.show()
    '''
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']
    print('Sample 10 Features from Ada Classifier:')
    print(str(features_top_n_ada[:10]))

    # ExtraTrees
    et_est = ExtraTreesClassifier(random_state=0)
    et_param_grid = {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [20]}
    et_grid = model_selection.GridSearchCV(et_est, et_param_grid, n_jobs=25, cv=10,
                                           verbose=1, scoring="recall")
    et_grid.fit(X, Y)
    print('Top N Features Best ET Params:' + str(et_grid.best_params_))
    print('Top N Features Best ET Score:' + str(et_grid.best_score_))
    print('Top N Features ET Train Score:' + str(et_grid.score(X, Y)))
    feature_imp_sorted_et = pd.DataFrame({
        'feature': col,
        'importance': et_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    '''
    et_1 = feature_imp_sorted_et[:10]
    et_2 = 100 * feature_imp_sorted_et[:10]['importance']
    plt.figure(1, figsize=(18, 8))
    pos = np.arange(et_2.shape[0]) + 0.5
    plt.subplot(121)
    plt.barh(pos, et_1['importance'][::-1])
    plt.yticks(pos, et_1['feature'][::-1])
    plt.xlabel('Relative Importance')
    plt.title('ExtraTrees Feature Importance')
    '''
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
    print('Sample 10 Features from ET Classifier:')
    print(str(features_top_n_et[:10]))

    # GradientBoosting
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1], 'max_depth': [20]}
    gb_grid = model_selection.GridSearchCV(gb_est, gb_param_grid, n_jobs=25, cv=10,
                                           verbose=1, scoring="recall")
    gb_grid.fit(X, Y)
    print('Top N Features Best GB Params:' + str(gb_grid.best_params_))
    print('Top N Features Best GB Score:' + str(gb_grid.best_score_))
    print('Top N Features GB Train Score:' + str(gb_grid.score(X, Y)))
    feature_imp_sorted_gb = pd.DataFrame({
        'feature': col,
        'importance': gb_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    '''
    gb_1 = feature_imp_sorted_gb[:10]
    gb_2 = 100 * feature_imp_sorted_gb[:10]['importance']
    #plt.figure(1, figsize=(18, 8))
    plt.subplot(122)
    pos = np.arange(gb_2.shape[0]) + 0.5
    plt.barh(pos, gb_1['importance'][::-1])
    plt.yticks(pos, gb_1['feature'][::-1])
    plt.xlabel('Relative Importance')
    plt.title('GradientBoosting Feature Importance')
    plt.show()
    '''
    features_top_n_gb = feature_imp_sorted_gb.head(top_n_features)['feature']
    print('Sample 10 Features from GB Classifier:')
    print(str(features_top_n_gb[:10]))

    # DecisionTree
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param_grid = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est, dt_param_grid, n_jobs=25, cv=10,
                                           verbose=1, scoring="recall")
    dt_grid.fit(X, Y)
    print('Top N Features Best DT Params:' + str(dt_grid.best_params_))
    print('Top N Features Best DT Score:' + str(dt_grid.best_score_))
    print('Top N Features DT Train Score:' + str(dt_grid.score(X, Y)))
    feature_imp_sorted_dt = pd.DataFrame({
        'feature': col,
        'importance': dt_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    dt_1 = feature_imp_sorted_dt[:10]   # original used feature_imp_sorted_gb here: a copy-paste slip
    dt_2 = 100 * feature_imp_sorted_dt[:10]['importance']
    plt.figure(1, figsize=(18, 8))
    pos = np.arange(dt_2.shape[0]) + 0.5
    #plt.subplot(121)
    plt.barh(pos, dt_1['importance'][::-1])
    plt.yticks(pos, dt_1['feature'][::-1])
    plt.xlabel('Relative Importance')
    plt.title('DecisionTree Feature Importance')
    features_top_n_dt = feature_imp_sorted_dt.head(top_n_features)['feature']
    print('Sample 10 Features from DT Classifier:')
    print(str(features_top_n_dt[:10]))

    # merge the five models
    features_top_n = pd.concat([
        features_top_n_rf, features_top_n_ada, features_top_n_et,
        features_top_n_gb, features_top_n_dt
    ], ignore_index=True).drop_duplicates()
    features_importance = pd.concat([
        feature_imp_sorted_rf, feature_imp_sorted_ada, feature_imp_sorted_et,
        feature_imp_sorted_gb, feature_imp_sorted_dt
    ], ignore_index=True)
    plt.show()
    return features_top_n, features_importance
# svd = TruncatedSVD(n_components=500, random_state=42)
# x_train = svd.fit_transform(x_train)
# x_test = svd.transform(x_test)

## Naive Bayes classifier
clf_NB = MultinomialNB()
pipeline = Pipeline([('tfidf', tfidf_vect), ('nb', clf_NB)])
parameters = {
    'tfidf__ngram_range': ((1, 1), (1, 2)),
    'tfidf__max_df': (0.1, 0.5, 0.6, 1.0),
    'nb__alpha': (0.01, 0.1, 0.6, 1.0),
}
grid = model_selection.GridSearchCV(pipeline, parameters)
clf = grid
clf.fit(x_train, y_train)
y_hat = clf.predict(x_test)

# converted to Python 3 prints (the original used Python 2 print statements)
print("grid search params for %s: " % type(clf_NB).__name__)
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
print("NB accuracy: ", accuracy_score(y_test, y_hat))
print("NB recall: ", recall_score(y_test, y_hat, average='weighted'))
print("NB precision: ", precision_score(y_test, y_hat, average='weighted'))
print("Labels:")
print(labels)
)   # initialize with preprocess variations to test

for key in model_params.keys():
    if model_name in key:
        filtered_params[key] = model_params[key]

cv_splitter = model_selection.KFold(n_splits=folds, random_state=FIX_RAND_STATE)
# MSE is an error measure, so mark the scorer as "lower is better";
# GridSearchCV always maximizes its score
scorer = metrics.make_scorer(metrics.mean_squared_error,
                             greater_is_better=False)  #, average=score_avg
performance_metric_name = scorer.__str__().rstrip(')').split('(')[1]  # extract scorer metric name
gs = model_selection.GridSearchCV(estimator=pipe, param_grid=filtered_params,
                                  cv=cv_splitter, n_jobs=-1, scoring=scorer)
print(filtered_params)
gs.fit(X_train, np.log1p(y_train))
scores[model_name] = {'best_score': gs.best_score_}
print("Best {}: {:.4f} with params: {}: ".format(performance_metric_name,
                                                 gs.best_score_, gs.best_params_))
pipe.steps.pop()   # pop the model in turn from the pipe

# store the best model results to plot, i.e. those corresponding to the best model params
cv_results_df = pd.DataFrame(gs.cv_results_)
titanic_train1.head(6)

# feature engineering
X_train = titanic_train1.drop(['PassengerId', 'Cabin', 'Ticket', 'Name', 'Survived'],
                              axis=1)
y_train = titanic_train['Survived']

# build the decision tree model
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(3, 15)),
    'min_samples_split': [2, 3, 6, 7, 8]
}
grid_tree = model_selection.GridSearchCV(dt, dt_grid, cv=10)   # evaluation of the tree
grid_tree.fit(X_train, y_train)                                # building the tree
print(grid_tree.best_score_)                # best CV score
print(grid_tree.best_params_)
print(grid_tree.score(X_train, y_train))    # train score

# use cross-validation to estimate the performance of the model
#==============================================================================
# cv_scores = model_selection.cross_val_score(dt, X_train, y_train, cv=5, verbose=3)
# cv_scores.mean()
#==============================================================================

# build the final model on the entire train data, which is used for prediction
#dt.fit(X_train, y_train)
# natively deploy the decision tree model (pickle format)
X_test = scaler.transform(X_test)

'''
Modelling.
Train/validation separation
Neural network
Confusion matrix (target is accuracy)
Error/accuracy plots
'''

# Split the train and the validation set for the fitting.
X_train, X_val, y_train, y_val = ms.train_test_split(X_train, y_train, test_size=0.2)

# Tune the model hyperparameters.
param_grid = {"alpha": [0.001, 0.003, 0.01]}
nn_clf = MLPClassifier(hidden_layer_sizes=(200, 200, 200), max_iter=200)
nn_clf = ms.GridSearchCV(estimator=nn_clf, param_grid=param_grid, cv=4, scoring='accuracy')
nn_clf.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))
print('Optimum hyperparameters are,')
print(nn_clf.best_params_)

# Fit the neural net model. Used GridSearchCV to find the optimum alpha.
nn_mod = MLPClassifier(hidden_layer_sizes=(500, 500, 500),
                       alpha=nn_clf.best_params_['alpha'], max_iter=400)
nn_mod.fit(X_train, y_train)
val_scores = nn_mod.predict(X_val)

# Compute the confusion matrix.
conf_mtx = sklm.confusion_matrix(y_val, val_scores)

# Plot the confusion matrix.
plot_confusion_matrix(conf_mtx, classes=nn_mod.classes_)
tmp1 = ohe.transform(titanic_train[categorical_feature]).toarray()
tmp1 = pd.DataFrame(tmp1)
continuous_features = ['Fare', 'Age', 'SibSp', 'Parch']
tmp2 = titanic_train[continuous_features]
tmp = pd.concat([tmp1, tmp2], axis=1)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(tmp)
y_train = titanic_train['Survived']

knn_estimator = neighbors.KNeighborsClassifier()
knn_grid = {'n_neighbors': [5, 7, 8, 10, 20, 25, 30],
            'weights': ['uniform', 'distance']}
knn_grid_estimator = model_selection.GridSearchCV(knn_estimator, knn_grid, cv=10,
                                                  return_train_score=True)   # was the string 'True'
knn_grid_estimator.fit(X_train, y_train)
print(knn_grid_estimator.best_estimator_)
print(knn_grid_estimator.best_score_)
print(knn_grid_estimator.best_params_)
results = knn_grid_estimator.cv_results_
final_estimator = knn_grid_estimator.best_estimator_
print(final_estimator.score(X_train, y_train))

# read test data
titanic_test = pd.read_csv(
    'C:\\Users\\tauseef.ur.rahman\\Desktop\\Python-Docs\\Titanic\\test.csv')
titanic_test[imputable_cont_features] = cont_imputer.transform(
    titanic_test[imputable_cont_features])
titanic_test['Embarked'] = cat_imputer.transform(titanic_test['Embarked'])
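# The KNN model above was trained on standardized inputs, so the test rows must
# go through the same fitted encoder and scaler. A sketch of the test-side
# transform mirroring the training pipeline (my addition; variable names follow
# the script above):
tmp1_test = pd.DataFrame(ohe.transform(titanic_test[categorical_feature]).toarray())
tmp2_test = titanic_test[continuous_features].reset_index(drop=True)  # align indexes for concat
X_test = scaler.transform(pd.concat([tmp1_test, tmp2_test], axis=1))
titanic_test['Survived'] = final_estimator.predict(X_test)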
# scale all the columns with z-scores
mapper = DataFrameMapper([(titanic_all1.columns, preprocessing.StandardScaler())])
scaled_features = mapper.fit_transform(titanic_all1)
type(scaled_features)
titanic_all2 = pd.DataFrame(scaled_features, columns=titanic_all1.columns)

pca = decomposition.PCA(45)
pca.fit(titanic_all2)
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.cumsum())
titanic_all3 = pd.DataFrame(pca.transform(titanic_all2))

# split the titanic data into train and test
X_train = titanic_all3.iloc[0:891]
X_train.shape
X_train.info()
y_train = titanic_all['Survived'].iloc[0:891]

parameter_grid = dict(n_neighbors=[3, 4, 5, 6, 7], weights=['uniform', 'distance'])
knn_estimator = neighbors.KNeighborsClassifier()
knn_grid_estimator = model_selection.GridSearchCV(estimator=knn_estimator,
                                                  param_grid=parameter_grid,
                                                  cv=10, verbose=1, n_jobs=10)
knn_grid_estimator.fit(X_train, y_train)
knn_grid_estimator.cv_results_   # grid_scores_ was removed in scikit-learn 0.20
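# Rather than hard-coding 45 components, PCA also accepts a variance fraction
# and chooses the component count itself. A minimal sketch (my addition), tied
# to the cumulative explained-variance ratio printed above:
pca95 = decomposition.PCA(n_components=0.95)
pca95.fit(titanic_all2)
print(pca95.n_components_)   # number of components covering 95% of the variance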
def get_top_n_features(titanic_train_data_X, titanic_train_data_Y, top_n_features):
    # random forest
    rf_est = RandomForestClassifier(random_state=0)
    rf_param_grid = {'n_estimators': [100], 'min_samples_split': [2, 3], 'max_depth': [20]}
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param_grid, n_jobs=25, cv=10, verbose=1)
    rf_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
    print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
    print('Top N Features RF Train Score:' +
          str(rf_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_rf = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': rf_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature'][:top_n_features]
    print('Sample 10 Features from RF Classifier')
    print(str(features_top_n_rf[:top_n_features]))

    # AdaBoost
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param_grid = {'n_estimators': [100], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param_grid, n_jobs=25, cv=10, verbose=1)
    ada_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best Ada Params:' + str(ada_grid.best_params_))
    print('Top N Features Best Ada Score:' + str(ada_grid.best_score_))
    print('Top N Features Ada Train Score:' +
          str(ada_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_ada = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': ada_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature'][:top_n_features]
    print('Sample 10 Features from Ada Classifier:')
    print(str(features_top_n_ada[:top_n_features]))

    # ExtraTrees
    et_est = ExtraTreesClassifier(random_state=0)
    et_param_grid = {'n_estimators': [100], 'min_samples_split': [3, 4], 'max_depth': [20]}
    et_grid = model_selection.GridSearchCV(et_est, et_param_grid, n_jobs=25, cv=10, verbose=1)
    et_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best ET Params:' + str(et_grid.best_params_))
    print('Top N Features Best ET Score:' + str(et_grid.best_score_))
    print('Top N Features ET Train Score:' +
          str(et_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_et = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': et_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature'][:top_n_features]
    print('Sample 10 Features from ET Classifier:')
    print(str(features_top_n_et[:top_n_features]))

    # GradientBoosting
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param_grid = {'n_estimators': [100], 'learning_rate': [0.01, 0.1], 'max_depth': [20]}
    gb_grid = model_selection.GridSearchCV(gb_est, gb_param_grid, n_jobs=25, cv=10, verbose=1)
    gb_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best GB Params:' + str(gb_grid.best_params_))
    print('Top N Features Best GB Score:' + str(gb_grid.best_score_))
    print('Top N Features GB Train Score:' +
          str(gb_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_gb = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': gb_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_gb = feature_imp_sorted_gb.head(top_n_features)['feature'][:top_n_features]
    print('Sample 10 Features from GB Classifier:')
    print(str(features_top_n_gb[:top_n_features]))

    # DecisionTree
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param_grid = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est, dt_param_grid, n_jobs=25, cv=10, verbose=1)
    dt_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best DT Params:' + str(dt_grid.best_params_))
    print('Top N Features Best DT Score:' + str(dt_grid.best_score_))
    print('Top N Features DT Train Score:' +
          str(dt_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_dt = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': dt_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_dt = feature_imp_sorted_dt.head(top_n_features)['feature'][:top_n_features]
    print('Sample 10 Features from DT Classifier:')
    print(str(features_top_n_dt[:top_n_features]))

    # merge the five models
    features_top_n = pd.concat([
        features_top_n_rf, features_top_n_ada, features_top_n_et,
        features_top_n_gb, features_top_n_dt
    ], ignore_index=True).drop_duplicates()
    features_importance = pd.concat([
        feature_imp_sorted_rf, feature_imp_sorted_ada, feature_imp_sorted_et,
        feature_imp_sorted_gb, feature_imp_sorted_dt
    ], ignore_index=True)

    return features_top_n, features_importance
def run():
    df = buildDf('qqq')
    #df = buildDf('atvi')
    # drop the first column, which is an extra index
    df = df.drop(columns=df.columns[0])
    df['seq'] = df.index

    import seaborn as sns
    fig, ax = plt.subplots()
    sns.regplot(x='seq', y='Close', data=df, lowess=True)
    #print(df.tail(50).to_string())

    df['Volatility'] = (df['Close'] - df['Open']) / df['Volume']
    fig, ax = plt.subplots()
    sns.heatmap(df.corr(), cmap='Blues')

    # Slice off the first 50 rows since they are NaN: the MA50 column has no
    # values until row 50. The max is len(df)-1 because computing UpDown reads
    # the next date, and the final row has no next row, which avoids the error.
    # X holds the features; y is the target UpDown, which tries to predict from
    # close-to-close whether the next day will go up or down.
    X = df[['Open', 'High', 'Low', 'Close', 'Volume', 'MA10', 'MA50',
            'Volatility', 'CrossUpCrossDown']][50:len(df) - 1].values
    y = df[['UpDown']][50:len(df) - 1].values.astype(int).ravel()

    # train_test_split keeps features and targets aligned and well distributed
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.1)
    print("y_test:" + str(collections.Counter(y_test)))
    print("y_train:" + str(collections.Counter(y_train)))   # was printing y_test twice

    ### Feature selection
    print("Feature selection LDA")
    selection = RFECV(LinearDiscriminantAnalysis(), scoring='accuracy')
    selection.fit_transform(X_train, y_train)
    print(selection.support_)

    print("Feature selection CART")
    # the original repeated LinearDiscriminantAnalysis here; a CART selector
    # presumably means DecisionTreeClassifier
    selection = RFECV(DecisionTreeClassifier(), scoring='accuracy')
    selection.fit_transform(X_train, y_train)
    print(selection.support_)

    # A list of models to try; set the seed so runs are comparable
    models = []
    models.append(('LR', LogisticRegression()))
    models.append(('RF', RandomForestClassifier()))
    models.append(('LDA', LinearDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier()))
    # My own classifier is commented out because it takes ~15 minutes to run;
    # feel free to try it. Its random seed is not set, so it will differ
    # slightly from KNN, but with the seed set it would match.
    #models.append(('AdricsKnn', AdricsKNNClassifier()))
    models.append(('CART', DecisionTreeClassifier()))
    models.append(('NB', GaussianNB()))
    #models.append(('SVM', SVC()))

    parameters = {'n_components': [None, 1, 2],
                  'n_neighbors': [6, 7, 8, 9],
                  'K': [6, 7, 8],
                  'C': [.0001, .001, .01, .1, 1, 10, 100],
                  'max_depth': [3, 4, 5, 6, 7, 8],
                  'gamma': [.0001, .001, .01, .1]}
    results = []
    names = []

    # make sure each model only receives parameters it actually accepts
    for name, model in models:
        param_grid = {}
        # take the parameter keys and keep only ones this model supports
        for k in parameters.keys():
            if k in model.get_params().keys():
                param_grid[k] = parameters[k]

        # grid search to find the best parameters from those above
        gs = model_selection.GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
        gs.fit(X_train, y_train)

        # cross-validate the tuned model so the results can be plotted
        cv_results = model_selection.cross_val_score(gs, X_train, y_train, cv=5,
                                                     scoring='accuracy')
        names.append(name)
        msg = "Model:\n%s \n%s: %f (%f)" % (gs.best_estimator_, name,
                                            cv_results.mean(), cv_results.std())
        results.append(cv_results)
        print(msg)

    # show and compare the results on a graph
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()

    # Fit one model and inspect the results so they can be compared; predict on the held-out data
    print("LinearDiscriminantAnalysis")
    model = LinearDiscriminantAnalysis(n_components=None)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    print("PREDICTION LDA IS:::::::::::::::::::::::::\n")
    print(predicted)
    print("LDA actual:::::::::::::::::::")
    print(y_test.reshape((1, len(y_test))))
    print(collections.Counter(y_test))
    error = np.mean(predicted != y_test)
    print("Accuracy: " + str(1 - error))

    print("CART")
    model = DecisionTreeClassifier(max_depth=3)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    print("PREDICTION CART IS:::::::::::::::::::::::::\n")
    print(predicted)
    print("CART actual:::::::::::::::::::")
    print(y_test.reshape((1, len(y_test))))
    print(collections.Counter(y_test))
    error = np.mean(predicted != y_test)
    print("Accuracy: " + str(1 - error))

    # # prints tuples lined up
    # print(list(zip(a[175:190], b[175:190])))

    neuro1(X_train, X_test, y_train, y_test)
                       delimiter=',')
#gram_test = np.loadtxt('D:/Study/Bioinformatics/补实验/QSP/kernels/K_test_' + name + '.csv', delimiter=',')
y_train = np.loadtxt('D:/Study/Bioinformatics/补实验/QSP/features/train_label.csv', delimiter=',')
#y_test = np.loadtxt('D:/Study/Bioinformatics/补实验/QSP/features/test_label.csv', delimiter=',')

cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
parameters = {'C': np.logspace(-15, 10, base=2, num=52)}
grid = model_selection.GridSearchCV(svm.SVC(kernel='precomputed', probability=True),
                                    parameters, n_jobs=-1, cv=cv, verbose=2)
grid.fit(gram_train, y_train)
C = grid.best_params_['C']
print('C =', C)

clf = svm.SVC(C=C, kernel='precomputed', probability=True)
scorerMCC = metrics.make_scorer(metrics.matthews_corrcoef)
scorerSP = metrics.make_scorer(specificity_score)
scorerPR = metrics.make_scorer(metrics.precision_score)
scorerSE = metrics.make_scorer(metrics.recall_score)
scorer = {
    'ACC': 'accuracy',
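# The scorer dict above continues past this excerpt; once complete, it plugs
# straight into cross_validate, which scores every named metric in one CV pass.
# A sketch (my addition), assuming the remaining entries map names to the
# make_scorer objects defined above:
# results = model_selection.cross_validate(clf, gram_train, y_train,
#                                          cv=cv, scoring=scorer)
# print({k: v.mean() for k, v in results.items() if k.startswith('test_')})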
X_train = titanic2[0:titanic_train.shape[0]]
X_train.shape
X_train.info()
y_train = titanic_train['Survived']

dt_estimator = tree.DecisionTreeClassifier()
ada_estimator = ensemble.AdaBoostClassifier(base_estimator=dt_estimator, random_state=2017)
ada_grid = {'n_estimators': [50],
            'learning_rate': [0.01, 0.02, 1.0],
            'base_estimator__max_depth': [3]}
grid_ada_estimator = model_selection.GridSearchCV(ada_estimator, ada_grid, cv=10, n_jobs=1)
grid_ada_estimator.fit(X_train, y_train)
print(grid_ada_estimator.cv_results_)   # grid_scores_ was removed in scikit-learn 0.20
print(grid_ada_estimator.best_score_)
print(grid_ada_estimator.best_params_)
print(grid_ada_estimator.score(X_train, y_train))

# explore the feature importances calculated by the boosted decision trees
features = X_train.columns
importances = grid_ada_estimator.best_estimator_.feature_importances_
fe_df = pd.DataFrame({'feature': features, 'importance': importances})

X_test = titanic2[titanic_train.shape[0]:]
X_test.shape
X_test.info()