def score_model(dataset, test_csv='data/test.csv', output_csv='data/solution.csv'):
    """Fit a tuned random forest on ``dataset``, print its score and write predictions.

    Args:
        dataset: Passed unchanged to ``recover_train_test_target``, which must
            return the ``(train, test, targets)`` triple.
        test_csv: CSV file whose ``PassengerId`` column labels the output rows.
        output_csv: Destination of the ``PassengerId``/``Survived`` table.

    Side effects:
        Calls ``DataExploration.show_variable_relation_with_survival`` with an
        exploratory forest, prints the value returned by ``compute_score`` and
        writes ``output_csv``.
    """
    train, test, targets = recover_train_test_target(dataset)

    # Exploratory forest: only handed to the DataExploration helper.
    randomForestClassifier = RandomForestClassifier(n_estimators=50, max_features='sqrt')
    randomForestClassifier = randomForestClassifier.fit(train, targets)
    DataExploration.show_variable_relation_with_survival(train, randomForestClassifier)

    # The reduced feature matrices that used to be computed here were never
    # consumed (the final model is trained on the full feature set), so the
    # SelectFromModel step has been dropped.
    parameters = {
        'bootstrap': False,
        'min_samples_leaf': 3,
        'n_estimators': 50,
        'min_samples_split': 10,
        'max_features': 'sqrt',
        'max_depth': 6,
    }
    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)

    # Single-argument print() behaves the same on Python 2 and Python 3; the
    # double space reproduces the output of the old ``print 'Score: ', x``.
    print('Score:  {}'.format(compute_score(model, train, targets, scoring='accuracy')))

    output = model.predict(test).astype(int)
    aux = pd.read_csv(test_csv)
    df_output = pd.DataFrame()
    df_output['PassengerId'] = aux['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv(output_csv, index=False)
class randForestClassifier(AbstractModelClass):
    """Random forest wrapped in ``SelectFromModel`` for feature selection.

    ``self.classifier`` is the ``SelectFromModel`` selector; the forest that
    is actually trained is available as ``self.classifier.estimator_`` once
    ``fit`` has been called.
    """

    def __init__(self, n_estimators):
        """Store the number of trees and build the (unfitted) selector."""
        self.n_estimators = n_estimators
        self.create_model()

    def create_model(self):
        """Create the selector around a fresh ``RandomForestClassifier``."""
        self.classifier = SelectFromModel(
            RandomForestClassifier(n_estimators=self.n_estimators))

    def fit(self, data, labels):
        """Fit the selector (and therefore the underlying forest) on ``data``."""
        self.classifier.fit(data, labels)

    def classifier_prediction(self, x_test):
        """Predict labels for ``x_test`` with the fitted forest.

        ``SelectFromModel`` is a transformer and exposes no ``predict``
        method, so the call is delegated to the fitted ``estimator_``.
        That forest was trained on the full feature set, so ``x_test`` must
        carry the same columns that were passed to ``fit``.
        """
        return self.classifier.estimator_.predict(x_test)

    def get_support(self):
        """Return the selector's boolean mask of retained features."""
        return self.classifier.get_support()

    def get_SelectedFeatures(self, feature_data):
        """Return the columns of ``feature_data`` kept by the selector."""
        return feature_data.columns[self.get_support()]

    def get_FeatureImportance(self):
        """Return ``(indices, importances)`` of the fitted forest.

        ``importances`` is the raw ``feature_importances_`` array and
        ``indices`` orders the features from most to least important.
        """
        feature_ranked = self.classifier.estimator_.feature_importances_
        ranked_feature_indices = np.argsort(feature_ranked)[::-1]
        return ranked_feature_indices, feature_ranked
def randomForest(train, test, targets, run_gs=False):
    """Train a random forest and write the Titanic submission file.

    Args:
        train: Training feature matrix.
        test: Feature matrix to predict on.
        targets: Training labels.
        run_gs: When true, pick the hyper-parameters with a grid search and
            predict with the refitted search object; otherwise (default, the
            previous hard-coded behaviour) use the fixed parameter set below.

    Side effects:
        Reads ``../input/test.csv`` for the ``PassengerId`` column and writes
        the predictions to ``../predition.csv``.
    """
    # The forest that used to be fitted here only fed an unused
    # SelectFromModel, so that extra training pass has been removed.
    if run_gs:
        parameter_grid = {
            'max_depth': [4, 6, 8],
            'n_estimators': [50, 10],
            'max_features': ['sqrt', 'auto', 'log2'],
            'min_samples_split': [1, 3, 10],
            'min_samples_leaf': [1, 3, 10],
            'bootstrap': [True, False],
        }
        forest = RandomForestClassifier()
        # NOTE(review): this call signature and some grid values ('auto',
        # min_samples_split=1) match the legacy scikit-learn API; confirm
        # against the installed version before enabling run_gs.
        cross_validation = StratifiedKFold(targets, n_folds=5)
        grid_search = GridSearchCV(forest,
                                   scoring='accuracy',
                                   param_grid=parameter_grid,
                                   cv=cross_validation)
        grid_search.fit(train, targets)
        model = grid_search
        print('Best score: {}'.format(grid_search.best_score_))
        print('Best parameters: {}'.format(grid_search.best_params_))
    else:
        parameters = {
            'bootstrap': False,
            'min_samples_leaf': 3,
            'n_estimators': 50,
            'min_samples_split': 10,
            'max_features': 'sqrt',
            'max_depth': 6,
        }
        model = RandomForestClassifier(**parameters)
        model.fit(train, targets)

    output = model.predict(test).astype(int)
    aux = pd.read_csv('../input/test.csv')
    df_output = pd.DataFrame()
    df_output['PassengerId'] = aux['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../predition.csv', index=False)
y_test_pred = [] y_pred = [] for col in select_models: test_preds = [] preds = [] for model in models[col]: test_preds.append(model.predict(x_test)) preds.append(model.predict(x_pred)) test_pred = np.mean(test_preds, axis=0) pred = np.mean(preds, axis=0) y_test_pred.append(test_pred) y_pred.append(pred) selection_model.fit(select_x_train, y_train[:, i]) y_pred = selection_model.predict(select_x_test) r2 = r2_score(y_test[:, i], y_pred) mae = MAE(y_test[:, i], y_pred) print(selection_model.best_params_) if mae <= best_mae: print("예아~") best_mae = mae best_model = selection_model best_y_pred = selection_model.predict(select_x_pred) best_y_test_pred = y_pred print("Thresh=%.3f, n=%d, MAE: %.5f R2: %.2f%%" % (thresh, select_x_train.shape[1], mae, r2 * 100)) final_y_pred.append(best_y_pred) final_y_test_pred.append(best_y_test_pred) y_test_pred = []
plt.show()

# In[ ]:

# Train an XGBoost classifier and time the fit.
# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed intervals.
start = time.perf_counter()
model = XGBClassifier(booster='gbtree',
                      max_depth=5,
                      eval_metric='auc',
                      learning_rate=0.7,
                      min_child_weight=0.9,
                      verbose_eval=True)
model.fit(DataFrame(X_train, dtype='float'), DataFrame(y_train))
end = time.perf_counter()
print('训练模型的时间为' + str(end - start))

# Predict on the held-out set and report accuracy. The measured interval
# covers prediction plus the metric computation and printing below.
start = time.perf_counter()
y_pred = model.predict(DataFrame(X_test, dtype='float'))
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print(classification_report(y_test, predictions))
end = time.perf_counter()
print('预测163个结果,所需时间为' + str(end - start))

# In[ ]:

print(model)

# In[ ]:

xgb.to_graphviz(model, num_trees=10)
class Predictor:
    """Random-forest regressor for the points difference of a match.

    Construction loads the data, splits it, fits an exploratory forest and
    uses ``SelectFromModel`` to reduce the training features. ``simplify``
    must be called before ``predict``: until then ``self._model`` is the
    feature selector, not a regressor.
    """

    # Columns whose missing values are replaced by 0 instead of dropping the row.
    _FREE_THROW_COLUMNS = ['home_free_throw_percentage',
                           'away_free_throw_percentage']

    def __init__(self):
        self._regressor = None
        self._model = None
        self._X_train = None
        self._X_test = None
        self._y_train = None
        self._y_test = None
        self._filtered_features = None
        data = self._read_data()
        self._create_features(data)
        self._create_regressor()
        self._train_model()

    def simplify(self, test_data):
        """Align ``test_data`` with the selected features and fit the final model.

        Returns ``test_data`` restricted to, and ordered like, the selected
        feature columns. As a side effect the held-out split is re-indexed the
        same way and ``self._model`` becomes a freshly fitted
        ``RandomForestRegressor``.
        """
        selected = self._filtered_features
        test_data = test_data.loc[:, test_data.columns.isin(selected)]
        test_data = test_data.reindex(selected, axis=1)
        self._X_test = self._X_test.reindex(selected, axis=1)
        parameters = {
            'bootstrap': False,
            'min_samples_leaf': 3,
            'n_estimators': 50,
            'min_samples_split': 10,
            'max_features': 'sqrt',
            'max_depth': 6,
        }
        self._model = RandomForestRegressor(**parameters)
        self._model.fit(self._X_train, self._y_train)
        return test_data

    def predict(self, test_data, output_datatype):
        """Return the model's predictions cast to ``output_datatype``."""
        return self._model.predict(test_data).astype(output_datatype)

    def _read_data(self):
        """Load the match data and return its differential vector.

        Uses the cached ``DATASET_NAME`` pickle when it exists; otherwise
        concatenates every pickle matching ``matches/*/*`` and cleans it.
        """
        # NOTE: pickles execute arbitrary code on load; only read trusted files.
        if path.exists(DATASET_NAME):
            data = pd.read_pickle(DATASET_NAME)
            return differential_vector(data)
        frames = [pd.read_pickle(match) for match in glob('matches/*/*')]
        data = pd.concat(frames)
        data.drop_duplicates(inplace=True)
        data = filter_stats(data)
        data = self._fill_and_drop_missing(data)
        data['points_difference'] = data['home_points'] - data['away_points']
        return differential_vector(data)

    @staticmethod
    def _fill_and_drop_missing(data):
        """Zero-fill missing free-throw percentages, then drop incomplete rows.

        The fill has to happen before ``dropna``: the previous order dropped
        every row with a missing free-throw percentage first, which made the
        subsequent ``fillna(0)`` a no-op. The input frame is not modified.
        """
        columns = Predictor._FREE_THROW_COLUMNS
        data = data.copy()
        data[columns] = data[columns].fillna(0)
        return data.dropna()

    def _create_features(self, data):
        """Split ``data`` into train/test features and the points-difference target."""
        # ``axis`` must be passed by keyword on current pandas releases.
        X = data.drop('points_difference', axis=1)
        y = data['points_difference']
        split_data = train_test_split(X, y)
        self._X_train, self._X_test, self._y_train, self._y_test = split_data

    def _create_regressor(self):
        """Fit the exploratory forest used for feature selection."""
        reg = RandomForestRegressor(n_estimators=50, max_features='sqrt')
        self._regressor = reg.fit(self._X_train, self._y_train)

    def _train_model(self):
        """Reduce the training features to those with importance >= 0.01."""
        train = self._X_train
        self._model = SelectFromModel(self._regressor, prefit=True,
                                      threshold=0.01)
        self._X_train = self._model.transform(self._X_train)
        new_columns = train.columns[self._model.get_support()]
        self._filtered_features = [str(col) for col in new_columns]
# Tree-based feature selection
# ---- C5: random forest ----
from sklearn.feature_selection import SelectFromModel

# Train a 20-tree forest on the pre-selected feature columns.
rf_sel = RandomForestClassifier(n_estimators=20)
rf_sel = rf_sel.fit(x_train[selected_feat], y_train)

# Persist the fitted model, then read it back from disk.
joblib.dump(rf_sel, 'q2c_rf_fea_sel.pkl')
rf_sel = joblib.load('q2c_rf_fea_sel.pkl')

# Step 4: predict on the test features and wrap the result in a one-column frame.
df_pred = pd.DataFrame({'prediction': rf_sel.predict(x_test[selected_feat])})

# Step 5: attach the predictions to the test dataset by index (left join).
final_df = adult_test.merge(df_pred,
                            how='left',
                            left_index=True,
                            right_index=True)

# Export the merged table as CSV.
final_df.to_csv('q2c_rf_fea_sel.csv', index=False)

# ---- Q2d: ensemble method (voting) ----
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# Fit every algorithm and collect its positive-class probabilities on the
# test set, then blend them and cross-validate three candidate models.
for alg in algorithms:
    # Fit the algorithm using the full training data.
    alg.fit(train, targets)
    # Predict using the test dataset. We have to convert all the columns to floats to avoid an error.
    predictions = alg.predict_proba(test.astype(float))[:,1]
    full_predictions.append(predictions)
# Weighted average of the first three prediction vectors; the second one
# counts double (weights 1, 2, 1 over a total of 4).
predictions = (full_predictions[0] + full_predictions[1]*2 + full_predictions[2]) / 4
models = [logreg_cv, rf, gboost]
for model in models:
    print('Cross-validation of : {0}'.format(model.__class__))
    # Scored on the reduced training features (train_reduced), not on train.
    score = compute_score(clf=model, X=train_reduced, y=targets, scoring='accuracy')
    print ('CV score = {0}'.format(score))
    print('****')
# Refit the random forest on the full (non-reduced) training features.
rf.fit(train, targets)
# The string literal below is disabled code kept as a bare expression; it has
# no runtime effect.
""" predictions = model.predict(test[predictors]) predictions[predictions > 0.5] = 1 predictions[predictions <= 0.5] = 0 predictions = predictions.astype(int) submission = pd.DataFrame({ "PassengerId": titanic_test["PassengerId"], "Survived": predictions }) print(submission) submission.to_csv('/Users/martin_yan/Desktop/submission3.csv', index=False) """
# Features used
# 1. Linear regression
# alg = LinearRegression()
# 2. Logistic regression
# alg= LogisticRegression(random_state=1)
def predictWithFeatureSelectionNNConst(X, y, topN, size, learning_rate, n_iter):
    """Select the ``topN`` features with extra-trees, then train and evaluate an MLP.

    Args:
        X: Feature table; ``X.columns`` is used to name the selected features.
        y: Target labels.
        topN: Number of features kept by ``SelectFromModel``.
        size: ``test_size`` passed to ``train_test_split``.
        learning_rate: Initial learning rate of the MLP.
        n_iter: ``max_iter`` of the MLP.

    Returns:
        ``[train_accuracy_pct, test_accuracy_pct, learning_rate]`` with both
        accuracies rounded to two decimals.

    Side effects:
        Prints diagnostics, dumps the fitted MLP to ``dataset/mlp_class.jbl``
        and shows the loss curve.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=size, random_state=0, shuffle=True)

    clf = ExtraTreesClassifier(n_estimators=100)
    clf = clf.fit(X_train, y_train)
    print('Feature Importances')
    print(clf.feature_importances_)
    for feature in zip(X.columns, clf.feature_importances_):
        print(feature)

    # threshold=-inf disables the importance cut-off so exactly the topN
    # highest-ranked features are kept.
    selector = SelectFromModel(clf, prefit=True, threshold=-np.inf,
                               max_features=topN)
    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)
    selected_indices = selector.get_support(indices=True)
    print(selected_indices)
    XFeatures = [X.columns[index]
                 for index in selector.get_support(indices=True)]
    print('Selected Features')
    print(XFeatures)

    # Learn the scaling on the training split only and re-use it for the test
    # split; re-fitting on the test data would scale both sets differently.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    model = MLPClassifier(hidden_layer_sizes=(100, ),
                          activation='relu',
                          solver='adam',
                          alpha=0.0001,
                          batch_size='auto',
                          learning_rate='constant',
                          learning_rate_init=learning_rate,
                          max_iter=n_iter,
                          shuffle=True,
                          random_state=None,
                          verbose=True,
                          warm_start=False,
                          momentum=0.9,
                          nesterovs_momentum=True,
                          early_stopping=False,
                          validation_fraction=0.1,
                          beta_1=0.9,
                          beta_2=0.999,
                          epsilon=1e-08,
                          n_iter_no_change=10)
    model.fit(X_train, y_train)
    joblib.dump(model, 'dataset/mlp_class.jbl')

    def _report(header, inputs, expected):
        # Print accuracy, classification report and confusion matrix for one
        # split and return the accuracy as a rounded percentage.
        print(header)
        predictions = model.predict(inputs)
        accuracy = metrics.accuracy_score(expected, predictions)
        print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
        print('Classification Report:')
        print(metrics.classification_report(expected, predictions))
        print('Confusion Matrix:')
        print(metrics.confusion_matrix(expected, predictions))
        return round(accuracy * 100.0, 2)

    stat = list()
    train_accuracy = _report('\n-- Training data --', X_train, y_train)
    print('')
    stat.append(train_accuracy)
    stat.append(_report('\n---- Test data ----', X_test, y_test))
    stat.append(learning_rate)

    plt.plot(model.loss_curve_)
    plt.xlabel('Epoch')
    plt.ylabel('Value')
    plt.title('Model loss for relu activation and solver adam')
    plt.show()
    return stat
# First-level models whose out-of-fold probabilities feed the stacker.
modeler = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier()
]
S_X_train, S_X_test = stacking(modeler,
                               X_train,
                               y_train,
                               X_test,
                               regression=False,
                               metric=metrics.log_loss,
                               needs_proba=True,
                               stratified=True,
                               shuffle=True,
                               random_state=42,
                               verbose=2)

# %%
# Second-level model. An L1 penalty needs a solver that supports it; naming
# 'liblinear' explicitly keeps this working where the default solver rejects
# penalty='l1'.
model = LogisticRegression(penalty='l1', C=1, solver='liblinear',
                           random_state=42)
model = model.fit(S_X_train, y_train)
y_pred = pd.Series(model.predict(S_X_test))
y_pred_proba = model.predict_proba(S_X_test)[:, 1]
# The first figure is accuracy_score, so it is labelled as accuracy (it used
# to be printed as "R Square:"). Predictions are computed once and reused.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("kappa:", metrics.cohen_kappa_score(y_test, y_pred))
# %%
if type_col[c]==1: print "Catagorical var %s selected \n "%ki p+=1 break c+=1 print "attributes cat--",p print list_already_taken X_train, X_test, y_train, y_test = train_test_split(X_new_df, Y, test_size=0.3, random_state=0) print X_train.shape print X_test.shape print y_train.shape print y_test.shape regr = linear_model.LinearRegression() model = regr.fit(X_train, y_train) Y_res = model.predict(X_test) print("Mean squared error: %.2f"% mean_squared_error(y_test, Y_res)) print('Variance score: %.2f' % r2_score(y_test, Y_res)) print "END"
x_train, x_test = x[train_index], x[test_index] y_train, y_test = y_b[train_index], y_b[test_index] start = time.time() lasso = Lasso(alpha=alpha[idx]).fit(x_train, y_train) model = SelectFromModel(lasso, prefit=True) x_train = model.transform(x_train) x_test = model.transform(x_test) #create NN model model = build_model() model.fit(x_train, y_train, epochs=10, batch_size=100, verbose=1) prediction = model.predict(x_test) end = time.time() save_data[f"{classifier_condition}_fold_{z+1}_n={alpha[idx]}"] = (model_evaluation(f"{classifier_condition}, {alpha[idx]}", f"fold_{z+1}", x_test, y_test, prediction, model, end-start, n_classes)) z+=1 range = ((idx*10) + (idx + 1)) save_data[f"Average {classifier_condition}, n = {alpha[idx]}"] = save_data.iloc[:,range:].mean(axis=1) save_data.to_csv(f"{classifier_condition}_new.csv") #------------------------------------------- # RUN CLASSIFIER WITH ISOMAP IMPLEMENTATION #------------------------------------------- '''ISOMAP is so slow that the value of n_components is manually adjusted;
# Candidate classifiers to compare when looking for the best model.
logreg = LogisticRegressionCV()
svml = SVC()
rf = RandomForestClassifier(n_estimators=100)
gboost = GradientBoostingClassifier()
knear = KNeighborsClassifier()
gaus = GaussianNB()
models = [logreg, svml, rf, gboost, knear, gaus]
#for model in models:
#    print("Cross-validating: {0}".format(model.__class__))
#    score = compute_score(clf=model,x=train_x_reduced, y=train_y)
#    print("Accuracy of model: {0}".format(score))
#    print("*************")

# Final model: gradient boosting trained on the full training set.
model = GradientBoostingClassifier().fit(train_x, train_y)
# Cast to int so the submission holds integer labels rather than floats.
output = model.predict(test_x).astype(int)

# Build the submission table: passenger ids next to the predicted labels.
passIDs = pd.read_csv("test_titanic.csv")
results = pd.DataFrame({
    "PassengerId": passIDs["PassengerId"],
    "Survived": output,
})
print(results.shape)
results.to_csv("titanicsubmission.csv", index=False)
X_train,X_test,y_train,y_test=train_test_split(train_reduced,targets,test_size=0.2,random_state=0) parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6} model = RandomForestClassifier(**parameters) model.fit(X_train, y_train) model2=GradientBoostingClassifier() model2.fit(X_train,y_train) model3=SVC() model3.fit(X_train, y_train) compute_score(model, X_test, y_test, scoring='accuracy') y=model.predict(test_reduced) testset=pd.read_csv('test.csv') my_submission = pd.DataFrame({'PassengerId': testset.PassengerId, 'Survived': y}) my_submission.to_csv('pred2.csv', index=False) #To test different hyperparameter combinations of RandomForestClassifier '''run_gs = True if run_gs: parameter_grid = { 'max_depth' : [4, 6, 8], 'n_estimators': [50, 10], 'max_features': ['sqrt', 'auto', 'log2'], 'min_samples_split': [1.0, 3, 10], 'min_samples_leaf': [1, 3, 10], 'bootstrap': [True, False],
#ans = bdt_discrete.predict(final_model_test_data); ans = clf_2.predict(final_model_test_data) #ans = clf_3.predict(final_model_test_data); #ans = clf_4.predict(final_model_test_data); for mn in range(len(ans)): if (ans[mn] == 0): classify = 0 else: classify = 1 ##############################1ST###################################################### if (classify == 0): q = np.matrix([final_model_test_data[mn]]) predictions = model.predict(q) r.append(predictions) ####################################################################################### elif classify == 1: q = np.matrix([final_model_test_data[mn]]) predictions = model_2.predict(q) r.append(predictions) ####################################################################################### predicted_results = r for t in range(len(r)): if (r[t] - int(r[t])) > 0.5: predicted_results[t] = math.ceil(r[t] * 10 / 10) else: predicted_results[t] = int(r[t])
min_child_samples=6, min_child_weight=0, subsample=0.8, colsample_bytree=0.7, reg_alpha=0, importance_type="split") for train_ix, test_ix in cv.split(X_train): X_cvtrain, X_cvtest = X_train.iloc[train_ix, :], X_train.iloc[test_ix, :] y_cvtrain, y_cvtest = y_train["y"].iloc[train_ix], y_train["y"].iloc[ test_ix] model.fit(X_cvtrain, y_cvtrain) predtrain = model.predict(X_cvtrain) pred = model.predict(X_cvtest) print("\nTrain R2:") print(np.round(r2_score(y_cvtrain, predtrain), 2)) print("\nTest R2:") print(np.round(r2_score(y_cvtest, pred), 2)) print("\n________________________") R2.append(np.round(r2_score(y_cvtest, pred), 4)) print("\nAverage R2:", round(np.sum(R2) / 5, 2)) print("Std:", round(np.std(R2), 4)) # Predict Test Data
imp_feat.nlargest(20).plot(kind='barh') ind = np.argsort(imp) rf.feature_importances_ # Stacking model xcl_train, xcl_test, ycl_train, ycl_test = train_test_split(x_cl, y_cl, test_size=0.3) #Support vector classifier #svm=SVC(C=5, probability=True,gamma='auto') #svm.fit(xcl_train,ycl_train) # # lr = LogisticRegressionCV(cv=10) lr.fit(x_cl, y_cl) metrics.accuracy_score(y_cl, lr.predict(x_cl)) metrics.roc_auc_score(y_cl, lr.predict(x_cl)) #nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED) gb = GradientBoostingClassifier(n_estimators=100) gb.fit(xcl_train, ycl_train) metrics.accuracy_score(ycl_test, gb.predict(xcl_test)) #rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED) #Predicting Monthly Revenue reg_df = temp cl_df['Churn'] reg_df['Revenue_loss'] = cl_df.Churn * reg_df['MonthlyRevenue'] reg_df['Revenue_loss'] reg_df = reg_df.drop(['MonthlyRevenue'], axis=1) #Casting categorical types for n, v in reg_df.iteritems():
# Grid-search a class-weighted random forest (scored by ROC AUC) on the
# selected training features; refit=True leaves the best model fitted.
random_forest = GridSearchCV(RandomForestClassifier(class_weight="balanced", random_state=123), rf_param_grid, cv=kfold, n_jobs=-1, refit=True, scoring="roc_auc")
random_forest.fit(X_train_rf_selected, y_train)
print( f'Best score: {random_forest.best_score_} with param: {random_forest.best_params_}' )
# Restrict the test set to the selected features.
# NOTE(review): the column order here follows X_test; presumably it matches
# X_train_rf_selected - confirm.
X_test_rf_selected = X_test[X_test.columns.intersection(rf_selected_features)]
y_rf_predictions = random_forest.predict(X_test_rf_selected)
# Confusion matrix rendered as an annotated heat map.
conf_matrix = metrics.confusion_matrix(y_test, y_rf_predictions)
sns.heatmap(pd.DataFrame(conf_matrix), annot=True, fmt='g', cmap='coolwarm_r')
plt.title('Random Forests')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#plt.savefig('RF_CM.png', quality=95)
plt.show()
print(f"Accuracy: {metrics.accuracy_score(y_test, y_rf_predictions)}")
print(classification_report(y_test, y_rf_predictions))
# Second column of predict_proba, used as the score for the ROC curve and AUC.
y_pred_prob_rf = random_forest.predict_proba(X_test_rf_selected)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_prob_rf)
auc = metrics.roc_auc_score(y_test, y_pred_prob_rf)
def main(is_trained=False, retrain=True, min_importance=0.04, n_estimators=200):
    """Train or load a random-forest regressor for selfie ratings and report metrics.

    Args:
        is_trained: When false (default) a new forest is trained on all
            features and saved to ``regressor.pkl``. When true the saved
            forest is loaded instead.
        retrain: Only used when ``is_trained`` is true. If set, the loaded
            forest drives a ``SelectFromModel`` feature selection and a new
            forest is trained on the reduced features; otherwise the loaded
            forest is used as is.
        min_importance: Importance threshold of the feature selection.
        n_estimators: Number of trees of any newly trained forest.

    Side effects:
        Reads ``selfie_dataset.txt``, may read or write ``regressor.pkl`` and
        prints shapes, the model, error metrics and ranked feature
        importances.
    """
    column_names = [
        "Nome", "Rate", "partial_faces", "is_female", "baby", "child",
        "teenager", "youth", "middle_age", "senior", "white", "black",
        "asian", "oval_face", "round_face", "heart_face", "smiling",
        "mouth_open", "frowning", "wearing_glasses", "wearing_sunglasses",
        "wearing_lipstick", "2tongue_out0", "duck_face", "black_hair",
        "blond_hair", "brown_hair", "red_hair", "curly_hair",
        "straight_hair", "braid_hair", "showing_cellphone", "using_earphone",
        "using_mirror", "wearing_hat", "braces", "harsh_lighting",
        "dim_lighting",
    ]
    data = pd.read_csv('selfie_dataset.txt', sep=" ", header=None,
                       names=column_names)
    labels = np.array(data['Rate'])
    features = data.drop(["Rate", "Nome"], axis=1)
    feature_list = list(features.columns)
    features = np.array(features)

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.1, random_state=0)
    sc = StandardScaler()
    train_features = sc.fit_transform(train_features)
    test_features = sc.transform(test_features)
    print('The shape of our train_features is:', train_features.shape)
    print('The shape of our test_features is:', test_features.shape)

    # NOTE: regressor.pkl is a pickle; only load files produced by this script.
    if is_trained and retrain:
        crf = joblib.load("regressor.pkl")
        selector = SelectFromModel(crf, threshold=min_importance)
        selector.fit(train_features, train_labels)
        train_features = selector.transform(train_features)
        test_features = selector.transform(test_features)
        # Keep the names aligned with the reduced matrix; otherwise the
        # importances printed below would be paired with the wrong features.
        feature_list = [name for name, kept
                        in zip(feature_list, selector.get_support()) if kept]
        print('The shape of our important_train_features is:', train_features.shape)
        print('The shape of our important_test_features is:', test_features.shape)
        rf = RandomForestRegressor(n_estimators=n_estimators, random_state=1)
        rf.fit(train_features, train_labels)
    elif is_trained:
        rf = joblib.load("regressor.pkl")
    else:
        rf = RandomForestRegressor(n_estimators=n_estimators, oob_score=True,
                                   random_state=2)
        rf.fit(train_features, train_labels)
        joblib.dump(rf, 'regressor.pkl')

    print(rf)
    print("\n\n")
    predictions = rf.predict(test_features)

    print('Mean Absolute Error:', mean_absolute_error(test_labels, predictions))
    # Mean absolute percentage error.
    # NOTE(review): this divides by the labels, so a zero rating in the test
    # split yields inf/nan here - confirm ratings are strictly positive.
    mape = np.mean(np.abs((test_labels - predictions) / test_labels)) * 100
    accuracy = 100 - mape
    print('Accuracy:', round(accuracy, 2), '%')
    print('Variance Score: ', explained_variance_score(test_labels, predictions))
    print("\n\n")

    print("Importances: ")
    importances = list(rf.feature_importances_)
    feature_importances = sorted(
        ((feature, round(importance, 4))
         for feature, importance in zip(feature_list, importances)),
        key=lambda pair: pair[1],
        reverse=True)
    for pair in feature_importances:
        print('{} : {}'.format(*pair))
    print()
#print(test_reduced.shape)
# Fixed hyper-parameters for the final random forest.
parameters = dict([
    ('bootstrap', False),
    ('min_samples_leaf', 3),
    ('n_estimators', 50),
    ('min_samples_split', 10),
    ('max_features', 'sqrt'),
    ('max_depth', 6),
])
model = RandomForestClassifier(**parameters)
model.fit(training_df_processed, survived_column)
# Report the accuracy score computed by compute_score on the training data.
print(
    compute_score(model,
                  training_df_processed,
                  survived_column,
                  scoring='accuracy'))

# Predict on the processed test set and write the two-column submission file.
output = model.predict(test_df_processed).astype(int)
aux = pandas.read_csv('D:/ML work/Titanic Data/test.csv')
df_output = pandas.DataFrame({
    'PassengerId': aux['PassengerId'],
    'Survived': output,
})
df_output.to_csv('D:/ML work/Titanic Data/output.csv',
                 columns=['PassengerId', 'Survived'],
                 index=False)
# Fixed hyper-parameters for the random forest trained on the reduced features.
parameters = { 'bootstrap': False, 'min_samples_leaf': 4, 'n_estimators': 50, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 5 }
model = RandomForestClassifier(**parameters)
model.fit(X_train_reduced, Y_train)

# In[133]:

output = model.predict(test1_reduced).astype(int)
# Accuracy on the training data itself, as a percentage rounded to two decimals.
model1 = round(model.score(X_train_reduced, Y_train) * 100, 2)
model1

# #### Applying the Random Forest Classifier. The parameters can be tuned (hyperparameter tuning) to increase the score. I achieved .803 with less feature engineering; however, as I increased the number of dummies for age, it came down to 78.9.

# In[131]:

# Same prediction as in the cell above, recomputed before building the submission.
output = model.predict(test1_reduced).astype(int)
submission = pd.DataFrame({ "PassengerId": test["PassengerId"], "Survived": output })
submission.to_csv("titanic51_submission.csv", index=False)

# I'll update this as I improve. Your guidance is appreciated. Also, thanks a lot for all the tutorials from which I learned a lot.
## Evaluation (No tuning) ------------> (1) prediction = rf_classifier.predict(X_test) print('Confusion Matrix\n', confusion_matrix(y_test, prediction)) print('Accuracy Score: ', accuracy_score(y_test, prediction)) print('Classification Report:\n', classification_report(y_test, prediction)) ### Manual Hyperparameter Tuning model = RandomForestClassifier(n_estimators=300, criterion='entropy', max_features='sqrt', min_samples_leaf=10, random_state=100).fit(X_train, y_train) ## Evaluation (Manual tuning) ------------> (2) predictions = model.predict(X_test) print(confusion_matrix(y_test, predictions)) print(accuracy_score(y_test, predictions)) print(classification_report(y_test, predictions)) ### Randomized Search Cv n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)] max_features = ['auto', 'sqrt', 'log2'] max_depth = [int(x) for x in np.linspace(10, 1000, 10)] min_samples_split = [2, 5, 10, 14] min_samples_leaf = [1, 2, 4, 6, 8] random_grid = { 'n_estimators': n_estimators, 'max_features': max_features, 'max_depth': max_depth, 'min_samples_split': min_samples_split,
else: parameters = { 'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6 } model = RandomForestClassifier(**parameters) model.fit(train, targets) # In[138]: Y_pred = model.predict(test).astype(int) # ## Model, predict and solve # In[35]: X_train = train_df.drop("Survived", axis=1) Y_train = train_df["Survived"] X_test = test_df.drop("PassengerId", axis=1).copy() X_train.shape, Y_train.shape, X_test.shape # In[36]: # Logistic Regression logreg = LogisticRegression()
verbose=1) grid_search = grid_search.fit(train_reduced, targets) params = grid_search.best_params_ print('Best score: {}'.format(grid_search.best_score_)) print('Best parameters: {}'.format(grid_search.best_params_)) # ###### Now we use the result of Best params as hyperparameters to train our final machine learning "model" # In[ ]: model = RandomForestClassifier(**params) model.fit(train_reduced, targets) print(compute_score(clf=model, X=train_reduced, y=targets, scoring='accuracy')) # ### So we obtain a 82+% classification accuracy with our machine learning model. We will submit our prediction to check how good we did in our test data set. We will use the predictions made from Random Forest y_pred. # In[ ]: y_pred = model.predict(test_reduced).astype(int) # In[ ]: test_data = pd.read_csv('../input/test.csv') submission = pd.DataFrame({ "PassengerId": test_data["PassengerId"], "Survived": y_pred }) submission.to_csv('titanic.csv', index=False)
coef0=0.0, decision_function_shape=None, degree=3, gamma=1e-05, kernel='rbf', max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False) #max iter -> eğitebildiğin kadar eğit dedik -1 ile. model.fit(X_train, y_train) print "Eğitim bitti" '''eğitim uzun sürüyor cache size vb parametreler sürede etkili''' ''' onceden yarattıgın pickle dosyasını cagırmak için; from sklearn.externals import joblib model = joblib.load('yeni_model.pkl') model.predict(X_test[150,:]) ''' from sklearn.externals import joblib joblib.dump( best_model, 'malware_model.pkl' ) #pickle a cevirme yerden tasarruf ve tekrar yuklendiginde kolaylik import pickle
name, classes=sorted(list(set(ground_truth))), normalize=True, title='Normalized confusion matrix') # In[7]: #----- Feature importance ranking -----# for name, model in [('RandomForest_recursive', RandomForestClassifier())]: print('Performing recursive feature elimination : ', name) selector = RFECV(model, step=20, cv=10) selector = selector.fit(data_train, ground_truth) y_pred = selector.predict(data_train) cm = confusion_matrix(ground_truth, y_pred, labels=sorted(list(set(ground_truth)))) #----- plot, print and save train results -----# plot_recall(name, cm, train_directory) plot_confusion_matrix(cm, train_directory, name, classes=sorted(list(set(ground_truth))), normalize=True, title='Normalized confusion matrix') #----- save the model to disk -----# filename = os.path.join(train_directory, name) + '_model.sav'
grid_search.fit(train, targets) model = grid_search parameters = grid_search.best_params_ print('Best score: {}'.format(grid_search.best_score_)) print('Best parameters: {}'.format(grid_search.best_params_)) else: parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6} model = RandomForestClassifier(**parameters) model.fit(train, targets) output = model.predict(test).astype(int) df_output = pd.DataFrame() aux = pd.read_csv('test.csv') df_output['PassengerId'] = aux['PassengerId'] df_output['Survived'] = output df_output[['PassengerId', 'Survived']].to_csv('gridsearch_rf.csv', index=False) trained_models = [] for model in models: model.fit(train, targets) trained_models.append(model) predictions = [] for model in trained_models: predictions.append(model.predict_proba(test)[:, 1])
class Predictor:
    """Random-forest regressor for the home and away points of a match.

    Construction loads the data, splits it, fits an exploratory forest and
    uses ``SelectFromModel`` to reduce the training features. ``simplify``
    must be called before ``predict``, ``accuracy`` or ``print_tree``: until
    then ``self._model`` is the feature selector, not a regressor.
    """

    # Columns whose missing values are replaced by 0 instead of dropping the row.
    _FREE_THROW_COLUMNS = ['home_free_throw_percentage',
                           'away_free_throw_percentage']

    def __init__(self, data_directory='matches'):
        self._regressor = None
        self._model = None
        self._X_train = None
        self._X_test = None
        self._y_train = None
        self._y_test = None
        self._filtered_features = None
        data = self._read_data(data_directory)
        self._create_features(data)
        self._create_regressor()
        self._train_model()

    @property
    def accuracy(self):
        """Print the accuracy (in percent) of integer-cast predictions on the test split."""
        predicted = self.predict(self._X_test, int)
        accuracy = round(accuracy_score(self._y_test, predicted) * 100.0, 2)
        # Single-argument print() works on both Python 2 and Python 3.
        print('Accuracy: %s%%' % accuracy)

    def print_tree(self):
        """Export every tree of the fitted forest to ``tree_<n>.dot`` (1-based)."""
        for i, tree_in_forest in enumerate(self._model.estimators_, start=1):
            tree.export_graphviz(
                tree_in_forest,
                out_file='tree_%s.dot' % str(i),
                feature_names=self._filtered_features,
                class_names=['away_win', 'home_win'],
                filled=True,
                rounded=True,
                special_characters=True)

    def simplify(self, test_data):
        """Align ``test_data`` with the selected features and fit the final model.

        Returns ``test_data`` restricted to, and ordered like, the selected
        feature columns. As a side effect the held-out split is re-indexed the
        same way and ``self._model`` becomes a freshly fitted
        ``RandomForestRegressor``.
        """
        selected = self._filtered_features
        test_data = test_data.loc[:, test_data.columns.isin(selected)]
        test_data = test_data.reindex(selected, axis=1)
        self._X_test = self._X_test.reindex(selected, axis=1)
        parameters = {
            'bootstrap': False,
            'min_samples_leaf': 3,
            'n_estimators': 50,
            'min_samples_split': 10,
            'max_features': 'sqrt',
            'max_depth': 6,
        }
        self._model = RandomForestRegressor(**parameters)
        self._model.fit(self._X_train, self._y_train)
        return test_data

    def predict(self, test_data, output_datatype):
        """Return the model's predictions cast to ``output_datatype``."""
        return self._model.predict(test_data).astype(output_datatype)

    def _read_data(self, data_directory):
        """Load every pickle under ``<data_directory>/*/*`` and return its differential vector."""
        # NOTE: pickles execute arbitrary code on load; only read trusted files.
        frames = [pd.read_pickle(match)
                  for match in glob('%s/*/*' % data_directory)]
        data = pd.concat(frames)
        data.drop_duplicates(inplace=True)
        data = filter_stats(data)
        data = self._fill_and_drop_missing(data)
        data['points_difference'] = data['home_points'] - data['away_points']
        return differential_vector(data)

    @staticmethod
    def _fill_and_drop_missing(data):
        """Zero-fill missing free-throw percentages, then drop incomplete rows.

        The fill has to happen before ``dropna``: the previous order dropped
        every row with a missing free-throw percentage first, which made the
        subsequent ``fillna(0)`` a no-op. The input frame is not modified.
        """
        columns = Predictor._FREE_THROW_COLUMNS
        data = data.copy()
        data[columns] = data[columns].fillna(0)
        return data.dropna()

    def _create_features(self, data):
        """Split ``data`` into train/test features and the two-column points target."""
        # ``axis`` must be passed by keyword on current pandas releases.
        # NOTE(review): 'points_difference' stays among the features while the
        # target is (home_points, away_points) - confirm this is intended.
        X = data.drop(['away_points', 'home_points'], axis=1)
        y = data[['home_points', 'away_points']].values
        split_data = train_test_split(X, y)
        self._X_train, self._X_test, self._y_train, self._y_test = split_data

    def _create_regressor(self):
        """Fit the exploratory forest used for feature selection."""
        reg = RandomForestRegressor(n_estimators=50, max_features='sqrt')
        self._regressor = reg.fit(self._X_train, self._y_train)

    def _train_model(self):
        """Reduce the training features to those with importance >= 0.01."""
        train = self._X_train
        self._model = SelectFromModel(self._regressor, prefit=True,
                                      threshold=0.01)
        self._X_train = self._model.transform(self._X_train)
        new_columns = train.columns[self._model.get_support()]
        self._filtered_features = [str(col) for col in new_columns]
#计算变量间的相关系数# for i in range(len(feature)): selection.append([ feature[i], corr.loc[:, [feature[i]]][(np.abs(corr.loc[:, [feature[i]]].values) > 0.8) & (corr.loc[:, [feature[i]]].values != 1)].index ]) ##调参用sklearn的api## ###预测 #### test = pd.read_csv("./data/crawler_test.txt", sep='\t') test.head() dtest = xgb.DMatrix(data=test.loc[:, feature].astype('float')) preds = model.predict(dtest) dfff = pd.concat([ test.loc[:, ["clientid", "updatetime", "label", "score"]], pd.DataFrame(preds, columns=["pred"]) ], axis=1) dfff dfff.to_csv("crawler_0708prediction.csv", index=False, header=False) ###测试模型文件线上线下是否一致## df_test = pd.DataFrame({ "allianceid": ["na"], "avginterval2minutes": [799.4], "avginterval5minutes": [813.13336], "clientid": ["09031172210287250176"], "clientip2minutes": [1],
model.compile(optimizer=optim, loss='categorical_crossentropy', metrics=['accuracy']) # model.fit(x_ktrain, y_ktrain, batch_size=100, epochs=100, verbose=1) # y_kpred = np.argmax(model.predict(x_ktest), axis=1) # score = balanced_accuracy_score(y_ktest, y_kpred) # print(score) ############### model training y_clean = keras.utils.to_categorical(y_clean, 3) model.fit(x_clean, y_clean, batch_size=50, epochs=100, verbose=1) y_pred_mat[:, k] = np.argmax(model.predict(x_test_selected), axis=1) y_pred = np.zeros(y_testid.shape[0]) for j in range(y_pred_mat.shape[0]): y_pred[j] = Counter(y_pred_mat[j]).most_common(1)[0][0] print('vote_mat:', y_pred_mat[:5]) print('vote_result:', y_pred[:5]) # # ################ write output file with open('output.csv', 'w') as f: f.write("{},{}\n".format("id", "y")) for i in range(len(y_testid)): f.write("{},{}\n".format(y_testid[i], y_pred[i]))