def train_cv():
    """Tune a gradient-boosting classifier by randomized search and save it.

    Reads ``train_processed.csv`` (indexed by ``PassengerId``), takes the
    columns listed in the module-level ``feature_names`` as inputs and the
    ``Survived`` column as the target, then runs a 210-draw
    ``RandomizedSearchCV`` over loss, learning rate, number of estimators
    and tree depth. The best score and parameters are printed and the best
    estimator is written to ``gbdt-cv.pkl`` through
    ``common.dump_predictor``.
    """
    # ---------------------- load the data
    frame = pd.read_csv("train_processed.csv", index_col="PassengerId")
    features, target = frame[feature_names], frame["Survived"]

    # ---------------------- train
    # Lists/arrays are sampled from directly; the sp_randint objects are
    # sampled as distributions by the search.
    search_space = {
        "loss": ['deviance', 'exponential'],
        "learning_rate": np.logspace(-5, 1),
        "n_estimators": sp_randint(1000, 4800),
        "max_depth": sp_randint(1, 10),
    }

    booster = GradientBoostingClassifier(verbose=1)
    search = RandomizedSearchCV(
        estimator=booster,
        param_distributions=search_space,
        n_iter=210,
        verbose=1,
        n_jobs=-1,
    )

    # Single-argument parenthesised prints emit the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("--------------------- RandomizedSearchCV begins")
    search.fit(features, target)
    print("--------------------- RandomizedSearchCV ends")

    # "%s %s" reproduces the separator space a two-item print statement
    # writes between the label and the value.
    print("%s %s" % ("best score: ", search.best_score_))
    print("%s %s" % ("best parameters: ", search.best_params_))

    common.dump_predictor('gbdt-cv.pkl', search.best_estimator_)
    print("--------------------- GBDT saved into file")
def search_best_rf():
    """Search hyper-parameters for a vectorizer/TF-IDF/random-forest pipeline.

    Loads the raw training data from ``sentidata_train_raw.pkl``, runs a
    200-draw ``RandomizedSearchCV`` scored by ``roc_auc`` on a predefined
    split built from ``make_train_validate_split``, prints the best score
    and parameters, saves the best pipeline to ``pipeline_rf.pkl``, prints
    the forest's out-of-bag score, and finally prints classification
    reports for the training data and for the test data loaded from
    ``sentidata_test_raw.pkl``.
    """
    train_docs, train_labels = load_raw_data("sentidata_train_raw.pkl")
    print("training data loaded")
    print_label_frequency(train_labels)

    ############# create the pipeline
    stages = [
        ('vect', CountVectorizer(analyzer=do_nothing)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(oob_score=True, verbose=1)),
    ]
    pipeline = Pipeline(stages)

    ############# initialize the search
    # Keys use the "<step>__<param>" form so each value list is routed to
    # the matching pipeline step.
    search_space = {
        'vect__max_features': (2000, 3000, 4000),
        'rf__n_estimators': range(300, 1200, 100),
        'rf__criterion': ['gini', 'entropy'],
        'rf__max_depth': range(10, 100, 10),
        'rf__min_samples_split': range(10, 100, 10),
    }

    fold_assignment = make_train_validate_split(len(train_labels))
    validate_split = PredefinedSplit(test_fold=fold_assignment)

    scoring_method = "roc_auc"
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=search_space,
        n_iter=200,
        scoring=scoring_method,
        n_jobs=-1,
        verbose=1,
        cv=validate_split,
    )

    ############# search
    # Single-argument parenthesised prints emit the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("#################### search cv begins")
    search.fit(train_docs, train_labels)
    print("#################### search cv ends")

    print("best {}: {}".format(scoring_method, search.best_score_))
    # "%s %s" reproduces the separator space a two-item print statement
    # writes between the label and the value.
    print("%s %s" % ("best parameters: ", search.best_params_))

    ############# check the best model
    best_pipeline = search.best_estimator_
    common.dump_predictor("pipeline_rf.pkl", best_pipeline)

    # The forest is the last (name, estimator) entry of the pipeline.
    last_stage = best_pipeline.steps[-1]
    forest = last_stage[1]
    print("RF's OOB score: {}".format(forest.oob_score_))

    # Disabled diagnostic kept for reference: pair the first step's
    # get_feature_names() with forest.feature_importances_, sort by
    # descending importance, and print the pairs.

    ############# training error analysis
    train_predictions = best_pipeline.predict(train_docs)
    print_classification_report('Training Data', train_labels, train_predictions)

    ############# test error analysis
    test_docs, test_labels = load_raw_data("sentidata_test_raw.pkl")
    test_predictions = best_pipeline.predict(test_docs)
    print_classification_report('Testing Data', test_labels, test_predictions)
def train():
    """Tune a random forest by randomized search and save the best model.

    Reads ``train_processed.csv`` (indexed by ``PassengerId``), uses the
    module-level ``feature_names`` columns as inputs and ``Survived`` as
    the target, samples 200 parameter settings (number of trees, depth,
    split criterion), prints the best score and parameters, and dumps the
    best estimator to ``rf.pkl`` via ``common.dump_predictor``.
    """
    frame = pd.read_csv("train_processed.csv", index_col="PassengerId")
    features = frame[feature_names]
    target = frame["Survived"]

    # The same worker count is handed to both the forest and the search.
    worker_count = 4

    search_space = {
        "n_estimators": sp_randint(500, 3000),
        "max_depth": [2, 3, 4, 5, 6, None],
        "criterion": ["gini", "entropy"],
    }

    forest = RandomForestClassifier(
        oob_score=True,
        verbose=1,
        n_jobs=worker_count,
    )
    search = RandomizedSearchCV(
        estimator=forest,
        param_distributions=search_space,
        n_iter=200,
        n_jobs=worker_count,
    )

    # Single-argument parenthesised prints emit the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("#################### search cv begins")
    search.fit(features, target)
    print("#################### search cv ends")

    # "%s %s" reproduces the separator space a two-item print statement
    # writes between the label and the value.
    print("%s %s" % ("best score: ", search.best_score_))
    print("%s %s" % ("best parameters: ", search.best_params_))

    common.dump_predictor('rf.pkl', search.best_estimator_)
    print("*** RF saved into file")
def train_cv():
    """Grid-search the neighbour count of a KNN classifier and save the best.

    Reads ``train_processed.csv`` (indexed by ``PassengerId``), passes the
    module-level ``feature_names`` columns through ``skpreprocess.scale``,
    and runs ``GridSearchCV`` with ``cv=10`` over ``n_neighbors`` in
    2..10 (inclusive). The best score and parameters are printed and the
    best estimator is written to ``knn-cv.pkl`` through
    ``common.dump_predictor``.

    Only the fitted classifier is saved; the scaling applied here is not
    stored with it, so callers must scale prediction inputs themselves.
    """
    # --------------------- load train data
    train_df = pd.read_csv("train_processed.csv", index_col="PassengerId")
    ytrain = train_df["Survived"]
    # presumably sklearn.preprocessing.scale (standardisation) -- confirm
    # against the module's imports.
    Xtrain_scaled = skpreprocess.scale(train_df[feature_names])

    param_grid = {"n_neighbors": np.arange(2, 11)}
    knn = KNeighborsClassifier()
    searchcv = GridSearchCV(estimator=knn, param_grid=param_grid, n_jobs=4, cv=10)

    # Single-argument parenthesised prints emit the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("#################### search cv begins")
    searchcv.fit(Xtrain_scaled, ytrain)
    print("#################### search cv ends")

    # "%s %s" reproduces the separator space a two-item print statement
    # writes between the label and the value.
    print("%s %s" % ("best score: ", searchcv.best_score_))
    print("%s %s" % ("best parameters: ", searchcv.best_params_))

    common.dump_predictor('knn-cv.pkl', searchcv.best_estimator_)
    # The message previously said "RF" (copied from the random-forest
    # script); the model saved here is the KNN classifier.
    print("*** KNN saved into file")
def train_whole():
    """Refit a GBDT on the whole training set with previously tuned settings.

    Loads the estimator saved in ``gbdt-cv.pkl``, prints its score on the
    training data, builds a fresh ``GradientBoostingClassifier`` that
    copies its ``loss``, ``learning_rate``, ``n_estimators`` and
    ``max_depth``, fits it on all rows of ``train_processed.csv``, prints
    the resulting training score, and saves the model to ``gbdt.pkl``.
    """
    frame = pd.read_csv("train_processed.csv", index_col="PassengerId")
    features = frame[feature_names]
    target = frame["Survived"]

    # ------------------------------ load
    # Original author's note: this estimator is trained on a partial
    # dataset, without using the validation part.
    # NOTE(review): if the pickle holds the best_estimator_ of a search run
    # with refit enabled, it was already refit on the full data, and the
    # value printed below is a training-set score rather than a
    # cross-validation score -- confirm.
    tuned = common.load_predictor("gbdt-cv.pkl")
    tuned_score = tuned.score(features, target)
    # Single-argument parenthesised prints emit the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("cross-validation score: %f" % (tuned_score))

    # ------------------------------ train
    # After we get the parameters, train another estimator with all data.
    tuned_settings = dict(
        loss=tuned.loss,
        learning_rate=tuned.learning_rate,
        n_estimators=tuned.n_estimators,
        max_depth=tuned.max_depth,
    )
    gbdt = GradientBoostingClassifier(verbose=1, **tuned_settings)
    print(gbdt)

    gbdt.fit(features, target)
    # "%s %s" reproduces the separator space a two-item print statement
    # writes between the label and the value.
    print("%s %s" % ("training with all data, get score: ", gbdt.score(features, target)))

    # ------------------------------ save
    common.dump_predictor("gbdt.pkl", gbdt)