import numpy as np
import pandas as pd
from scipy.stats import randint as sp_randint
from sklearn import preprocessing as skpreprocess
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, PredefinedSplit, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

import common  # project-local persistence helpers (dump_predictor / load_predictor)

# feature_names, load_raw_data, print_label_frequency and print_classification_report
# are defined elsewhere in the project and assumed to be in scope here.


def train_cv():
    # ---------------------- load the data
    train_df = pd.read_csv("train_processed.csv", index_col="PassengerId")
    Xtrain = train_df[feature_names]
    ytrain = train_df["Survived"]

    # ---------------------- train
    # candidate loss functions; note 'deviance' was renamed 'log_loss' in newer scikit-learn
    loss = ['deviance', 'exponential']
    learning_rate = np.logspace(-5, 1)          # 50 values spaced logarithmically from 1e-5 to 10
    n_estimate_dist = sp_randint(1000, 4800)    # uniform integer distribution over the tree count
    max_depth_dist = sp_randint(1, 10)
    param_dist = dict(loss=loss,
                      learning_rate=learning_rate,
                      n_estimators=n_estimate_dist,
                      max_depth=max_depth_dist)

    gbdt = GradientBoostingClassifier(verbose=1)
    searchcv = RandomizedSearchCV(estimator=gbdt, param_distributions=param_dist,
                                  n_iter=210, verbose=1, n_jobs=-1)

    print "--------------------- RandomizedSearchCV begins"
    searchcv.fit(Xtrain,ytrain)      
    print "--------------------- RandomizedSearchCV ends"
    print "best score: ",searchcv.best_score_                                  
    print "best parameters: ",searchcv.best_params_

    common.dump_predictor('gbdt-cv.pkl',searchcv.best_estimator_)
    print "--------------------- GBDT saved into file"
def search_best_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print("training data loaded")
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=do_nothing)),  # do_nothing presumably passes pre-tokenized input through unchanged
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(oob_score=True, verbose=1)),
    ])

    ############# initialize the search
    parameters = {
        'vect__max_features': (2000,3000,4000),
        'rf__n_estimators': range(300,1200,100),
        'rf__criterion':['gini','entropy'],
        'rf__max_depth': range(10,100,10),
        'rf__min_samples_split': range(10,100,10),
    }
    # PredefinedSplit: entries of -1 in test_fold always stay in training; other values
    # index the validation fold (a sketch of this helper appears after this function)
    validate_split = PredefinedSplit(test_fold=make_train_validate_split(len(ytrain_raw)))

    scoring_method = "roc_auc"
    searchcv = RandomizedSearchCV(estimator=pipeline,
                                  param_distributions=parameters,
                                  n_iter=200,
                                  scoring=scoring_method,
                                  n_jobs=-1,
                                  verbose=1,
                                  cv=validate_split)

    ############# search
    print "#################### search cv begins"
    searchcv.fit(Xtrain_raw, ytrain_raw)
    print "#################### search cv ends"
    print "best {}: {}".format(scoring_method, searchcv.best_score_)
    print "best parameters: ", searchcv.best_params_

    ############# check the best model
    bestpipeline = searchcv.best_estimator_
    common.dump_predictor("pipeline_rf.pkl",bestpipeline)

    rf = bestpipeline.steps[-1][1]
    print("RF's OOB score: {}".format(rf.oob_score_))

    # words = bestpipeline.steps[0][1].get_feature_names_out()
    # feat_importances = sorted(zip(words, rf.feature_importances_), key=lambda t: -t[1])
    # print(feat_importances)

    ############# training error analysis
    ytrain_predict = bestpipeline.predict(Xtrain_raw)
    print_classification_report('Training Data', ytrain_raw, ytrain_predict)

    ############# test error analysis
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = bestpipeline.predict(Xtest_raw)
    print_classification_report('Testing Data', ytest_raw, ytest_predict)
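
# Two helpers used above are defined elsewhere in the project. A minimal sketch
# under stated assumptions: `do_nothing` is an identity analyzer (the pickled
# documents are presumably already tokenized), and `make_train_validate_split`
# builds the test_fold array that PredefinedSplit expects. The 80/20 ratio and
# the tail-end validation block are assumptions, not the original code.
def do_nothing(tokens):
    return tokens

def make_train_validate_split(n_samples, validate_fraction=0.2):
    # assumes rows are already shuffled; otherwise shuffle before splitting
    test_fold = np.full(n_samples, -1, dtype=int)   # -1: row always stays in the training set
    n_validate = int(n_samples * validate_fraction)
    test_fold[-n_validate:] = 0                     # 0: row belongs to the single validation fold
    return test_fold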

# ---------------------- Example 3
def train():
    train_df = pd.read_csv("train_processed.csv",index_col="PassengerId")
    Xtrain = train_df[feature_names]
    ytrain = train_df["Survived"]

    param_dist = {"n_estimators":  sp_randint(500,3000),                
                  "max_depth": [2,3, 4,5,6,None],              
                  "criterion": ["gini", "entropy"]}
    njobs = 4
    rf = RandomForestClassifier(oob_score=True,verbose=1,n_jobs=njobs)
    searchcv = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,n_iter=200,n_jobs=njobs)

    print "#################### search cv begins"
    searchcv.fit(Xtrain,ytrain)    
    print "#################### search cv ends"
    print "best score: ",searchcv.best_score_                                  
    print "best parameters: ",searchcv.best_params_

    common.dump_predictor('rf.pkl',searchcv.best_estimator_)
    print "*** RF saved into file"

# ---------------------- Example 4
def train_cv():
    # --------------------- load train data
    train_df = pd.read_csv("train_processed.csv", index_col="PassengerId")
    ytrain = train_df["Survived"]
    # standardize the features: KNN is distance-based, so unscaled features would dominate
    Xtrain_scaled = skpreprocess.scale(train_df[feature_names])

    param_dist = {"n_neighbors": np.arange(2, 11)}
    knn = KNeighborsClassifier()
    searchcv = GridSearchCV(estimator=knn,
                            param_grid=param_dist,
                            n_jobs=4,
                            cv=10)

    print "#################### search cv begins"
    searchcv.fit(Xtrain_scaled, ytrain)
    print "#################### search cv ends"
    print "best score: ", searchcv.best_score_
    print "best parameters: ", searchcv.best_params_

    common.dump_predictor('knn-cv.pkl', searchcv.best_estimator_)
    print "*** RF saved into file"
def train_whole():
    train_df = pd.read_csv("train_processed.csv",index_col="PassengerId")
    Xtrain = train_df[feature_names]
    ytrain = train_df["Survived"]

    # ------------------------------ load
    # this estimator was trained on only part of the dataset, holding out the validation portion
    prev_estimator = common.load_predictor("gbdt-cv.pkl")
    print("CV-selected model's score on the full training set: %f" % prev_estimator.score(Xtrain, ytrain))

    # ------------------------------ train
    # once we have the best parameters, retrain a fresh estimator on ALL the data
    gbdt = GradientBoostingClassifier(verbose=1,
                                      loss=prev_estimator.loss,
                                      learning_rate=prev_estimator.learning_rate,
                                      n_estimators=prev_estimator.n_estimators,
                                      max_depth=prev_estimator.max_depth)
    print(gbdt)
    gbdt.fit(Xtrain, ytrain)
    print("training with all data, score: ", gbdt.score(Xtrain, ytrain))

    # ------------------------------ save
    common.dump_predictor("gbdt.pkl",gbdt)