import numpy as np
from sklearn.ensemble import RandomForestClassifier


def random_forest(X_train, X_test, y_train, y_test):
    '''
    Fits the random forest determined by grid search to the training data and prints various scores.
    INPUT: The train/test split arrays.
    '''
    rf_model = RandomForestClassifier(bootstrap=False,
                                      class_weight=None,
                                      criterion='gini',
                                      max_depth=None,
                                      max_features=4,
                                      max_leaf_nodes=None,
                                      min_samples_leaf=9,
                                      min_samples_split=9,
                                      min_weight_fraction_leaf=0.0,
                                      n_estimators=20,
                                      n_jobs=1,
                                      oob_score=False,
                                      random_state=456,
                                      verbose=0,
                                      warm_start=False)

    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)

    print(score_models(y_test, y_pred))
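
# The docstring above refers to hyperparameters chosen by grid search. A minimal
# sketch of how that search might look with sklearn's GridSearchCV; the parameter
# grid here is an illustrative assumption, not the original search space.
from sklearn.model_selection import GridSearchCV


def tune_random_forest(X_train, y_train):
    '''
    Searches a small hyperparameter grid and returns the best-scoring random forest.
    '''
    param_grid = {'n_estimators': [10, 20, 50],
                  'max_features': [2, 4, 6],
                  'min_samples_leaf': [3, 9, 15],
                  'min_samples_split': [3, 9, 15]}
    grid = GridSearchCV(RandomForestClassifier(random_state=456),
                        param_grid, scoring='f1', cv=5)
    grid.fit(X_train, y_train)
    return grid.best_estimator_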

# Example #2

def baseline_classifier(y_test):
    '''
    Scores a "stupid" baseline that predicts a happy ending (1) at random, with probability equal to the observed proportion of happy-ending movies (0.535).
    '''
    y_pred_base_odds = []

    for _ in range(len(y_test)):
        # Draw 0 or 1, with P(1) equal to the happy-ending rate of 0.535
        num = np.random.choice([0, 1], p=[1 - 0.535, 0.535])
        y_pred_base_odds.append(num)

    print(score_models(y_test, y_pred_base_odds))
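
# The 0.535 happy-ending rate above is hard-coded. A small sketch (an assumption,
# not part of the original code) of how that proportion could be computed from the
# training labels instead:
def happy_ending_rate(y_train):
    '''
    Returns the proportion of happy-ending (label 1) movies in y_train.
    '''
    return float(np.sum(y_train)) / len(y_train)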

# Example #3

def majority_classifier(y_test):
    '''
    Scores a "stupid" model that just predicts the majority class (a happy ending, 1) every time.
    '''
    # Predict a happy ending (1) for every test example
    y_pred_always_happy = [1] * len(y_test)
    print(score_models(y_test, y_pred_always_happy))
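
# score_models is called throughout but not defined in this snippet. A minimal
# sketch of what such a helper might return, inferred from how it is used here
# (an assumption, not the original implementation):
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def score_models(y_test, y_pred):
    '''
    Returns common classification metrics for the given true and predicted labels.
    '''
    return {'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred)}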