コード例 #1
0
                               loss_function='Logloss',
                               verbose=True)
    # train the model
    model.fit(train_data, train_labels)
    # make the prediction using the resulting model
    preds_class = model.predict(test_data)
    preds_proba = model.predict_proba(test_data)
    print("class = ", preds_class)
    print("proba = ", preds_proba)
    return preds_class, preds_proba


if __name__ == "__main__":

    # Absolute, user-specific locations of the two churn CSV files.
    train_csv = '/Users/jacobtryba/DSI/assignments/supervised-learning-case-study/data/churn_train.csv'
    test_csv = '/Users/jacobtryba/DSI/assignments/supervised-learning-case-study/data/churn_test.csv'

    # Each file is read, passed through clean(), and then has its
    # 'months_as_user' column dropped.
    raw_train = pd.read_csv(train_csv)
    df_train = clean(raw_train).drop('months_as_user', axis=1)
    raw_test = pd.read_csv(test_csv)
    df_test = clean(raw_test).drop('months_as_user', axis=1)

    # X_y() is unpacked here into train/test features and labels.
    X_train, X_test, y_train, y_test = X_y(df_train)

    # CatBoost configuration: 2 iterations, depth 2, learning rate 1,
    # Logloss objective, verbose training output.
    catboost_settings = {
        'iterations': 2,
        'depth': 2,
        'learning_rate': 1,
        'loss_function': 'Logloss',
        'verbose': True,
    }
    model = CatBoostClassifier(**catboost_settings)

    # Fit the classifier on the training partition.
    model.fit(X_train, y_train)
コード例 #2
0
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
plt.style.use('fivethirtyeight')

if __name__ == "__main__":

    ### Get Data ###

    # Both CSVs are passed through clean(); 'avg_rating_by_driver' and
    # 'city' are then dropped from each frame.
    df_train = clean(pd.read_csv('../data/churn_train.csv')).drop(
        ['avg_rating_by_driver', "city"], axis=1)
    df_test = clean(pd.read_csv('../data/churn_test.csv')).drop(
        ['avg_rating_by_driver', "city"], axis=1)

    X, y = X_y(df_train)

    # Split BEFORE scaling. Fitting StandardScaler on the full matrix lets
    # the held-out rows influence the mean/variance used to scale the
    # training rows (data leakage), which inflates validation scores for
    # scale-sensitive models. random_state=1 keeps the same row assignment
    # as before.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # Learn the scaling statistics from the training rows only, then apply
    # the same transformation to both partitions.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Keep X bound to a scaled matrix, as it was previously, for any later
    # code that uses the full feature matrix.
    X = scaler.transform(X)

    # Feature names listed by hand.
    # NOTE(review): presumably mirrors the column order of X — confirm
    # against X_y().
    X_cols = [
        'avg_dist', 'avg_rating_of_driver', 'avg_surge', 'phone', 'surge_pct',
        'trips_in_first_30_days', 'luxury_car_user', 'weekday_pct'
    ]

    ### Random Forest ###

    rf = RandomForestClassifier(n_estimators=200, max_depth=8)
    rf.fit(X_train, y_train)
コード例 #3
0
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
plt.style.use('fivethirtyeight')

if __name__ == "__main__":

    ### Get Data ###

    # Both CSVs are read and passed through clean().
    df_train = clean(pd.read_csv('../data/churn_train.csv'))
    df_test = clean(pd.read_csv('../data/churn_test.csv'))

    X, y = X_y(df_train)

    # Split BEFORE scaling. Fitting StandardScaler on the full matrix lets
    # the held-out rows influence the mean/variance used to scale the
    # training rows (data leakage), which inflates validation scores for
    # scale-sensitive models. random_state=1 keeps the same row assignment
    # as before.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # Learn the scaling statistics from the training rows only, then apply
    # the same transformation to both partitions.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Keep X bound to a scaled matrix, as it was previously, for any later
    # code that uses the full feature matrix.
    X = scaler.transform(X)

    # Feature names listed by hand.
    # NOTE(review): presumably mirrors the column order of X — confirm
    # against X_y().
    X_cols = [
        'avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver',
        'avg_surge', 'phone', 'surge_pct', 'trips_in_first_30_days',
        'luxury_car_user', 'weekday_pct', 'is_Astapor', "is_King's Landing"
    ]

    ### Random Forest ###

    rf = RandomForestClassifier(n_estimators=200, max_depth=8)
    rf.fit(X_train, y_train)
コード例 #4
0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import seaborn as sns
sys.path.append(
    '/home/mike/dsi/case_studies/supervised-learning-case-study/src/')
from clean_df import clean, X_y
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

if __name__ == '__main__':

    # LogisticRegression is instantiated below, but the file's import block
    # only brings in LogisticRegressionCV. Import it here so the script does
    # not stop with a NameError.
    # NOTE(review): if the cross-validated estimator was the intended one,
    # switch the constructor to LogisticRegressionCV instead — confirm.
    from sklearn.linear_model import LogisticRegression

    # Both CSVs are passed through clean(); 'months_as_user' is then dropped
    # from each frame.
    df_train = clean(pd.read_csv('../data/churn_train.csv')).drop(
        'months_as_user', axis=1)
    df_test = clean(pd.read_csv('../data/churn_test.csv')).drop(
        'months_as_user', axis=1)

    # X_y() is unpacked here into train/test features and labels.
    X_train, X_test, y_train, y_test = X_y(df_train)

    # class_weight='balanced' reweights classes inversely to their frequency.
    log_model = LogisticRegression(class_weight='balanced')
    log_model.fit(X_train, y_train)

    # Class probabilities and hard class predictions for the held-out rows.
    y_pred_p = log_model.predict_proba(X_test)
    y_pred = log_model.predict(X_test)

    # Mean accuracy on each partition (the classifier's default score()).
    score_train = log_model.score(X_train, y_train)
    score_test = log_model.score(X_test, y_test)

    # ## Validation