Example 1
import numpy as np
import pandas as pd
import joblib
from sklearn import tree

# Helper functions used below (load_df, unique_valued_cols, drop_cols, preprocess,
# label_encoding, train_and_predict) are assumed to be defined elsewhere in the project.
def main():
    ### Loading TRAIN Data
    df_train = load_df("train_v2.csv")

    ### Loading TEST Data
    df_test = load_df('test_v2.csv')

    ### Cap outliers: log transaction values more than 3 std dev away from the mean are clipped to 3*std_dev
    
    # create a derived column logTransaction: log1p of totals.totalTransactionRevenue
    df_train['logTransaction'] = np.log1p(df_train['totals.totalTransactionRevenue'].fillna(0).astype(float))
    std_dev = df_train.logTransaction.std()
    mean_val = df_train.logTransaction.mean()
    df_train['logTransaction'] = np.where(np.abs(df_train.logTransaction - mean_val) > 3 * std_dev, 3 * std_dev, df_train.logTransaction)

    # same derived column on the test set (statistics recomputed from the test data)
    df_test['logTransaction'] = np.log1p(df_test['totals.totalTransactionRevenue'].fillna(0).astype(float))
    std_dev = df_test.logTransaction.std()
    mean_val = df_test.logTransaction.mean()
    df_test['logTransaction'] = np.where(np.abs(df_test.logTransaction - mean_val) > 3 * std_dev, 3 * std_dev, df_test.logTransaction)

    ### Extract Labels 
    y = df_train['logTransaction']
    # Get true values from test set
    y_true = df_test['logTransaction']

    ### Removing unnecessary columns 

    # columns that contain no real data (only the 'not available in demo dataset' placeholder)
    ones = unique_valued_cols(df_train)
    cols_to_remove = [x for x in ones if set(df_train[x].unique()) == set(['not available in demo dataset'])]
    cols_to_remove.extend(['hits', 'customDimensions'])

    # Drop them
    df_train = drop_cols(df_train, list(cols_to_remove))
    df_test = drop_cols(df_test, list(cols_to_remove))

    # Remove transaction related columns
    transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction']
    df_train = drop_cols(df_train, transaction_cols)
    df_test = drop_cols(df_test, transaction_cols)

    # Remove extra column in training
    df_train = df_train.drop('trafficSource.campaignCode', axis=1)

    ### Preprocess the data before we start training
    df_train = preprocess(df_train)
    df_test = preprocess(df_test)

    ### Create categorical and numeric features dataframe
    df_categorical = df_train.select_dtypes(include=['object'])
    df_categorical_test = df_test.select_dtypes(include=['object'])

    # Numeric
    df_numeric = df_train.select_dtypes(include=['float64', 'int64'])
    df_numeric_test = df_test.select_dtypes(include=['float64', 'int64'])

    # Label encoding
    df_categorical = label_encoding(df_categorical)
    df_categorical_test = label_encoding(df_categorical_test)

    ### Training and Predictions

    ################### Categorical ###############
    reg_tree = tree.DecisionTreeRegressor()
    model_cat, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_categorical, df_categorical_test, y, y_true)

    for idx, each in enumerate(df_categorical.columns):
        print(idx, each)

    for idx, each in enumerate(model_cat.feature_importances_):
        print(idx, each*1e5)

    print('-'*10)
    print('\n')
    print("-- Scores for Categorical --")
    print("RMSE on test: ", RMSE_test)
    print("RMSE on train: ", RMSE_train)
    print('\n\n')

    print('-- Getting list of columns for categorical model --\n')
    for each in df_categorical.columns:
        print(each)
    for imp in model_cat.feature_importances_:
        print(imp)

    # Save Categorical model
    joblib.dump(model_cat, "modl_DT_cat.joblib")

    ###################### Numerical #####################
    df_numeric = df_numeric.fillna(0)
    df_numeric_test = df_numeric_test.fillna(0)

    model_num, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_numeric, df_numeric_test, y, y_true)

    for idx, each in enumerate(df_numeric.columns):
        print(idx, each)

    for idx, each in enumerate(model_num.feature_importances_):
        print(idx, each*1e5)

    print('-'*10)
    print('\n')
    print("-- Scores for Numerical --")
    print("RMSE on test: ", RMSE_test)
    print("RMSE on train: ", RMSE_train)
    print('\n\n')

    print('-- Getting list of columns for Numerical Model --\n')
    for each in df_numeric.columns:
        print(each)
    for imp in model_num.feature_importances_:
        print(imp)

    # Save Numerical model
    joblib.dump(model_num, "modl_DT_num.joblib")

    ###################### Full #####################
    df_train = pd.concat([df_numeric,df_categorical],axis=1)
    df_test = pd.concat([df_numeric_test,df_categorical_test],axis=1)

    model_full, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_train, df_test, y, y_true)

    for idx, each in enumerate(df_train.columns):
        print(idx, each)

    for idx, each in enumerate(model_full.feature_importances_):
        print(idx, each*1e5)

    print('-'*10)
    print('\n')
    print("-- Scores for Full --")
    print("RMSE on test: ", RMSE_test)
    print("RMSE on train: ", RMSE_train)
    print('\n\n')

    print('-- Getting list of columns for Full model --\n')
    for each in df_train.columns:
        print(each)
    for imp in model_full.feature_importances_:
        print(imp)

    # Save full model
    joblib.dump(model_full, "modl_DT_full.joblib")
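
The example above relies on project helpers that are not shown here (load_df, unique_valued_cols, drop_cols, preprocess, label_encoding, train_and_predict). As a rough point of reference only, a minimal sketch of the two column utilities could look like the following; the actual implementations in the project may differ.

import pandas as pd

def unique_valued_cols(df: pd.DataFrame):
    # Hypothetical helper: columns that hold a single distinct value.
    return [col for col in df.columns if df[col].nunique(dropna=False) <= 1]

def drop_cols(df: pd.DataFrame, cols):
    # Hypothetical helper: drop the given columns, ignoring any that are missing.
    return df.drop(columns=[c for c in cols if c in df.columns])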
Example 2
df_test['logTransaction'] = np.where(np.abs(df_test.logTransaction - mean_val) > 3 * std_dev, 3 * std_dev, df_test.logTransaction)

### Extract Labels 

# Get true values from test set
y_true = df_test['logTransaction']

### Removing unnecessary columns 

# columns that contain no real data (only the 'not available in demo dataset' placeholder)
ones = unique_valued_cols(df_test)
cols_to_remove = [x for x in ones if set(df_test[x].unique()) == set(['not available in demo dataset'])]
cols_to_remove.extend(['hits', 'customDimensions', 'device.isMobile'])

# Drop them
df_test = drop_cols(df_test, list(cols_to_remove))

# Remove transaction related columns
transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction']
df_test = drop_cols(df_test, transaction_cols)

### Preprocess the data before we start training
# print(df_test.iloc[0])
df_test = df_test.fillna(0)
df_test = label_encoding(df_test)
df_test = preprocess(df_test)

### Get Predictions
pred = mdl.predict(df_test)

vals = []
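
Example 2 stops right after mdl.predict. Since logTransaction was built with np.log1p, a natural continuation (not shown in the original snippet, so this is only a hedged sketch with illustrative names) is to map the predictions back to the revenue scale with np.expm1, the inverse of np.log1p, and floor them at zero:

import numpy as np

# Hypothetical continuation: undo the log1p transform and clip negative predictions.
pred_revenue = np.expm1(np.clip(pred, 0, None))
vals = pred_revenue.tolist()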
Example 3
import numpy as np
import pandas as pd
import joblib
from sklearn.linear_model import LinearRegression

# Helper functions used below (load_df, unique_valued_cols, drop_cols, preprocess,
# label_encoding, train_and_predict) are assumed to be defined elsewhere in the project.
def main():
    ### Loading TRAIN Data
    df_train = load_df("train_v2.csv",10000)

    ### Loading TEST Data
    df_test = load_df('test_v2.csv', 10000)

    ### Cap outliers: log transaction values more than 3 std dev away from the mean are clipped to 3*std_dev
    
    # create a derived column logTransaction: log1p of totals.totalTransactionRevenue
    df_train['logTransaction'] = np.log1p(df_train['totals.totalTransactionRevenue'].fillna(0).astype(float))
    std_dev = df_train.logTransaction.std()
    mean_val = df_train.logTransaction.mean()
    df_train['logTransaction'] = np.where(np.abs(df_train.logTransaction - mean_val) > 3 * std_dev, 3 * std_dev, df_train.logTransaction)

    # same derived column on the test set (statistics recomputed from the test data)
    df_test['logTransaction'] = np.log1p(df_test['totals.totalTransactionRevenue'].fillna(0).astype(float))
    std_dev = df_test.logTransaction.std()
    mean_val = df_test.logTransaction.mean()
    df_test['logTransaction'] = np.where(np.abs(df_test.logTransaction - mean_val) > 3 * std_dev, 3 * std_dev, df_test.logTransaction)

    ### Extract Labels 
    y = df_train['logTransaction']
    # Get true values from test set
    y_true = df_test['logTransaction']

    ### Removing unnecessary columns 

    # columns that contain no real data (only the 'not available in demo dataset' placeholder)
    ones = unique_valued_cols(df_train)
    cols_to_remove = [x for x in ones if set(df_train[x].unique()) == set(['not available in demo dataset'])]
    cols_to_remove.extend(['hits', 'customDimensions'])


    # Drop them from both the sets
    df_train = drop_cols(df_train, list(cols_to_remove))
    df_test = drop_cols(df_test, list(cols_to_remove))

    # Remove transaction related columns
    transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction']
    df_train = drop_cols(df_train, transaction_cols)
    df_test = drop_cols(df_test, transaction_cols)

    # Remove extra column in training
    #df_train = df_train.drop('trafficSource.campaignCode', axis=1)

    ### Preprocess the data before we start training
    df_train = preprocess(df_train)
    df_test = preprocess(df_test)

    ### Create categorical and numeric features dataframe
    df_categorical = df_train.select_dtypes(include=['object'])
    df_categorical_test = df_test.select_dtypes(include=['object'])

    # Numeric
    df_numeric = df_train.select_dtypes(include=['float64', 'int64'])
    df_numeric_test = df_test.select_dtypes(include=['float64', 'int64'])

    # Label encoding on categorical
    df_categorical = label_encoding(df_categorical)
    df_categorical_test = label_encoding(df_categorical_test)

    # Predict on categorical
    lm = LinearRegression()
    categorize_model, RMSE_test, RMSE_train = train_and_predict(lm, df_categorical, df_categorical_test, y, y_true)

    print("In-sample RMSE categorical:", RMSE_train)
    print("Out-sample RMSE categorical:", RMSE_test)
    print("Model parameters: ", categorize_model.get_params())

    joblib.dump(categorize_model, "modl_LR_cat.joblib")

    print('-'*10)


    # Predict on numerical
    df_numeric = df_numeric.fillna(0)
    df_numeric_test = df_numeric_test.fillna(0)

    num_model, RMSE_test, RMSE_train = train_and_predict(lm, df_numeric, df_numeric_test, y, y_true)

    print("In-sample RMSE numeric:", RMSE_train)
    print("Out-sample RMSE numeric:", RMSE_test)
    print("Model parameters: ", num_model.get_params())

    joblib.dump(num_model, "modl_LR_num.joblib")

    print('-'*10)

    # Predict on all features
    df_train = pd.concat([df_numeric,df_categorical],axis=1)
    df_test = pd.concat([df_numeric_test,df_categorical_test],axis=1)
    full_model, RMSE_test, RMSE_train = train_and_predict(lm, df_train, df_test, y, y_true)

    print("In-sample RMSE:", RMSE_train)
    print("Out-sample RMSE:", RMSE_test)
    print("Model parameters: ", full_model.get_params())

    joblib.dump(full_model, "modl_LR_full.joblib")

    print('-'*10)
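
Examples 1 and 3 both call train_and_predict(model, X_train, X_test, y, y_true) and unpack the fitted model together with the test and train RMSE. The helper itself is not included in these snippets; a minimal sketch consistent with that calling convention (an assumption, not the project's actual code) is:

import numpy as np
from sklearn.metrics import mean_squared_error

def train_and_predict(model, X_train, X_test, y_train, y_test):
    # Hypothetical helper: fit the model and report RMSE on both splits.
    model.fit(X_train, y_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    rmse_test = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    return model, rmse_test, rmse_train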
Example 4
# create a derived column logTransaction: log1p of totals.totalTransactionRevenue
df_train['logTransaction'] = np.log1p(df_train['totals.totalTransactionRevenue'].fillna(0).astype(float))
std_dev = df_train.logTransaction.std()
mean_val = df_train.logTransaction.mean()
df_train['logTransaction'] = np.where(np.abs(df_train.logTransaction - mean_val) > 3 * std_dev, 3 * std_dev, df_train.logTransaction)
y = df_train['logTransaction']

# Remove columns that contain no real data (only the 'not available in demo dataset' placeholder)
ones = unique_valued_cols(df_train)
cols_to_remove = [x for x in ones if set(df_train[x].unique()) == set(['not available in demo dataset'])]
df_train = df_train.drop(cols_to_remove, axis=1)

# Remove transaction related columns
transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction']
df_train = drop_cols(df_train, transaction_cols)

# Remove extra column in training
df_train = df_train.drop('trafficSource.campaignCode', axis=1)

### Preprocess the data before we start training
df_train = preprocess(df_train)

# Get the categorical variables (take a copy so a column can be added safely below)
df_categorical = df_train.select_dtypes(include=['object']).copy()

# add logTransaction (dependent variable) column
df_categorical['logTransaction'] = y

# delete train set as we don't need it anymore
del df_train
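
Example 4 ends after assembling the categorical frame with the target attached. In Examples 1-3 such a frame is passed through label_encoding, whose definition is also not shown; a minimal sketch, assuming it simply integer-encodes every object column, could be:

from sklearn.preprocessing import LabelEncoder

def label_encoding(df):
    # Hypothetical helper: integer-encode every object (string) column.
    df = df.copy()
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    return df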