def _report_and_save(label, features, model, rmse_test, rmse_train, path):
    """Print column/feature-importance diagnostics and RMSE scores for one
    fitted tree model, then persist it to *path* with joblib.

    label: section name used in the printed headers (e.g. "Categorical").
    features: DataFrame the model was trained on; its columns drive the report.
    """
    # Indexed column list, then indexed importances (scaled up for readability).
    for idx, col in enumerate(features.columns):
        print(idx, col)
    for idx, imp in enumerate(model.feature_importances_):
        print(idx, imp * 1e5)
    print('-' * 10)
    print('\n')
    print(f"-- Scores for {label} --")
    print("RMSE on test: ", rmse_test)
    print("RMSE on train: ", rmse_train)
    print('\n\n')
    print(f'-- Getting list of columns for {label} model --\n')
    # Plain (unindexed) column and importance dumps.
    for col in features.columns:
        print(col)
    for imp in model.feature_importances_:
        print(imp)
    joblib.dump(model, path)


def main():
    """Train DecisionTreeRegressor models on categorical-only, numeric-only,
    and combined feature sets of the GStore revenue data; report RMSE and
    feature importances for each and save each fitted model to disk.

    Relies on project helpers (load_df, unique_valued_cols, drop_cols,
    preprocess, label_encoding, train_and_predict) defined elsewhere in
    this project.
    """
    ### Loading TRAIN and TEST data
    df_train = load_df("train_v2.csv")
    df_test = load_df('test_v2.csv')

    ### Outlier handling: log1p-transform transaction revenue, then clip
    ### values more than 3 standard deviations from the mean.
    # NOTE(review): the clipped value is 3*std_dev, not mean_val + 3*std_dev —
    # confirm this cap is intentional.
    df_train['logTransaction'] = df_train['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_train.logTransaction.std()
    mean_val = df_train.logTransaction.mean()
    df_train['logTransaction'] = np.where(np.abs(df_train.logTransaction - mean_val) > 3 * std_dev, 3 * std_dev, df_train.logTransaction)

    # Same transform/clip for the test set, using the test set's own statistics.
    df_test['logTransaction'] = df_test['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_test.logTransaction.std()
    mean_val = df_test.logTransaction.mean()
    df_test['logTransaction'] = np.where(np.abs(df_test.logTransaction - mean_val) > 3 * std_dev, 3 * std_dev, df_test.logTransaction)

    ### Extract labels
    y = df_train['logTransaction']
    # Ground-truth values from the test set.
    y_true = df_test['logTransaction']

    ### Remove unusable columns: single-valued columns that only hold the demo
    ### placeholder, plus the raw JSON columns.
    ones = unique_valued_cols(df_train)
    cols_to_remove = [x for x in ones if set(df_train[x].unique()) == set(['not available in demo dataset'])]
    # BUGFIX: was cols_to_remove.append(['hits', 'customDimensions']), which
    # nested a list inside the list, so drop_cols never saw these two names.
    cols_to_remove.extend(['hits', 'customDimensions'])
    # Drop them from both sets.
    df_train = drop_cols(df_train, list(cols_to_remove))
    df_test = drop_cols(df_test, list(cols_to_remove))

    # Remove revenue-derived columns (they would leak the target) and the
    # visitor identifier.
    transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction']
    df_train = drop_cols(df_train, transaction_cols)
    df_test = drop_cols(df_test, transaction_cols)

    # Column present only in the training CSV.
    df_train = df_train.drop('trafficSource.campaignCode', axis=1)

    ### Preprocess the data before we start training
    df_train = preprocess(df_train)
    df_test = preprocess(df_test)

    ### Split into categorical and numeric feature frames
    df_categorical = df_train.select_dtypes(include=['object'])
    df_categorical_test = df_test.select_dtypes(include=['object'])
    df_numeric = df_train.select_dtypes(include=['float64', 'int64'])
    df_numeric_test = df_test.select_dtypes(include=['float64', 'int64'])

    # Label-encode the categorical frames.
    df_categorical = label_encoding(df_categorical)
    df_categorical_test = label_encoding(df_categorical_test)

    ### Training and predictions: one model per feature set.
    reg_tree = tree.DecisionTreeRegressor()

    # Categorical-only model.
    model_cat, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_categorical, df_categorical_test, y, y_true)
    _report_and_save("Categorical", df_categorical, model_cat, RMSE_test, RMSE_train, "modl_DT_cat.joblib")

    # Numeric-only model (NaNs imputed with 0 first).
    df_numeric = df_numeric.fillna(0)
    df_numeric_test = df_numeric_test.fillna(0)
    model_num, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_numeric, df_numeric_test, y, y_true)
    _report_and_save("Numerical", df_numeric, model_num, RMSE_test, RMSE_train, "modl_DT_num.joblib")

    # Combined model on all features.
    df_train = pd.concat([df_numeric, df_categorical], axis=1)
    df_test = pd.concat([df_numeric_test, df_categorical_test], axis=1)
    model_full, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_train, df_test, y, y_true)
    _report_and_save("Full", df_train, model_full, RMSE_test, RMSE_train, "modl_DT_full.joblib")
df_test['logTransaction'] = np.where(np.abs(df_test.logTransaction-mean_val) > 3*std_dev,3*std_dev,df_test.logTransaction) ### Extract Labels # Get true values from test set y_true = df_test['logTransaction'] ### Removing unnecessary columns # colums that contain no data ones = unique_valued_cols(df_test) cols_to_remove = [x for x in ones if set(df_test[x].unique()) == set(['not available in demo dataset'])] cols_to_remove.extend(['hits', 'customDimensions', 'device.isMobile']) # Drop them df_test = drop_cols(df_test, list(cols_to_remove)) # Remove transaction related columns transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction'] df_test = drop_cols(df_test, transaction_cols) ### Preprocess the data before we start training # print(df_test.iloc[0]) df_test = df_test.fillna(0) df_test = label_encoding(df_test) df_test = preprocess(df_test) ### Get Predictions pred = mdl.predict(df_test) vals = []
def main():
    """Train LinearRegression models on categorical-only, numeric-only, and
    combined feature sets (10,000-row samples of the GStore data), print
    in-sample / out-of-sample RMSE and model parameters for each, and save
    each fitted model to its own file.

    Relies on project helpers (load_df, unique_valued_cols, drop_cols,
    preprocess, label_encoding, train_and_predict) defined elsewhere in
    this project.
    """
    ### Load 10,000-row samples of the TRAIN and TEST data
    df_train = load_df("train_v2.csv", 10000)
    df_test = load_df('test_v2.csv', 10000)

    ### Outlier handling: log1p-transform transaction revenue, then clip
    ### values more than 3 standard deviations from the mean.
    # NOTE(review): the clipped value is 3*std_dev, not mean_val + 3*std_dev —
    # confirm this cap is intentional.
    df_train['logTransaction'] = df_train['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_train.logTransaction.std()
    mean_val = df_train.logTransaction.mean()
    df_train['logTransaction'] = np.where(np.abs(df_train.logTransaction - mean_val) > 3 * std_dev, 3 * std_dev, df_train.logTransaction)

    # Same transform/clip for the test set, using the test set's own statistics.
    df_test['logTransaction'] = df_test['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_test.logTransaction.std()
    mean_val = df_test.logTransaction.mean()
    df_test['logTransaction'] = np.where(np.abs(df_test.logTransaction - mean_val) > 3 * std_dev, 3 * std_dev, df_test.logTransaction)

    ### Extract labels
    y = df_train['logTransaction']
    # Ground-truth values from the test set.
    y_true = df_test['logTransaction']

    ### Remove unusable columns: single-valued columns that only hold the demo
    ### placeholder, plus the raw JSON columns.
    ones = unique_valued_cols(df_train)
    cols_to_remove = [x for x in ones if set(df_train[x].unique()) == set(['not available in demo dataset'])]
    cols_to_remove.extend(['hits', 'customDimensions'])
    # Drop them from both the sets
    df_train = drop_cols(df_train, list(cols_to_remove))
    df_test = drop_cols(df_test, list(cols_to_remove))

    # Remove revenue-derived columns (they would leak the target) and the
    # visitor identifier.
    transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction']
    df_train = drop_cols(df_train, transaction_cols)
    df_test = drop_cols(df_test, transaction_cols)

    # NOTE(review): unlike the full-data script, the train-only
    # 'trafficSource.campaignCode' drop is deliberately disabled here —
    # presumably the 10k sample does not contain that column; confirm.

    ### Preprocess the data before we start training
    df_train = preprocess(df_train)
    df_test = preprocess(df_test)

    ### Split into categorical and numeric feature frames
    df_categorical = df_train.select_dtypes(include=['object'])
    df_categorical_test = df_test.select_dtypes(include=['object'])
    df_numeric = df_train.select_dtypes(include=['float64', 'int64'])
    df_numeric_test = df_test.select_dtypes(include=['float64', 'int64'])

    # Label encoding on categorical
    df_categorical = label_encoding(df_categorical)
    df_categorical_test = label_encoding(df_categorical_test)

    lm = LinearRegression()

    # Predict on categorical features only.
    categorize_model, RMSE_test, RMSE_train = train_and_predict(lm, df_categorical, df_categorical_test, y, y_true)
    print("In-sample RMSE categorical:", RMSE_train)
    print("Out-sample RMSE categorical:", RMSE_test)
    print("Model parameters: ", categorize_model.get_params())
    joblib.dump(categorize_model, "modl_LR_cat.joblib")
    print('-' * 10)

    # Predict on numerical features only (NaNs imputed with 0 first).
    df_numeric = df_numeric.fillna(0)
    df_numeric_test = df_numeric_test.fillna(0)
    num_model, RMSE_test, RMSE_train = train_and_predict(lm, df_numeric, df_numeric_test, y, y_true)
    print("In-sample RMSE numeric:", RMSE_train)
    print("Out-sample RMSE numeric:", RMSE_test)
    print("Model parameters: ", num_model.get_params())
    # BUGFIX: was "modl_LR_cat.joblib", which overwrote the categorical model.
    joblib.dump(num_model, "modl_LR_num.joblib")
    print('-' * 10)

    # Predict on all features combined.
    df_train = pd.concat([df_numeric, df_categorical], axis=1)
    df_test = pd.concat([df_numeric_test, df_categorical_test], axis=1)
    full_model, RMSE_test, RMSE_train = train_and_predict(lm, df_train, df_test, y, y_true)
    print("In-sample RMSE:", RMSE_train)
    print("Out-sample RMSE:", RMSE_test)
    print("Model parameters: ", full_model.get_params())
    # BUGFIX: was "modl_LR_cat.joblib", which overwrote both earlier models.
    joblib.dump(full_model, "modl_LR_full.joblib")
    print('-' * 10)
# create a new dummy column logTransaction which is the log of all totalTransactionRevenue df_train['logTransaction']= df_train['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x)) std_dev = df_train.logTransaction.std() mean_val = df_train.logTransaction.mean() df_train['logTransaction'] = np.where(np.abs(df_train.logTransaction-mean_val) > 3*std_dev,3*std_dev,df_train.logTransaction) y = df_train['logTransaction'] # Remove colums that contain no data ones = unique_valued_cols(df_train) cols_to_remove = [x for x in ones if set(df_train[x].unique()) == set(['not available in demo dataset'])] df_train = df_train.drop(cols_to_remove, axis=1) # Remove transaction related columns transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction'] df_train = drop_cols(df_train, transaction_cols) # Remove extra column in training df_train = df_train.drop('trafficSource.campaignCode', axis=1) ### Preprocess the data before we start training df_train = preprocess(df_train) # Get the categorical variables df_categorical = df_train.select_dtypes(include=['object']) # add logTransaction (dependent variable) column df_categorical['logTransaction'] = y # delete train set as we don't need it anymore del df_train