def run_final_models(df, continuous_vars, categorical_vars, outcome_var, train_indicator=""):
    """Ensemble three model families and persist the averaged predictions.

    Fits/predicts with the continuous-categorical models, the
    binary-categorical models, and the SVM, averages the three prediction
    vectors with equal weight, writes a submission CSV for the rows not
    flagged by ``train_indicator``, encodes it, and returns the averaged
    predictions.

    NOTE(review): relies on a module-level ``derived_data_path`` — confirm
    it is defined wherever this function is called.
    """
    # One prediction vector per model family; equal-weight average.
    model_outputs = [
        run_continuous_categorical_models(df, continuous_vars, categorical_vars, outcome_var, train_indicator),
        run_binary_categorical_models(df, continuous_vars, categorical_vars, outcome_var, train_indicator),
        run_svm_prediction(df, continuous_vars, categorical_vars, outcome_var, train_indicator),
    ]
    predictions = np.mean(model_outputs, axis=0)

    # Submission rows are the observations NOT marked as training data.
    submission = pd.DataFrame({
        'bidder_id': df.index[df[train_indicator] == False],
        'prediction': predictions,
    })
    submission.to_csv(derived_data_path + 'predictions.csv', index=False)
    encode_predictions(derived_data_path, 'SoVeryClassy')
    return predictions
def facebook_modelling(derived_data_path):
    """Build the bidder-level modelling frame and run the bot-detection models.

    Loads the raw train/test tables and the derived bidder-level features
    from HDF5 stores under ``derived_data_path``, joins them, engineers a
    few extra features, then fits random-forest models and writes encoded
    predictions.

    Parameters
    ----------
    derived_data_path : str
        Directory prefix containing ``data.h5`` and ``derived_data.h5``.
    """
    # Load raw train/test and index both by bidder_id so they can be joined.
    store = pd.HDFStore(derived_data_path + "data.h5")
    train = store['train']
    test = store['test']
    train.set_index("bidder_id", inplace=True)
    test.set_index("bidder_id", inplace=True)
    store.close()

    # Merge everything. Only keeping inner (will need to later remerge on
    # original files).
    store = pd.HDFStore(derived_data_path + "derived_data.h5")
    bidder = store['bidder_level'].join(pd.concat([train, test]), how="inner")
    store.close()

    # Indicator for whether a row came from the train or test dataset:
    # only training rows have a non-null outcome.
    bidder['train'] = bidder['outcome'].notnull()
    bidder['test'] = ~bidder['train']

    # Label-encode the string-valued categoricals in place.
    for var in ['most_freq_country', 'most_freq_merchandise']:
        encoder = LabelEncoder().fit(bidder[var])
        bidder[var] = encoder.transform(bidder[var])

    # Derived rate features; the +1 guards against division by zero.
    bidder['bids_per_auction'] = bidder['num_bids'] / (bidder['num_auctions'] + 1)
    bidder['bids_per_time'] = bidder['num_bids'] / (bidder['length_activity'] + 1)

    # Collapse rare categories (< 1% of rows) into a single -1 bucket so the
    # dummies don't explode on tiny groups.  (Was a per-row Python lambda;
    # Series.isin is the vectorized equivalent.)
    threshold = len(bidder) / 100
    for var in ['most_freq_device', 'most_freq_merchandise']:
        counts = bidder[var].value_counts()
        small_val = counts[counts < threshold].index
        bidder.loc[bidder[var].isin(small_val), var] = -1

    # Missing change-time stats mean the bidder had < 2 bids; -1 is the
    # "not applicable" sentinel.
    bidder.std_change_time.fillna(-1, inplace=True)
    bidder.mean_change_time.fillna(-1, inplace=True)

    # Model for each time period?
    # // 10 for 1st-3rd ip
    # in period 1 / in period 2 / in period 3
    # off-peak or peak times
    # change time
    # Dummy for more than one for all of the max
    continuous_vars = ['num_auctions', 'num_bids', 'num_url', 'num_country',
                       'num_device', 'num_merchandise', 'num_ip',
                       'num_first_bid', 'pct_first_bid', 'num_winner',
                       'pct_winner', 'num_outbid_self',
                       'num_outbid_self_diff_time', 'first_bid', 'last_bid',
                       'length_activity', 'num_periods', 'max_simul_actions',
                       'max_simul_auction', 'max_simul_country',
                       'max_simul_device', 'max_simul_ip', 'bids_per_auction',
                       'bids_per_time', 'most_common_auction_type',
                       'periods_seen', 'mean_change_time', 'std_change_time',
                       'std_pct_auction_bids', 'mean_pct_auction_bids',
                       'mean_num_bidders_auction', 'mean_num_bids_auction',
                       'mean_bids_timestamp', 'std_bids_timestamp']
    categorical_vars = ['most_freq_device', 'primary_period',
                        'most_freq_merchandise', 'most_freq_country']
    predictor_vars = continuous_vars + categorical_vars
    outcome_var = 'outcome'

    # Boolean indexing returns a copy, so the pop() below does not touch
    # ``bidder`` itself.
    df = bidder[bidder.train == True]
    # NOTE(review): the returned splits and the dummies below are never used
    # afterwards — presumably left over from exploration; confirm before
    # deleting in case test_train_data has needed side effects.
    X_train, y_train, X_test, y_test = test_train_data(
        df, continuous_vars, categorical_vars, outcome_var,
        categorical='binary', seed=56, test_pct=0)
    test = [pd.get_dummies(df.pop(var)) for var in categorical_vars]

    run_random_forest_model(bidder[bidder.train == True], continuous_vars,
                            categorical_vars, outcome_var)
    # NOTE(review): this call's argument list disagrees with the one above —
    # predictor_vars collapses the continuous/categorical split and 999 lands
    # in the outcome_var position. Looks like a bug; confirm the intended
    # signature of run_random_forest_model.
    run_random_forest_model(bidder, predictor_vars, outcome_var, 999)

    generate_predictions(bidder, continuous_vars, categorical_vars,
                         outcome_var, 'train', derived_data_path)
    encode_predictions(derived_data_path, 'that was unexpected binary gbm')