def preprocessing(months, user_ids=None, featurizer=None, features=None, prefilter=True, dfs=None):
    '''
    Selects the users to model, optionally prefilters them, and featurizes them.

    Args:
        months: Number of months of information to use. Forwarded to
            `featurize` as `month_window`; multiplied by 30 to get the
            day count handed to `prefilters`.
        user_ids: Optional list of user_ids to use. When falsy (None or
            empty), every id in `demo_df.index` is used.
        featurizer: Optional featurizer object, forwarded to `featurize`.
        features: Optional list of feature names, forwarded to `featurize`.
        prefilter: Whether to run `prefilters` on the user_ids before
            featurizing.
        dfs: Optional `(demo_df, rg_df, gam_df)` sequence. When falsy, the
            three frames are loaded with `get_demo_df`, `get_rg_df` and
            `get_gam_df`.

    Returns:
        X: The featurized rows, as returned by `featurize`
        y: The labels, as returned by `featurize`
        user_ids: The user_ids that were featurized (after prefiltering,
            when `prefilter` is set)

    Raises:
        ValueError: If neither `featurizer` nor `features` is given.
    '''
    if not featurizer and not features:
        # Carry the explanation on the exception itself so callers that
        # catch/log it see why the call was rejected.
        raise ValueError("Need at least one way to get featurizing context!")

    if not dfs:
        demo_df, rg_df, gam_df = get_demo_df(), get_rg_df(), get_gam_df()
    else:
        demo_df, rg_df, gam_df = dfs

    if not user_ids:
        user_ids = list(demo_df.index)

    # Prefilters work in days; a month is approximated as 30 days.
    days = months * 30
    if prefilter:
        print("Applying prefilters")
        user_ids = prefilters(user_ids, days, demo_df, rg_df)

    print(f"Constructing model with {months} months of information")
    print(f"Features being used: {features}")
    X, y = featurize(user_ids, gam_df, demo_df,
                     featurizer=featurizer, features=features,
                     month_window=months)
    return X, y, user_ids
rgs: The label associated with each row ''' print("Starting frame making") if not featurizer: featurizer = make_default_featurizer() frames = [ make_frame(user_id, gam_df, demo_df, month_window) for user_id in user_ids ] rgs = [demo_df.loc[user_id, 'rg'] == 1 for user_id in user_ids] return featurizer.vectorize(frames, features), rgs def make_frame(user_id, gam_df, demo_df, month_window): '''Featurizes a single user''' mask = (gam_df['user_id'] == user_id) user_daily = gam_df[mask] first_deposit = demo_df.loc[user_id, 'first_deposit_date'] user_frame = sparse_to_ts(user_daily, date_start=first_deposit, window=30 * month_window) return user_frame if __name__ == '__main__': demo_df = get_demo_df() gam_df = get_gam_df() rg_df = get_rg_df() user_ids = list(demo_df.index) print(len(user_ids))
return vect def add_feature(self, prod_function, feat_name=None, args={}): if not feat_name: feat_name = prod_function.__name__ self.features[feat_name] = lambda x: prod_function(x, **args) def delete_feature(self, feat_name): del self.features[feat_name] def get_feature_names(self): return list(self.features.keys()) if __name__ == "__main__": demo_df = pipeline.get_demo_df() gam_df = pipeline.get_gam_df() rg_info = pipeline.get_rg_df() user_id = 3327778 featurizer = Featurizer() featurizer.add_feature(total_hold) featurizer.add_feature(max_hold) featurizer.add_feature(weekly_hold) featurizer.add_feature(weekly_rolling_hold) mask = (gam_df['user_id'] == user_id) user_daily = daily_gam_df[mask] first_deposit = demo_df.loc[user_id, 'first_deposit_date'] user_frame = sparse_to_ts(user_daily, date_start=first_deposit, window=180) features_to_use = ["total_hold", "max_hold", "weekly_hold"]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
from plot_helper import *
from sklearn.metrics import roc_curve

import pipeline
from pipeline_constants import *

rcParams.update({'figure.autolayout': True})
plt.style.use('ggplot')

# Module-level frames, loaded once at import time and shared by the plots.
demo_df = pipeline.get_demo_df()
gam_df = pipeline.get_gam_df()
rg_info = pipeline.get_rg_df()


def background_plot(ax, user_id, gam_df, window=30 * 6):
    '''
    Prepares `ax` for the introductory "Wow people lose a lot on this" graph.

    Args:
        ax: Axes object that receives the title, axis labels and tick styling
        user_id: The user whose rows are selected from `gam_df`
        gam_df: Frame with a 'user_id' column, filtered down to this user
        window: Window length passed to `pipeline.sparse_to_ts`

    The user's 'first_deposit_date' is read from the module-level `demo_df`,
    and a running total of 'hold' is stored in a new 'cumul_hold' column.
    '''
    user_daily = gam_df[gam_df['user_id'] == user_id]
    first_deposit = demo_df.loc[user_id, 'first_deposit_date']
    user_frame = pipeline.sparse_to_ts(
        user_daily,
        date_start=first_deposit,
        window=window,
    )
    user_frame['cumul_hold'] = user_frame['hold'].cumsum()

    label_size = 28
    ax.set_title(f'User #{user_id}', fontsize=label_size)
    ax.set_xlabel("Date", fontsize=label_size)
    ax.set_ylabel("Loss (Euros)", fontsize=label_size)
    # Same tick size on both axes, applied y first and then x.
    for axis_name in ("y", "x"):
        ax.tick_params(axis=axis_name, labelsize=20)
from pipeline import get_demo_df, get_gam_df, get_rg_df
from processing.features import SUMMARY_NAMES, DAILY_NAMES, WEEKLY_NAMES
from processing.preprocessing import preprocessing
from model import predict

if __name__ == '__main__':
    # `sleep` is not imported in the visible part of this file; import it
    # here so the pause works (assumes it is time.sleep; a duplicate import
    # is harmless if the file already has one).
    from time import sleep

    print("Running on holdout!")
    # Pause before touching the holdout set.
    sleep(10)

    HOLD_DEMO_PATH = 'data/holdout/demographic.csv'
    HOLD_RG_PATH = 'data/holdout/rg_information.csv'
    HOLD_GAM_PATH = 'data/holdout/gambling.csv'

    hold_demo = get_demo_df(HOLD_DEMO_PATH)
    hold_rg = get_rg_df(HOLD_RG_PATH)
    hold_gam = get_gam_df(HOLD_GAM_PATH)
    # Order matters: preprocessing unpacks dfs as (demo_df, rg_df, gam_df).
    dfs = [hold_demo, hold_rg, hold_gam]

    # TODO: `model`, `features` and `months` are not defined anywhere in the
    # visible part of this script; they must be set before it can run.
    #model =
    #features = S
    X, y, user_ids = preprocessing(months=months, features=features, dfs=dfs)
    # Fixed: the keyword arguments were missing the separating comma, which
    # made the whole file a SyntaxError.
    predict(model, X, y, user_ids, store_name="holdout", store=True)