def clean_date(df):
    """Add calendar feature columns derived from ``df.Date``.

    The frame is modified in place: ``day``, ``month``, ``year`` and
    ``quarter`` columns are added (or overwritten), and the same frame is
    returned for convenience.

    Args:
        df: DataFrame with a ``Date`` column that ``to_dt`` and
            ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object, with the four new columns.
    """
    # 'day' keeps going through the file's own ``to_dt`` helper, as before.
    df['day'] = to_dt(df.Date).dt.day

    # Parse the column once, vectorised, instead of invoking pd.to_datetime
    # on every row separately for both 'month' and 'year'.
    parsed = pd.to_datetime(df.Date)
    df['month'] = parsed.dt.month
    df['year'] = parsed.dt.year

    # Months 1-3 -> quarter 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    df['quarter'] = (df['month'] - 1) // 3 + 1
    return df
df[nm] = df[nm].apply(lambda x: math.exp(x)-1) return df def feat_importances(frst, feats): outputs = pd.DataFrame({'feats': feats, 'weight': frst.feature_importances_}) outputs = outputs.sort(columns='weight', ascending=False) print outputs ############################## Executions ################################### # Prep data all_df = clean_rossman(RAW) all_df.Sales[all_df.Sales < 0] = 0 # Create weights all_df['weights'] = 1 - 1/(to_dt(all_df.Date).astype('int')/10**18)**3 # Create list of features non_feat = ['Id', 'is_test', 'is_val', 'Sales', 'Date', 'Customers', 'Open', 'weights'] Xfeats = create_feat_list(all_df, non_feat) # Separate samples trn = all_df[(all_df.is_val == 0) & (all_df.is_test == 0) & (all_df.Open == 1)] val = all_df[(all_df.is_val == 1) & (all_df.is_test == 0) & (all_df.Sales > 0)] test = all_df[(all_df.is_test == 1)] # Run random forest num_feats = int(len(Xfeats)*.2) trees = 200 frst = Forest(n_estimators=trees, max_depth=35, max_features=4, n_jobs=-1, bootstrap=True)