Ejemplo n.º 1
0
def clean_date(df):
    """Add calendar feature columns derived from the ``Date`` column.

    Parses ``df.Date`` once and writes four columns onto ``df`` in place:
    ``day``, ``month``, ``year`` and ``quarter`` (1-4, computed from the
    month).

    Args:
        df: DataFrame with a ``Date`` column that ``pd.to_datetime`` can
            parse.

    Returns:
        The same ``df`` object, with the four columns added.
    """
    # Parse the whole column a single time; the vectorised ``.dt`` accessors
    # replace a separate per-row ``pd.to_datetime`` call for each derived
    # column.
    dates = pd.to_datetime(df.Date)
    df['day'] = dates.dt.day
    df['month'] = dates.dt.month
    df['year'] = dates.dt.year
    # Months 1-3 -> quarter 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    df['quarter'] = (df.month - 1) // 3 + 1
    return df
Ejemplo n.º 2
0
    df[nm] = df[nm].apply(lambda x: math.exp(x)-1)
    return df

def feat_importances(frst, feats):
    """Print the features ranked by importance, highest weight first.

    Args:
        frst: Object exposing a ``feature_importances_`` sequence.
        feats: Feature names, paired positionally with
            ``frst.feature_importances_``.

    Returns:
        None. The ranked table is written to standard output.
    """
    outputs = pd.DataFrame({'feats': feats,
                            'weight': frst.feature_importances_})
    # DataFrame.sort() has been removed from pandas; prefer sort_values() and
    # fall back to the legacy call only on versions that predate it.
    if hasattr(outputs, 'sort_values'):
        outputs = outputs.sort_values(by='weight', ascending=False)
    else:
        outputs = outputs.sort(columns='weight', ascending=False)
    # The parenthesised form prints the same text under Python 2 and Python 3.
    print(outputs)

############################## Executions ###################################
# Prep data
all_df = clean_rossman(RAW)
# Clamp negative sales to zero. A single .loc assignment is used because the
# chained form `all_df.Sales[mask] = 0` may write to a temporary copy and
# leave all_df unchanged.
all_df.loc[all_df.Sales < 0, 'Sales'] = 0

# Create weights
# Each row gets 1 - 1/t**3, where t is the integer form of its date divided
# by 1e18, so later dates receive a larger weight.
# NOTE(review): presumably to_dt yields datetime64[ns], making the integer
# nanoseconds since the epoch (t ~ 1.4 for 2015) -- confirm.
# Float literals force true division; with integer operands Python 2 would
# floor the quotient and every weight would collapse to 0. 'int64' is spelled
# out because plain 'int' is platform-dependent.
all_df['weights'] = 1 - 1.0/(to_dt(all_df.Date).astype('int64')/1e18)**3

# Create list of features
# Columns listed here are passed to create_feat_list as non-feature columns.
non_feat = ['Id', 'is_test', 'is_val', 'Sales', 'Date', 'Customers', 'Open',
            'weights']
Xfeats = create_feat_list(all_df, non_feat)

# Separate samples
# Training: rows flagged neither validation nor test, with Open == 1.
trn = all_df[(all_df.is_val == 0) & (all_df.is_test == 0) & (all_df.Open == 1)]
# Validation: rows flagged validation (not test) with strictly positive Sales.
val = all_df[(all_df.is_val == 1) & (all_df.is_test == 0) & (all_df.Sales > 0)]
# Test: every row flagged as test.
test = all_df[(all_df.is_test == 1)]

# Run random forest
# NOTE(review): num_feats (20% of the feature count) is not passed to Forest
# below, where max_features is hard-coded to 4 -- confirm this is intended.
num_feats = int(len(Xfeats)*.2)
trees = 200
frst = Forest(n_estimators=trees, max_depth=35, max_features=4, n_jobs=-1, bootstrap=True)