# Estimators configured for this run; nothing in this section fits them.
cls = RandomForestClassifier()
reg = RandomForestRegressor(n_estimators=20, max_features=5, max_depth=None,
                            min_samples_split=2, min_samples_leaf=1,
                            max_leaf_nodes=None, bootstrap=True,
                            oob_score=False, n_jobs=-1)
# NOTE: this rebinding replaces the random forest assigned just above;
# from here on `reg` is the SVR.
reg = SVR(C=10., gamma=0.1)

# Load the training data and split the 'revenue' target off from the features.
train_df_orig = RevenueCompetition.load_data()
y = train_df_orig['revenue'].values
del train_df_orig['revenue']
test_df_orig = RevenueCompetition.load_data(train=False)
# The transform is fitted on the training and test rows together, then
# applied to the training rows only.
full_df = train_df_orig.append(test_df_orig)

print("Transforming...")
tr = RevenueTransform(rescale=True)
tr.fit(full_df)
X = tr.transform(train_df_orig).values

print('Classify the outliers...')
# Standardise log-revenue and bucket it by z-score:
#   0: z <= -2,  1: -2 < z <= -1,  2: -1 < z <= 1,  3: 1 < z <= 2,  4: z > 2
ly = np.log(y)
ym = ly.mean()
ys = ly.std()
# np.digitize(..., right=True) returns i with edges[i-1] < z <= edges[i],
# which is exactly the bucketing above. It labels every element, whereas
# filling an np.empty buffer through masks leaves arbitrary memory in any
# element that no mask matches. The cast keeps `s` a float array.
s = np.digitize((ly - ym) / ys, [-2, -1, 1, 2], right=True).astype(float)

# Single stratified shuffle split on the bucket labels (fixed seed).
# NOTE(review): `train_size` is not assigned anywhere in this section —
# presumably it is defined earlier in the script; confirm.
train_index, valid_index = next(iter(
    StratifiedShuffleSplit(s, n_iter=1, train_size=train_size, random_state=0)))
# Share of the samples that goes to the training side of the split below.
train_size = 0.75

# Estimators configured for this run; nothing in this section fits them.
cls = RandomForestClassifier()
reg = RandomForestRegressor(n_estimators=20, max_features=5, max_depth=None,
                            min_samples_split=2, min_samples_leaf=1,
                            max_leaf_nodes=None, bootstrap=True,
                            oob_score=False, n_jobs=-1)

# Load the training data and split the 'revenue' target off from the features.
train_df_orig = RevenueCompetition.load_data()
y = train_df_orig['revenue'].values
del train_df_orig['revenue']
test_df_orig = RevenueCompetition.load_data(train=False)
# The transform is fitted on the training and test rows together, then
# applied to the training rows only.
full_df = train_df_orig.append(test_df_orig)

print("Transforming...")
tr = RevenueTransform(rescale=False)
tr.fit(full_df)
X = tr.transform(train_df_orig).values

print('Classify the outliers...')
# Standardise log-revenue and bucket it by z-score:
#   0: z <= -2,  1: -2 < z <= -1,  2: -1 < z <= 1,  3: 1 < z <= 2,  4: z > 2
ly = np.log(y)
ym = ly.mean()
ys = ly.std()
# np.digitize(..., right=True) returns i with edges[i-1] < z <= edges[i],
# which is exactly the bucketing above. It labels every element, whereas
# filling an np.empty buffer through masks leaves arbitrary memory in any
# element that no mask matches. The cast keeps `s` a float array.
s = np.digitize((ly - ym) / ys, [-2, -1, 1, 2], right=True).astype(float)

# Single stratified shuffle split on the bucket labels (fixed seed).
train_index, valid_index = next(iter(
    StratifiedShuffleSplit(s, n_iter=1, train_size=train_size, random_state=0)))