def test_loading():
    """Check that load_data() separates features from targets.

    The three target columns must be absent from the feature frame, must be
    exactly the columns of the target frame, and both frames must share the
    same index.
    """
    features, targets = load_data()

    # None of the target columns may leak into the features.
    for target_name in ('cnt', 'registered', 'casual'):
        assert target_name not in features.columns

    assert set(targets.columns) == {'cnt', 'registered', 'casual'}

    # Features and targets must be aligned row by row.
    index_matches = features.index == targets.index
    assert index_matches.all()
def test_featuretransformer():
    """Check FeatureTransformer output shape for every option combination.

    For each (remove_year, categorical) setting the transformed frame must
    have the expected number of columns, the same number of rows as the
    input, and the same index as the input.
    """
    features, _ = load_data()

    # (remove_year, categorical, expected number of output columns)
    scenarios = (
        (True, False, 11),
        (False, False, 12),
        (True, True, 57),
        (False, True, 58),
    )

    for drop_year, as_categorical, n_expected in scenarios:
        transformer = FeatureTransformer(remove_year=drop_year,
                                         categorical=as_categorical)
        transformed = transformer.fit_transform(features)

        assert len(transformed.columns) == n_expected
        assert len(transformed) == len(features)
        assert (features.index == transformed.index).all()
def get_periodical_testset():
    """Build a synthetic 'cnt' series with a known trend and periodic part.

    The index of the loaded ``cnt`` target is kept, but its values are
    replaced by ``trend * periodical`` where

    * ``trend`` rises linearly from 1 to 2 over the length of the series,
    * ``periodical`` is ``2000 * (|sin(2*pi*t)| + 1)`` with ``t`` evenly
      spaced in [0, 1].

    Returns
    -------
    tuple
        ``(y_cnt, periodical, trend)``: the synthetic series followed by
        the two NumPy arrays it was built from.
    """
    _, y = load_data()
    n_samples = len(y['cnt'])

    trend = np.linspace(1, 2, n_samples)
    phase = np.linspace(0, 1, n_samples) * 2 * np.pi
    periodical = 2000 * (np.abs(np.sin(phase)) + 1)

    # Write into an independent float copy rather than into the column
    # selected from the loaded frame: this avoids chained-assignment
    # side effects on ``y`` and does not rely on implicit upcasting when
    # the stored column is integer-typed (dtype not visible here).
    y_cnt = y['cnt'].astype(float)
    y_cnt[:] = trend * periodical
    return y_cnt, periodical, trend
def test_trendremover_real():
    """Check that TrendRemover reduces the spread of the loaded 'cnt' data.

    After fit_transform the standard deviation of the transformed counts
    must be below half the standard deviation of the original counts.
    """
    _, targets = load_data()
    counts = targets['cnt']

    remover = TrendRemover(remove_trend=True)
    detrended = np.array(remover.fit_transform(counts))

    # Debugging aid: plotting ``counts`` and ``detrended`` against
    # np.linspace(1, 2, len(counts)) shows the effect of the transform.
    original_spread = np.std(counts)
    assert np.std(detrended) < 0.5 * original_spread
def test_bikeshareregression():
    """Smoke-test the default BikeshareRegression on an unshuffled split.

    The model is fitted on the first 80% of the data and evaluated on the
    remaining 20%; the mean absolute error on 'cnt' must stay below 45.
    """
    features, targets = load_data()
    feat_train, feat_test, tgt_train, tgt_test = train_test_split(
        features, targets, test_size=0.2, shuffle=False)

    transformer = FeatureTransformer()
    feat_train = transformer.fit_transform(feat_train)

    cnt_train = tgt_train['cnt']
    trend_remover = TrendRemover(remove_trend=True).fit(cnt_train)
    model = BikeshareRegression(trend_remover=trend_remover, random_state=42)
    model = model.fit(feat_train, cnt_train)

    # The test features only go through transform(), never fit.
    feat_test = transformer.transform(feat_test)
    predictions = model.predict(feat_test)

    # Default regressor only: the bound is a sanity check, not the best
    # achievable score.
    mae = mean_absolute_error(tgt_test['cnt'], predictions)
    assert mae < 45
def plot_grouped_usage(best_clf_casual, best_clf_registered, data_dir,
                       plot_output_dir):
    """Plot mean rentals grouped by hour, weekday and month.

    Two families of figures are written to ``plot_output_dir`` as
    ``mean_usage_{dataset}_{interval}.pdf``:

    * ``alldata``: the full loaded targets,
    * ``predicttest``: predictions (dashed) versus the true values (solid)
      on the last 20% of the data (unshuffled split).

    Each figure shows the group mean for registered and casual users with a
    shaded band of +/- one standard error of the mean.

    Parameters
    ----------
    best_clf_casual, best_clf_registered
        Fitted estimators whose ``predict`` is called on the test features
        to obtain the casual and registered predictions.
    data_dir
        Passed through to ``load_data``.
    plot_output_dir
        Directory the PDF files are saved into.
    """
    X, y = load_data(data_dir)
    # Only the held-out part is needed here; the training part is discarded.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2,
                                            shuffle=False)

    y_pred_casual = best_clf_casual.predict(X_test)
    y_pred_registered = best_clf_registered.predict(X_test)
    # Copy the test targets so predictions keep the same index and columns.
    y_pred = y_test.copy()
    y_pred['registered'] = np.array(y_pred_registered)
    y_pred['casual'] = np.array(y_pred_casual)

    # Interval code -> attribute of the index used as grouping key.
    intervals = (('H', 'hour'), ('D', 'weekday'), ('M', 'month'))
    user_colors = (('registered', 'steelblue'), ('casual', 'crimson'))
    datasets = (
        ('alldata', (('all', y),)),
        ('predicttest', (('prediction', y_pred), ('test set', y_test))),
    )

    for dataset, named_frames in datasets:
        for time_interval, index_attr in intervals:
            fig, ax = plt.subplots(1, 1)
            try:
                for y_name, y_plot in named_frames:
                    is_prediction = y_name == 'prediction'
                    for users, c in user_colors:
                        y_cnt = y_plot[users]
                        group_keys = getattr(y_cnt.index, index_attr)
                        x_grouped = np.unique(group_keys)
                        y_grouped = y_cnt.groupby(group_keys)

                        y_mean = y_grouped.mean()
                        y_sem = y_grouped.aggregate(
                            lambda g: sem(g, axis=None))

                        ax.plot(
                            x_grouped, y_mean, lw=2, color=c,
                            linestyle='dashed' if is_prediction else 'solid',
                            label=f"pred. {users}" if is_prediction
                            else users)
                        ax.fill_between(x_grouped, y_mean - y_sem,
                                        y_mean + y_sem, color=c, lw=0,
                                        alpha=0.2, label=None)

                # Ticks depend only on the grouping, so set them once per
                # figure from the last plotted series.
                xticks = list(
                    np.arange(x_grouped.min(), x_grouped.max() + 1, 1))
                if time_interval == 'H':
                    # Label only every second hour to avoid clutter.
                    xticklabels = [
                        f"${xt}$" if i % 2 == 0 else ""
                        for i, xt in enumerate(xticks)
                    ]
                else:
                    xticklabels = [f"${xt}$" for xt in xticks]
                ax.set_xticks(xticks)
                ax.set_xticklabels(xticklabels)
                ax.set_xlabel(time_interval)
                ax.set_ylabel("Mean No. of rentals per hour")
                ax.legend(frameon=False)

                fig.savefig(os.path.join(
                    plot_output_dir,
                    f'mean_usage_{dataset}_{time_interval}.pdf'),
                    bbox_inches='tight')
            finally:
                # Release the figure even if plotting or saving fails;
                # otherwise pyplot keeps every figure alive.
                plt.close(fig)
def plot_grouped_usage_bias(best_clf_casual, best_clf_registered, data_dir,
                            plot_output_dir):
    """Plot prediction bias and mean absolute deviation per time group.

    The data is split without shuffling and the last 20% is used as test
    set. For registered and casual users the error ``prediction - truth``
    is grouped by hour, weekday and month; for each grouping one figure
    ``bias_mad_{interval}.pdf`` is written to ``plot_output_dir`` showing

    * ``bias`` (dashed): the group mean of the signed error,
    * ``mad`` (solid): the group mean of the absolute error,

    each with a shaded band of +/- one standard error of the mean.

    Parameters
    ----------
    best_clf_casual, best_clf_registered
        Fitted estimators whose ``predict`` is called on the test features
        to obtain the casual and registered predictions.
    data_dir
        Passed through to ``load_data``.
    plot_output_dir
        Directory the PDF files are saved into.
    """
    X, y = load_data(data_dir)
    # Only the held-out part is needed here; the training part is discarded.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2,
                                            shuffle=False)

    y_pred_casual = best_clf_casual.predict(X_test)
    y_pred_registered = best_clf_registered.predict(X_test)
    # Copy the test targets so predictions keep the same index and columns.
    y_pred = y_test.copy()
    y_pred['registered'] = np.array(y_pred_registered)
    y_pred['casual'] = np.array(y_pred_casual)

    # Interval code -> attribute of the index used as grouping key.
    intervals = (('H', 'hour'), ('D', 'weekday'), ('M', 'month'))
    user_colors = (('registered', 'steelblue'), ('casual', 'crimson'))
    metrics = (('bias', 'dashed'), ('mad', 'solid'))

    for time_interval, index_attr in intervals:
        fig, ax = plt.subplots(1, 1)
        try:
            for user, c in user_colors:
                signed_error = y_pred[user] - y_test[user]
                for evl, ls in metrics:
                    # Derive each metric from the signed error explicitly
                    # instead of relying on the iteration order.
                    dy = signed_error.abs() if evl == 'mad' else signed_error

                    group_keys = getattr(dy.index, index_attr)
                    x_gr = np.unique(group_keys)
                    dy_gr = dy.groupby(group_keys)

                    dy_mean = dy_gr.mean()
                    dy_sem = dy_gr.aggregate(lambda g: sem(g, axis=None))

                    ax.plot(x_gr, dy_mean, lw=2, color=c, linestyle=ls,
                            label=f"{evl} {user}")
                    ax.fill_between(x_gr, dy_mean - dy_sem,
                                    dy_mean + dy_sem, color=c, lw=0,
                                    alpha=0.2, label=None)

            # Ticks depend only on the grouping, so set them once per
            # figure from the last plotted series.
            xticks = list(np.arange(x_gr.min(), x_gr.max() + 1, 1))
            if time_interval == 'H':
                # Label only every second hour to avoid clutter.
                xticklabels = [
                    f"${xt}$" if i % 2 == 0 else ""
                    for i, xt in enumerate(xticks)
                ]
            else:
                xticklabels = [f"${xt}$" for xt in xticks]

            # Zero line as visual reference for an unbiased prediction.
            ax.axhline(y=0, c='k', ls=':', lw=1.5)
            ax.set_xticks(xticks)
            ax.set_xticklabels(xticklabels)
            ax.set_xlabel(time_interval)
            ax.set_ylabel("Mean bias / mad per hour")
            ax.set_ylim(-40, 100)
            ax.legend(loc=2, frameon=False)

            fig.savefig(os.path.join(plot_output_dir,
                                     f'bias_mad_{time_interval}.pdf'),
                        bbox_inches='tight')
        finally:
            # Release the figure even if plotting or saving fails;
            # otherwise pyplot keeps every figure alive.
            plt.close(fig)