def test_calc_series_stats(series, norm, expected):
    """Check ``calc_series_stats`` output against ``expected``.

    The result must be a dict with exactly the keys of ``expected`` and
    numerically close values. When ``norm`` is falsy, the same expectations
    are re-checked on a shuffled copy of ``series``.
    """
    result = calc_series_stats(series, norm=norm)

    assert isinstance(result, dict)
    assert len(result) == len(expected)
    assert result.keys() == expected.keys()
    for key in result:
        assert np.isclose(result[key], expected[key])

    if not norm:
        # Shuffle a copy with a private seeded generator. pytest hands the
        # parametrized ``series`` object to the test uncopied, so shuffling it
        # in place (or reseeding the global NumPy RNG) would leak state into
        # other test cases.
        # assumes ``series`` exposes ``.copy()`` (list / ndarray) -- TODO confirm
        shuffled = series.copy()
        np.random.RandomState(0).shuffle(shuffled)

        result = calc_series_stats(shuffled, norm=norm)
        for key in result:
            assert np.isclose(result[key], expected[key])
def test_calculate(self, data_loader, tickers, columns, agg_day_counts,
                   max_back_quarter):
    """Check the frame returned by ``DailyAggQuarterFeatures.calculate``.

    Verifies the result type, the ``ticker``/``date`` index levels, the row
    upper bound, the exact column count, and that for every column/day-count
    pair the min, mean, median and max statistics are consistently ordered.
    """
    fc = DailyAggQuarterFeatures(columns=columns,
                                 agg_day_counts=agg_day_counts,
                                 max_back_quarter=max_back_quarter)
    X = fc.calculate(data_loader, tickers)

    assert isinstance(X, pd.DataFrame)
    assert 'ticker' in X.index.names
    assert 'date' in X.index.names
    assert X.shape[0] <= max_back_quarter * len(tickers)

    # One feature per statistic, per source column, per aggregation window.
    stats_per_series = len(calc_series_stats([]))
    assert X.shape[1] == (stats_per_series * len(columns)
                          * len(agg_day_counts))

    for col in columns:
        for count in agg_day_counts:
            min_col = 'days{}_{}_min'.format(count, col)
            max_col = 'days{}_{}_max'.format(count, col)
            mean_col = 'days{}_{}_mean'.format(count, col)
            median_col = 'days{}_{}_median'.format(count, col)

            # min <= mean <= max and min <= median <= max on every row.
            assert (X[max_col] >= X[min_col]).all()
            assert (X[max_col] >= X[mean_col]).all()
            assert (X[max_col] >= X[median_col]).all()
            assert (X[mean_col] >= X[min_col]).all()
            assert (X[median_col] >= X[min_col]).all()
def test_calculate_dayly_index(self, data, tickers, columns, agg_day_counts,
                               max_back_quarter):
    """Check ``DailyAggQuarterFeatures.calculate`` when ``daily_index`` is set.

    Verifies the result type, the ``ticker``/``date`` index levels, the row
    upper bound, the exact column count (one feature set per index code), and
    that the min, mean, median and max statistics are consistently ordered
    for every code/column/day-count combination.
    """
    # Stock tickers stand in for real commodity codes to avoid extra
    # data loaders.
    commodities_codes = ['AAPL', 'MSFT']

    fc = DailyAggQuarterFeatures(daily_data_key='daily',
                                 quarterly_data_key='quarterly',
                                 columns=columns,
                                 agg_day_counts=agg_day_counts,
                                 max_back_quarter=max_back_quarter,
                                 daily_index=commodities_codes)
    X = fc.calculate(data, tickers)

    assert isinstance(X, pd.DataFrame)
    assert 'ticker' in X.index.names
    assert 'date' in X.index.names
    assert X.shape[0] <= max_back_quarter * len(tickers)

    # One feature per statistic, source column, aggregation window and code.
    stats_per_series = len(calc_series_stats([]))
    assert X.shape[1] == (stats_per_series * len(columns)
                          * len(agg_day_counts) * len(commodities_codes))

    for code in commodities_codes:
        for col in columns:
            for count in agg_day_counts:
                min_col = '{}_days{}_{}_min'.format(code, count, col)
                max_col = '{}_days{}_{}_max'.format(code, count, col)
                mean_col = '{}_days{}_{}_mean'.format(code, count, col)
                median_col = '{}_days{}_{}_median'.format(code, count, col)

                # min <= mean <= max and min <= median <= max on every row.
                assert (X[max_col] >= X[min_col]).all()
                assert (X[max_col] >= X[mean_col]).all()
                assert (X[max_col] >= X[median_col]).all()
                assert (X[mean_col] >= X[min_col]).all()
                assert (X[median_col] >= X[min_col]).all()
def test_calculate(self, data, tickers, columns, quarter_counts,
                   max_back_quarter):
    """Check the frame returned by ``QuarterlyFeatures.calculate``.

    Verifies the result type, the ``ticker``/``date`` index levels, the row
    count (exact when ``data['quarterly']`` is a ``GenQuarterlyData``,
    otherwise an upper bound), the column count, the monotonicity of min/max
    across growing quarter counts, non-negative ``_std`` columns, and the
    ordering of min, mean, median and max per column/quarter-count pair.
    """
    fc = QuarterlyFeatures(data_key='quarterly',
                           columns=columns,
                           quarter_counts=quarter_counts,
                           max_back_quarter=max_back_quarter)
    X = fc.calculate(data, tickers)

    assert isinstance(X, pd.DataFrame)
    assert 'ticker' in X.index.names
    assert 'date' in X.index.names

    expected_rows = max_back_quarter * len(tickers)
    # Exact-type check on purpose: only this loader gets the strict
    # row-count assertion.
    if type(data['quarterly']) is GenQuarterlyData:
        assert X.shape[0] == expected_rows
    else:
        assert X.shape[0] <= expected_rows

    stats_per_series = len(calc_series_stats([]))
    assert X.shape[1] == (2 * stats_per_series * len(columns)
                          * len(quarter_counts))

    # Compare each quarter count with the next larger one: the larger count
    # must have a minimum that is no higher and a maximum that is no lower.
    sorted_quarter_counts = np.sort(quarter_counts)
    adjacent_counts = list(zip(sorted_quarter_counts[:-1],
                               sorted_quarter_counts[1:]))
    for col in columns:
        for lower_count, higher_count in adjacent_counts:
            l_min = 'quarter{}_{}_min'.format(lower_count, col)
            h_min = 'quarter{}_{}_min'.format(higher_count, col)
            assert (X[h_min] <= X[l_min]).all()

            l_max = 'quarter{}_{}_max'.format(lower_count, col)
            h_max = 'quarter{}_{}_max'.format(higher_count, col)
            assert (X[h_max] >= X[l_max]).all()

    # Every column whose name contains '_std' must be non-negative.
    std_cols = [x for x in X.columns if '_std' in x]
    for col in std_cols:
        assert X[col].min() >= 0

    for col in columns:
        for count in quarter_counts:
            min_col = 'quarter{}_{}_min'.format(count, col)
            max_col = 'quarter{}_{}_max'.format(count, col)
            mean_col = 'quarter{}_{}_mean'.format(count, col)
            median_col = 'quarter{}_{}_median'.format(count, col)

            # min <= mean <= max and min <= median <= max on every row.
            assert (X[max_col] >= X[min_col]).all()
            assert (X[max_col] >= X[mean_col]).all()
            assert (X[max_col] >= X[median_col]).all()
            assert (X[mean_col] >= X[min_col]).all()
            assert (X[median_col] >= X[min_col]).all()
def test_calc_series_stats_nans():
    """Check how ``calc_series_stats`` treats missing values.

    Inputs containing ``np.nan`` or ``None`` must give the same result as the
    input ``[10, 0, 1]`` without them, and inputs with no usable values must
    give NaN for every statistic.
    """
    inputs_with_gaps = (
        [np.nan, 10, 0, 1],
        [None, 10, 0, 1],
        [10, 0, np.nan, 1],
    )
    for values in inputs_with_gaps:
        assert calc_series_stats(values) == calc_series_stats([10, 0, 1])

    # An empty list and a list holding only missing markers.
    for values in ([], [np.nan, None]):
        stats = calc_series_stats(values)
        for stat_value in stats.values():
            assert np.isnan(stat_value)