def _get_coefficients_and_current_regressors_tickers(self, regressors_tickers: List[Ticker],
                                                     positions_tickers: List[Ticker],
                                                     positions_allocation: QFSeries,
                                                     from_date: datetime, to_date: datetime):
    """
    Computes the coefficients of the portfolio returns against the regressors returns and reports which
    regressors were actually used.

    Close prices for all position and regressor tickers are fetched from the data provider for the
    [from_date, to_date] range, converted with to_simple_returns() and passed through a DataCleaner
    (constructed with 0.1 as its second argument) using proxy_using_value(0).

    The portfolio returns are the dot product of the positions returns with the values of
    positions_allocation, so positions_allocation is expected to hold one value per element of
    positions_tickers, in the same order.

    Returns a tuple of: the result of self._get_coefficients(regressors_returns, portfolio_returns) and the
    columns of the regressors returns data frame (the regressors which were not dropped).
    """
    all_tickers = list(positions_tickers) + list(regressors_tickers)

    prices = self._data_provider.get_price(all_tickers, PriceField.Close, from_date, to_date)
    returns = prices.to_simple_returns()
    cleaned_returns = DataCleaner(returns, 0.1).proxy_using_value(0)

    # regressors for which no column with data is available should be removed
    regressors_returns = cleaned_returns.reindex(columns=regressors_tickers)
    regressors_returns = regressors_returns.dropna(axis=1, how='all')

    # every position gets a column (filled with 0 if it is absent from the cleaned data), so that the
    # number of columns matches the dimension of the positions_allocation series
    positions_returns = cleaned_returns.reindex(columns=positions_tickers, fill_value=0)
    portfolio_returns = positions_returns.dot(positions_allocation.values)

    coefficients = self._get_coefficients(regressors_returns, portfolio_returns)
    return coefficients, regressors_returns.columns
def _preprocess_data(self, analysed_tms, regressors_df):
    """
    Prepares the analysed timeseries and the regressors before they are processed.

    The regressors are run through DataCleaner.proxy_using_regression with analysed_tms as the benchmark
    (e.g. regressors containing too many missing data are removed, missing data are proxied). The result
    and analysed_tms are then passed to get_values_for_common_dates.

    The length of the input, the length of the preprocessed timeseries and the number of regressors
    are logged on the debug level.

    Returns a tuple: (preprocessed regressors data frame, preprocessed analysed timeseries).
    """
    input_length = len(analysed_tms)
    self.logger.debug("Length of input timeseries: {:d} \n".format(input_length))

    proxied_regressors = DataCleaner(regressors_df).proxy_using_regression(
        analysed_tms, columns_type=SimpleReturnsSeries
    )
    regressors, analysed = get_values_for_common_dates(proxied_regressors, analysed_tms)

    output_length = analysed.size
    self.logger.debug("Length of preprocessed timeseries: {:d}".format(output_length))

    regressors_count = regressors.shape[1]
    self.logger.debug("Number of regressors: {:d}".format(regressors_count))

    return regressors, analysed
def setUp(self):
    """
    Creates fresh fixtures before each test: the test data frame, the test benchmark and a DataCleaner
    wrapping the test data frame.
    """
    frame = self._create_test_dataframe()

    self.test_dataframe = frame
    self.test_benchmark = self._create_test_benchmark()
    # the cleaner operates on the very same data frame object that is exposed as self.test_dataframe
    self.data_cleaner = DataCleaner(frame)
class TestDataCleaner(TestCase):
    """
    Tests of the DataCleaner proxying methods, run on a small data frame of simple returns with a few
    missing values and on a benchmark series shifted by one day relative to the data frame.
    """

    def setUp(self):
        """ Creates a fresh data frame, benchmark and DataCleaner before each test. """
        frame = self._create_test_dataframe()

        self.test_dataframe = frame
        self.test_benchmark = self._create_test_benchmark()
        self.data_cleaner = DataCleaner(frame)

    @classmethod
    def _create_test_dataframe(cls):
        """
        Builds a 6 x 5 data frame of simple returns (daily dates starting on 2015-01-01).
        Column 'b' misses 2 of 6 values, columns 'a', 'c' and 'd' miss 1 of 6 values, column 'e' is complete.
        """
        nan = np.nan
        rows = [
            [nan, 0.0, 0.0, 0.0, 0.0],
            [1.0, nan, 1.0, 1.0, 1.0],
            [2.0, nan, nan, 2.0, 2.0],
            [3.0, 3.0, 3.0, nan, 3.0],
            [4.0, 4.0, 4.0, 4.0, 4.0],
            [5.0, 5.0, 5.0, 5.0, 5.0],
        ]
        dates = pd.date_range(start='2015-01-01', periods=6)
        return SimpleReturnsDataFrame(data=rows, index=dates, columns=['a', 'b', 'c', 'd', 'e'])

    @classmethod
    def _create_test_benchmark(cls):
        """
        Builds the benchmark series of simple returns: values 1.0 ... 6.0 on daily dates starting
        on 2015-01-02 (one day later than the test data frame).
        """
        dates = pd.date_range(start='2015-01-02', periods=6)
        benchmark_values = [float(value) for value in range(1, 7)]
        return SimpleReturnsSeries(data=benchmark_values, index=dates, name='Test prices')

    def _build_expected_dataframe(self, values):
        """
        Wraps the expected values into a data frame which has the dates of the test data frame and
        the columns expected to remain after cleaning (everything apart from column 'b').
        """
        remaining_columns = ['a', 'c', 'd', 'e']
        dates = self.test_dataframe.index.copy()
        return SimpleReturnsDataFrame(data=values, columns=remaining_columns, index=dates)

    def test_proxy_using_values(self):
        """
        With the threshold of 0.2 column 'b' (2 of 6 values missing) is expected to be removed and all
        other missing values are expected to be replaced with the proxy value (0.0).
        """
        expected = self._build_expected_dataframe([
            [0.0, 0.0, 0.0, 0.0],
            [1.0, 1.0, 1.0, 1.0],
            [2.0, 0.0, 2.0, 2.0],
            [3.0, 3.0, 0.0, 3.0],
            [4.0, 4.0, 4.0, 4.0],
            [5.0, 5.0, 5.0, 5.0],
        ])

        self.data_cleaner.threshold = 0.2
        actual = self.data_cleaner.proxy_using_value(proxy_value=0.0)

        assert_dataframes_equal(expected, actual)

    def test_proxy_using_regression(self):
        """
        With the threshold of 0.2 column 'b' is expected to be removed and the gaps in columns 'c' and 'd'
        are expected to be filled with the regressed values. The missing value of column 'a' on the first
        date is expected to stay missing (presumably because the benchmark has no value for that date).
        """
        expected = self._build_expected_dataframe([
            [np.nan, 0.0, 0.0, 0.0],
            [1.0, 1.0, 1.0, 1.0],
            [2.0, 2.0, 2.0, 2.0],
            [3.0, 3.0, 3.0, 3.0],
            [4.0, 4.0, 4.0, 4.0],
            [5.0, 5.0, 5.0, 5.0],
        ])

        self.data_cleaner.threshold = 0.2
        actual = self.data_cleaner.proxy_using_regression(
            benchmark_tms=self.test_benchmark, columns_type=SimpleReturnsSeries
        )

        assert_dataframes_equal(expected, actual)