def test_imputer_1(self):
    """Check FactorImputer on a multi-factor container.

    Three numerical factors, each with one NaN, are imputed with the
    MOST_FREQ and MEDIAN strategies. The expected values correspond to
    filling every NaN with the statistic of its own ``trade_date`` group,
    and both strategies yield the same frame for this data. A categorical
    industry factor is then added and its missing value is expected to be
    replaced by the custom value ``'other'``.
    """
    tickers = ['001', '002', '003', '004']
    names = ['trade_date', 'ticker']

    # Inputs are indexed by date *strings*.
    str_index = pd.MultiIndex.from_product(
        [['2014-01-30', '2014-02-28'], tickers], names=names)
    data1 = pd.DataFrame(index=str_index,
                         data=[1.0, 3.0, 3.0, np.nan, 5.0, 5.0, 6.0, 8.0])
    factor_test1 = Factor(data=data1, name='test1')
    data2 = pd.DataFrame(index=str_index,
                         data=[3.0, 2.0, 3.0, 7.0, 7.0, np.nan, 6.0, 6.0])
    factor_test2 = Factor(data=data2, name='test2')
    data3 = pd.DataFrame(index=str_index,
                         data=[3.0, 3.0, np.nan, 5.0, 6.0, 7.0, 6.0, 6.0])
    factor_test3 = Factor(data=data3, name='test3')
    fc = FactorContainer('2014-01-30', '2014-02-28',
                         [factor_test1, factor_test2, factor_test3])

    # Expected results are indexed by datetime objects instead.
    index = pd.MultiIndex.from_product(
        [[dt(2014, 1, 30), dt(2014, 2, 28)], tickers], names=names)

    test1_filled = [1.0, 3.0, 3.0, 3.0, 5.0, 5.0, 6.0, 8.0]
    test2_filled = [3.0, 2.0, 3.0, 7.0, 7.0, 6.0, 6.0, 6.0]
    test3_filled = [3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 6.0, 6.0]

    # MOST_FREQ and MEDIAN give identical fills for this data set.
    for strategy in (NAStrategy.MOST_FREQ, NAStrategy.MEDIAN):
        calculated = FactorImputer(
            numerical_strategy=strategy).fit_transform(fc)
        expected = pd.DataFrame({'test1': test1_filled,
                                 'test2': test2_filled,
                                 'test3': test3_filled},
                                index=index)
        assert_frame_equal(calculated, expected)

    # Add a categorical factor with one missing entry.
    industry = pd.DataFrame(
        index=index, data=['a', 'a', 'a', 'a', 'a', 'a', np.nan, 'a'])
    factor_industry = Factor(data=industry,
                             name='industry',
                             property_dict={'type': FactorType.INDUSTY_CODE})
    fc.add_factor(factor=factor_industry)

    calculated = FactorImputer(numerical_strategy=NAStrategy.MEDIAN,
                               categorical_strategy=NAStrategy.CUSTOM,
                               custom_value='other').fit_transform(fc)
    calculated = calculated.sort_index(axis=1)

    expected = pd.DataFrame({'test1': test1_filled,
                             'test2': test2_filled,
                             'test3': test3_filled,
                             'industry': ['a', 'a', 'a', 'a',
                                          'a', 'a', 'other', 'a']},
                            index=index,
                            dtype=object)
    # Sort the expected columns exactly like ``calculated``: recent pandas
    # keeps dict insertion order, so without this 'industry' would come
    # last and the comparison would fail on column order alone.
    expected = expected.sort_index(axis=1)
    assert_frame_equal(calculated, expected)
def test_imputer_2(self):
    """Check FactorImputer with the MEAN strategy on a single Factor.

    The transformer is first run with its default output and compared
    against a plain DataFrame. It is then switched to container output via
    ``set_out_container(True)`` and compared against a hand-built
    FactorContainer. The expected fill value (2.333...) is the mean of the
    non-missing values sharing the NaN's ``trade_date``.
    """
    tickers = ['001', '002', '003', '004']
    names = ['trade_date', 'ticker']

    # Input factor is indexed by date strings.
    index = pd.MultiIndex.from_product(
        [['2014-01-30', '2014-02-28'], tickers], names=names)
    data = pd.DataFrame(index=index,
                        data=[1.0, 3.0, 3.0, np.nan, 5.0, 5.0, 6.0, 8.0])
    factor_test = Factor(data=data, name='test1')

    # Expected output is indexed by datetime objects.
    index = pd.MultiIndex.from_product(
        [[dt(2014, 1, 30), dt(2014, 2, 28)], tickers], names=names)
    imputed = [1.0, 3.0, 3.0, 2.33333333333, 5.0, 5.0, 6.0, 8.0]

    fi = FactorImputer(numerical_strategy=NAStrategy.MEAN)
    calculated = fi.fit_transform(factor_test)
    expected = pd.DataFrame({'test1': imputed}, index=index)
    assert_frame_equal(calculated, expected)

    # Same transformer, now asked to return a FactorContainer.
    fi.set_out_container(True)
    calculated = fi.fit_transform(factor_test)
    expected = FactorContainer(start_date='2014-01-30',
                               end_date='2014-02-28')
    factor = Factor(data=pd.DataFrame({'test1': imputed}, index=index),
                    name='test1')
    expected.add_factor(factor)

    # Use the unittest assertion rather than a bare ``assert`` so the check
    # survives ``python -O`` and reports the offending type on failure.
    self.assertIsInstance(calculated, FactorContainer)
    self.assertEqual(calculated.property, expected.property)
    assert_frame_equal(calculated.data, expected.data)
def test_imputer_2(self):
    """Verify MEAN imputation of a single Factor in both output modes.

    First the default output of ``fit_transform`` is compared with a
    DataFrame; then, after ``set_out_container(True)``, the result must be a
    FactorContainer whose ``property`` and ``data`` match a container built
    by hand from the same expected values.
    """
    input_index = pd.MultiIndex.from_product(
        [['2014-01-30', '2014-02-28'], ['001', '002', '003', '004']],
        names=['trade_date', 'ticker'])
    data = pd.DataFrame(index=input_index,
                        data=[1.0, 3.0, 3.0, np.nan, 5.0, 5.0, 6.0, 8.0])
    factor_test = Factor(data=data, name='test1')

    # The expected frames carry datetime objects in the date level, not the
    # strings the input was built with.
    index = pd.MultiIndex.from_product(
        [[dt(2014, 1, 30), dt(2014, 2, 28)], ['001', '002', '003', '004']],
        names=['trade_date', 'ticker'])

    # The NaN in the first date group is expected to become that group's
    # mean, (1 + 3 + 3) / 3.
    filled_values = [1.0, 3.0, 3.0, 2.33333333333, 5.0, 5.0, 6.0, 8.0]

    fi = FactorImputer(numerical_strategy=NAStrategy.MEAN)

    # Default output mode.
    calculated = fi.fit_transform(factor_test)
    expected = pd.DataFrame({'test1': filled_values}, index=index)
    assert_frame_equal(calculated, expected)

    # Container output mode.
    fi.set_out_container(True)
    calculated = fi.fit_transform(factor_test)
    expected = FactorContainer(start_date='2014-01-30',
                               end_date='2014-02-28')
    expected.add_factor(
        Factor(data=pd.DataFrame({'test1': filled_values}, index=index),
               name='test1'))

    # A bare ``assert`` would be removed under ``-O`` and gives no message;
    # the TestCase assertion is consistent with assertEqual below.
    self.assertIsInstance(calculated, FactorContainer)
    self.assertEqual(calculated.property, expected.property)
    assert_frame_equal(calculated.data, expected.data)
def test_imputer_1(self):
    """Verify FactorImputer on a container of numerical and industry factors.

    Steps:
      1. Impute three numerical factors with MOST_FREQ.
      2. Impute the same container with MEDIAN (same expected frame).
      3. Add an industry-code factor containing one NaN and impute with
         MEDIAN for numerical factors and the custom value ``'other'`` for
         categorical ones.
    """
    index = pd.MultiIndex.from_product(
        [['2014-01-30', '2014-02-28'], ['001', '002', '003', '004']],
        names=['trade_date', 'ticker'])
    data1 = pd.DataFrame(index=index,
                         data=[1.0, 3.0, 3.0, np.nan, 5.0, 5.0, 6.0, 8.0])
    factor_test1 = Factor(data=data1, name='test1')
    data2 = pd.DataFrame(index=index,
                         data=[3.0, 2.0, 3.0, 7.0, 7.0, np.nan, 6.0, 6.0])
    factor_test2 = Factor(data=data2, name='test2')
    data3 = pd.DataFrame(index=index,
                         data=[3.0, 3.0, np.nan, 5.0, 6.0, 7.0, 6.0, 6.0])
    factor_test3 = Factor(data=data3, name='test3')
    fc = FactorContainer('2014-01-30', '2014-02-28',
                         [factor_test1, factor_test2, factor_test3])

    # From here on ``index`` holds datetime objects: that is what the
    # expected frames (and the industry factor below) are indexed by.
    index = pd.MultiIndex.from_product(
        [[dt(2014, 1, 30), dt(2014, 2, 28)], ['001', '002', '003', '004']],
        names=['trade_date', 'ticker'])

    # Each NaN is expected to be replaced by the statistic of its own date
    # group; mode and median coincide for every group in this data.
    numeric_expected = {
        'test1': [1.0, 3.0, 3.0, 3.0, 5.0, 5.0, 6.0, 8.0],
        'test2': [3.0, 2.0, 3.0, 7.0, 7.0, 6.0, 6.0, 6.0],
        'test3': [3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 6.0, 6.0],
    }

    calculated = FactorImputer(
        numerical_strategy=NAStrategy.MOST_FREQ).fit_transform(fc)
    expected = pd.DataFrame(numeric_expected, index=index)
    assert_frame_equal(calculated, expected)

    calculated = FactorImputer(
        numerical_strategy=NAStrategy.MEDIAN).fit_transform(fc)
    expected = pd.DataFrame(numeric_expected, index=index)
    assert_frame_equal(calculated, expected)

    industry = pd.DataFrame(
        index=index, data=['a', 'a', 'a', 'a', 'a', 'a', np.nan, 'a'])
    factor_industry = Factor(
        data=industry,
        name='industry',
        property_dict={'type': FactorType.INDUSTY_CODE})
    fc.add_factor(factor=factor_industry)

    calculated = FactorImputer(
        numerical_strategy=NAStrategy.MEDIAN,
        categorical_strategy=NAStrategy.CUSTOM,
        custom_value='other').fit_transform(fc)
    calculated.sort_index(axis=1, inplace=True)

    mixed_expected = dict(numeric_expected)
    mixed_expected['industry'] = ['a', 'a', 'a', 'a', 'a', 'a', 'other', 'a']
    expected = pd.DataFrame(mixed_expected, index=index, dtype=object)
    # ``calculated`` has its columns sorted, so the expectation must be
    # sorted too. Older pandas sorted dict keys implicitly; newer versions
    # keep insertion order, which would leave 'industry' last and make the
    # comparison fail on column order.
    expected.sort_index(axis=1, inplace=True)
    assert_frame_equal(calculated, expected)
# Load the return data and convert it into 1-month forward returns.
data_return = fwd_return(load_factor_data_from_csv('return.csv'))
factor_return = Factor(
    data=data_return,
    name='1_Fwd_Return',
    property_dict={
        'type': FactorType.FWD_RETURN,
        'norm_type': FactorNormType.Industry_Neutral,
    },
)

# Create the FactorContainer instance and load all factor information.
fc = FactorContainer(
    start_date='2014-01-01',
    end_date='2014-03-01',
    factors=[factor_mv, factor_pb, factor_return, factor_industry_code],
)

# Pipeline definition: each step is a (name, transformer) pair.

# Step 1: handle the few N/A values -- median for numerical factors,
# the custom value 'other' for categorical ones.
step_1 = (
    'imputer',
    FactorImputer(
        numerical_strategy=NAStrategy.MEDIAN,
        categorical_strategy=NAStrategy.CUSTOM,
        custom_value='other',
    ),
)

# Step 2: winsorize (remove extreme values).
step_2 = ('winsorize', FactorWinsorizer(quantile_range=(5, 95)))

# Step 3: standardize.
step_3 = ('std', FactorStandardizer())

# Step 4: neutralize.
step_4 = ('neutralize', FactorNeutralizer())

# Step 5: compute the factor IC.
step_5 = ('ic', FactorIC())

pipeline = AlphaPipeline([step_1, step_2, step_3, step_4, step_5])
ic = pipeline.fit_predict(fc)
factor_return = Factor(
    data=data_return,
    name='1_Fwd_Return',
    property_dict={'type': FactorType.FWD_RETURN},
)

# Load the industry data. (Wind's industry codes are incomplete for early
# years, so another data source may be preferable; used here only as an
# example.)
data_industry_code = load_factor_data_from_csv('sw.csv')
factor_industry_code = Factor(
    data=data_industry_code,
    name='industry_code',
    property_dict={'type': FactorType.INDUSTY_CODE},
)

# Create the FactorContainer instance and load all factor information.
fc = FactorContainer(
    start_date='2014-01-01',
    end_date='2014-03-01',
    factors=[factor_mv, factor_pb, factor_return, factor_industry_code],
)

# Every transformer below is built with out_container=True and its result is
# assigned back to ``fc`` before the next step runs.

# Step 1: handle the few N/A values -- median for numerical factors,
# the custom value 'other' for categorical ones.
imputer = FactorImputer(
    numerical_strategy=NAStrategy.MEDIAN,
    categorical_strategy=NAStrategy.CUSTOM,
    custom_value='other',
    out_container=True,
)
fc = imputer.fit_transform(fc)

# Step 2: winsorize (remove extreme values).
winsorizer = FactorWinsorizer(quantile_range=(5, 95), out_container=True)
fc = winsorizer.fit_transform(fc)

# Step 3: standardize.
standardizer = FactorStandardizer(out_container=True)
fc = standardizer.fit_transform(fc)

# Step 4: neutralize.
neutralizer = FactorNeutralizer(out_container=True)
fc = neutralizer.fit_transform(fc)

# Step 5: compute the factor IC.
ic = FactorIC().predict(fc)
print(ic)