Esempio n. 1
0
    def test_imputer_1(self):
        """Check FactorImputer on a container of three numerical factors.

        Exercises the MOST_FREQ and MEDIAN numerical strategies, then adds an
        industry-code factor and checks that the CUSTOM categorical strategy
        replaces its missing entry with ``custom_value``.
        """
        # Inputs are indexed by (trade_date, ticker) with dates given as
        # strings; each factor contains exactly one missing value.
        index = pd.MultiIndex.from_product([['2014-01-30', '2014-02-28'], ['001', '002', '003', '004']],
                                           names=['trade_date', 'ticker'])
        data1 = pd.DataFrame(index=index, data=[1.0, 3.0, 3.0, np.nan, 5.0, 5.0, 6.0, 8.0])
        factor_test1 = Factor(data=data1, name='test1')

        data2 = pd.DataFrame(index=index, data=[3.0, 2.0, 3.0, 7.0, 7.0, np.nan, 6.0, 6.0])
        factor_test2 = Factor(data=data2, name='test2')

        data3 = pd.DataFrame(index=index, data=[3.0, 3.0, np.nan, 5.0, 6.0, 7.0, 6.0, 6.0])
        factor_test3 = Factor(data=data3, name='test3')

        fc = FactorContainer('2014-01-30', '2014-02-28', [factor_test1, factor_test2, factor_test3])

        # The expected output index is built from dt(...) values rather than
        # the date strings used for the inputs.
        index = pd.MultiIndex.from_product([[dt(2014, 1, 30), dt(2014, 2, 28)], ['001', '002', '003', '004']],
                                           names=['trade_date', 'ticker'])

        # Both numerical strategies are expected to yield the same filled
        # values for this data set, so the expectation is defined once.
        numeric_expected = {'test1': [1.0, 3.0, 3.0, 3.0, 5.0, 5.0, 6.0, 8.0],
                            'test2': [3.0, 2.0, 3.0, 7.0, 7.0, 6.0, 6.0, 6.0],
                            'test3': [3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 6.0, 6.0]}

        calculated = FactorImputer(numerical_strategy=NAStrategy.MOST_FREQ).fit_transform(fc)
        expected = pd.DataFrame(numeric_expected, index=index)
        assert_frame_equal(calculated, expected)

        calculated = FactorImputer(numerical_strategy=NAStrategy.MEDIAN).fit_transform(fc)
        expected = pd.DataFrame(numeric_expected, index=index)
        assert_frame_equal(calculated, expected)

        # Add a categorical factor with one missing entry.
        industry = pd.DataFrame(index=index, data=['a', 'a', 'a', 'a', 'a', 'a', np.nan, 'a'])
        factor_industry = Factor(data=industry, name='industry', property_dict={'type': FactorType.INDUSTY_CODE})
        fc.add_factor(factor=factor_industry)
        calculated = FactorImputer(numerical_strategy=NAStrategy.MEDIAN,
                                   categorical_strategy=NAStrategy.CUSTOM,
                                   custom_value='other').fit_transform(fc)
        calculated.sort_index(axis=1, inplace=True)

        mixed_expected = dict(numeric_expected,
                              industry=['a', 'a', 'a', 'a', 'a', 'a', 'other', 'a'])
        expected = pd.DataFrame(mixed_expected, index=index, dtype=object)
        # `calculated` was sorted by column label above. Sort the expectation
        # the same way: assert_frame_equal compares column order, and a frame
        # built from a dict keeps insertion order on pandas >= 0.23 with
        # Python >= 3.6 (older versions sorted the keys).
        expected.sort_index(axis=1, inplace=True)
        assert_frame_equal(calculated, expected)
Esempio n. 2
0
    def test_imputer_2(self):
        """Check MEAN imputation of a single Factor.

        The imputer is run twice on the same factor: first returning a plain
        DataFrame, then, after ``set_out_container(True)``, returning a
        FactorContainer whose ``property`` and ``data`` are compared against
        a container built by hand.
        """
        # Input factor indexed by (trade_date, ticker) with string dates and
        # one missing value.
        index = pd.MultiIndex.from_product(
            [['2014-01-30', '2014-02-28'], ['001', '002', '003', '004']],
            names=['trade_date', 'ticker'])
        data = pd.DataFrame(index=index,
                            data=[1.0, 3.0, 3.0, np.nan, 5.0, 5.0, 6.0, 8.0])
        factor_test = Factor(data=data, name='test1')

        # The expected output index is built from dt(...) values.
        index = pd.MultiIndex.from_product(
            [[dt(2014, 1, 30), dt(2014, 2, 28)], ['001', '002', '003', '004']],
            names=['trade_date', 'ticker'])
        # 2.33333333333 approximates 7/3, the mean of the first date's
        # observed values (1, 3, 3). The truncated literal relies on
        # assert_frame_equal's default approximate float comparison.
        expected_values = {
            'test1': [1.0, 3.0, 3.0, 2.33333333333, 5.0, 5.0, 6.0, 8.0]
        }

        fi = FactorImputer(numerical_strategy=NAStrategy.MEAN)
        calculated = fi.fit_transform(factor_test)
        expected = pd.DataFrame(expected_values, index=index)
        assert_frame_equal(calculated, expected)

        # Switch the same imputer to container output and run it again.
        fi.set_out_container(True)
        calculated = fi.fit_transform(factor_test)
        expected = FactorContainer(start_date='2014-01-30',
                                   end_date='2014-02-28')
        factor = Factor(data=pd.DataFrame(expected_values, index=index),
                        name='test1')
        expected.add_factor(factor)

        # unittest assertion instead of a bare `assert`: it is not stripped
        # under `python -O` and reports the offending type on failure.
        self.assertIsInstance(calculated, FactorContainer)
        self.assertEqual(calculated.property, expected.property)
        assert_frame_equal(calculated.data, expected.data)
Esempio n. 3
0
    def test_imputer_2(self):
        """Check MEAN imputation of one Factor in both output modes.

        First asserts the DataFrame returned by ``fit_transform``; then
        enables container output via ``set_out_container(True)`` and asserts
        the returned FactorContainer against a manually assembled one.
        """
        # Single input factor with one NaN, indexed by string dates.
        index = pd.MultiIndex.from_product([['2014-01-30', '2014-02-28'], ['001', '002', '003', '004']],
                                           names=['trade_date', 'ticker'])
        data = pd.DataFrame(index=index, data=[1.0, 3.0, 3.0, np.nan, 5.0, 5.0, 6.0, 8.0])
        factor_test = Factor(data=data, name='test1')

        # Expected frames use dt(...) values in the trade_date level.
        index = pd.MultiIndex.from_product([[dt(2014, 1, 30), dt(2014, 2, 28)], ['001', '002', '003', '004']],
                                           names=['trade_date', 'ticker'])
        # The filled slot holds ~7/3 (mean of 1, 3, 3); the short literal is
        # compared approximately by assert_frame_equal's defaults.
        expected_values = {'test1': [1.0, 3.0, 3.0, 2.33333333333, 5.0, 5.0, 6.0, 8.0]}

        fi = FactorImputer(numerical_strategy=NAStrategy.MEAN)
        calculated = fi.fit_transform(factor_test)
        expected = pd.DataFrame(expected_values, index=index)
        assert_frame_equal(calculated, expected)

        # Re-run the same imputer with container output enabled.
        fi.set_out_container(True)
        calculated = fi.fit_transform(factor_test)
        expected = FactorContainer(start_date='2014-01-30',
                                   end_date='2014-02-28')
        factor = Factor(data=pd.DataFrame(expected_values, index=index),
                        name='test1')
        expected.add_factor(factor)

        # Use the unittest assertion rather than a bare `assert`, which is
        # removed under `python -O` and gives no diagnostic on failure.
        self.assertIsInstance(calculated, FactorContainer)
        self.assertEqual(calculated.property, expected.property)
        assert_frame_equal(calculated.data, expected.data)
Esempio n. 4
0
    def test_imputer_1(self):
        """Check FactorImputer on a three-factor FactorContainer.

        Runs the MOST_FREQ and MEDIAN numerical strategies, then adds an
        industry-code factor and verifies that the CUSTOM categorical
        strategy substitutes ``custom_value`` for its missing entry.
        """
        # Three numerical factors, each with a single NaN, indexed by
        # (trade_date, ticker) using string dates.
        index = pd.MultiIndex.from_product(
            [['2014-01-30', '2014-02-28'], ['001', '002', '003', '004']],
            names=['trade_date', 'ticker'])
        data1 = pd.DataFrame(index=index,
                             data=[1.0, 3.0, 3.0, np.nan, 5.0, 5.0, 6.0, 8.0])
        factor_test1 = Factor(data=data1, name='test1')

        data2 = pd.DataFrame(index=index,
                             data=[3.0, 2.0, 3.0, 7.0, 7.0, np.nan, 6.0, 6.0])
        factor_test2 = Factor(data=data2, name='test2')

        data3 = pd.DataFrame(index=index,
                             data=[3.0, 3.0, np.nan, 5.0, 6.0, 7.0, 6.0, 6.0])
        factor_test3 = Factor(data=data3, name='test3')

        fc = FactorContainer('2014-01-30', '2014-02-28',
                             [factor_test1, factor_test2, factor_test3])

        # Expected frames carry dt(...) values in the trade_date level.
        index = pd.MultiIndex.from_product(
            [[dt(2014, 1, 30), dt(2014, 2, 28)], ['001', '002', '003', '004']],
            names=['trade_date', 'ticker'])

        # The two numerical strategies share one expectation for this data.
        numeric_expected = {
            'test1': [1.0, 3.0, 3.0, 3.0, 5.0, 5.0, 6.0, 8.0],
            'test2': [3.0, 2.0, 3.0, 7.0, 7.0, 6.0, 6.0, 6.0],
            'test3': [3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 6.0, 6.0]
        }

        calculated = FactorImputer(
            numerical_strategy=NAStrategy.MOST_FREQ).fit_transform(fc)
        expected = pd.DataFrame(numeric_expected, index=index)
        assert_frame_equal(calculated, expected)

        calculated = FactorImputer(
            numerical_strategy=NAStrategy.MEDIAN).fit_transform(fc)
        expected = pd.DataFrame(numeric_expected, index=index)
        assert_frame_equal(calculated, expected)

        # Categorical factor with one missing entry.
        industry = pd.DataFrame(
            index=index, data=['a', 'a', 'a', 'a', 'a', 'a', np.nan, 'a'])
        factor_industry = Factor(
            data=industry,
            name='industry',
            property_dict={'type': FactorType.INDUSTY_CODE})
        fc.add_factor(factor=factor_industry)
        calculated = FactorImputer(numerical_strategy=NAStrategy.MEDIAN,
                                   categorical_strategy=NAStrategy.CUSTOM,
                                   custom_value='other').fit_transform(fc)
        calculated.sort_index(axis=1, inplace=True)

        mixed_expected = dict(
            numeric_expected,
            industry=['a', 'a', 'a', 'a', 'a', 'a', 'other', 'a'])
        expected = pd.DataFrame(mixed_expected, index=index, dtype=object)
        # Column order is part of assert_frame_equal's comparison. The
        # result was sorted by label above, so sort the expectation too
        # instead of relying on how the DataFrame constructor orders dict
        # keys (sorted on old pandas, insertion order on pandas >= 0.23
        # with Python >= 3.6).
        expected.sort_index(axis=1, inplace=True)
        assert_frame_equal(calculated, expected)
Esempio n. 5
0
# Load the return data and convert it to 1-month forward returns.
data_return = fwd_return(load_factor_data_from_csv('return.csv'))
factor_return = Factor(
    data=data_return,
    name='1_Fwd_Return',
    property_dict={
        'type': FactorType.FWD_RETURN,
        'norm_type': FactorNormType.Industry_Neutral,
    },
)

# Create the FactorContainer instance and load all factor information.
fc = FactorContainer(
    start_date='2014-01-01',
    end_date='2014-03-01',
    factors=[factor_mv, factor_pb, factor_return, factor_industry_code],
)

# Pipeline definition: each step is a (name, transformer) pair.

# Step 1: handle the few N/A values (numerical strategy: median;
# categorical strategy: the custom value 'other').
step_1 = (
    'imputer',
    FactorImputer(
        numerical_strategy=NAStrategy.MEDIAN,
        categorical_strategy=NAStrategy.CUSTOM,
        custom_value='other',
    ),
)

# Step 2: winsorize (remove extreme values).
step_2 = ('winsorize', FactorWinsorizer(quantile_range=(5, 95)))

# Step 3: standardize.
step_3 = ('std', FactorStandardizer())

# Step 4: neutralize.
step_4 = ('neutralize', FactorNeutralizer())

# Step 5: compute the factor IC.
step_5 = ('ic', FactorIC())

pipeline = AlphaPipeline([step_1, step_2, step_3, step_4, step_5])
ic = pipeline.fit_predict(fc)
Esempio n. 6
0
factor_return = Factor(
    data=data_return,
    name='1_Fwd_Return',
    property_dict={'type': FactorType.FWD_RETURN},
)

# Load industry data. (Wind's industry codes for early years are incomplete;
# another data source may work better. Used here for illustration only.)
data_industry_code = load_factor_data_from_csv('sw.csv')
factor_industry_code = Factor(
    data=data_industry_code,
    name='industry_code',
    property_dict={'type': FactorType.INDUSTY_CODE},
)

# Create the FactorContainer instance and load all factor information.
fc = FactorContainer(
    start_date='2014-01-01',
    end_date='2014-03-01',
    factors=[factor_mv, factor_pb, factor_return, factor_industry_code],
)

# Each transformer below is created with out_container=True and its
# fit_transform result is fed to the next step.

# Step 1: handle the few N/A values (numerical strategy: median;
# categorical strategy: the custom value 'other').
imputer = FactorImputer(
    numerical_strategy=NAStrategy.MEDIAN,
    categorical_strategy=NAStrategy.CUSTOM,
    custom_value='other',
    out_container=True,
)
fc = imputer.fit_transform(fc)

# Step 2: winsorize (remove extreme values).
winsorizer = FactorWinsorizer(quantile_range=(5, 95), out_container=True)
fc = winsorizer.fit_transform(fc)

# Step 3: standardize.
standardizer = FactorStandardizer(out_container=True)
fc = standardizer.fit_transform(fc)

# Step 4: neutralize.
neutralizer = FactorNeutralizer(out_container=True)
fc = neutralizer.fit_transform(fc)

# Step 5: compute the factor IC.
ic_estimator = FactorIC()
ic = ic_estimator.predict(fc)
print(ic)