Example #1
def __init__(self,
             alpha_model,
             data_meta,
             universe,
             start_date,
             end_date,
             freq,
             benchmark=905,
             industry_cat='sw_adj',
             industry_level=1,
             dask_client=None):
    self.alpha_model = alpha_model
    self.data_meta = data_meta
    self.universe = universe
    self.benchmark = benchmark
    self.dates = makeSchedule(start_date, end_date, freq, 'china.sse')
    self.dates = [d.strftime('%Y-%m-%d') for d in self.dates]
    self.industry_cat = industry_cat
    self.industry_level = industry_level
    self.freq = freq
    self.horizon = map_freq(freq)
    self.engine = SqlEngine(self.data_meta.data_source)
    self.dask_client = dask_client
    self.total_data = None
    self.index_return = None
    self.risk_models = None
    self.alpha_models = None
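
A hedged construction sketch for the class this __init__ belongs to (the full Strategy class appears in Example #11). DataMeta mirrors the constructor shown in Example #5 and its usage in Example #8; ConstLinearModel and its arguments are assumptions about the alpha-mind API, not taken from this page:

universe = Universe('zz500')
data_meta = DataMeta(freq='60b',
                     universe=universe,
                     batch=1,
                     data_source=None)               # None is also what Example #8 passes
alpha_model = ConstLinearModel(features=['ROE'],
                               weights={'ROE': 1.})  # assumed signature
strategy = Strategy(alpha_model,
                    data_meta,
                    universe,
                    start_date='2017-01-01',
                    end_date='2018-04-27',
                    freq='60b')                      # benchmark defaults to 905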
Example #2
def __init__(self,
             alpha_model,
             data_meta,
             running_setting,
             dask_client=None):
    self.alpha_model = alpha_model
    self.data_meta = data_meta
    self.running_setting = running_setting
    self.engine = SqlEngine(self.data_meta.data_source)
    self.dask_client = dask_client
Example #3
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0):
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date,
                                           p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)

    dates = [d.strftime('%Y-%m-%d') for d in dates]

    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(
                                              ['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")
    return_df = engine.fetch_dx_return_range(universe,
                                             dates=dates,
                                             horizon=horizon)
    alpha_logger.info("return data loading finished")
    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)

    return dates, df[['trade_date', 'code', 'dx']], df[[
        'trade_date', 'code', 'weight', 'isOpen', 'industry_code', 'industry'
    ] + transformer.names]
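
A minimal invocation sketch for prepare_data, assuming a reachable database behind SqlEngine; DATA_ENGINE_URI is a placeholder connection string, and the factor 'ROE' and benchmark 905 reappear in the tests below:

engine = SqlEngine(DATA_ENGINE_URI)
dates, return_df, factor_df = prepare_data(engine,
                                           factors=['ROE'],
                                           start_date='2017-01-01',
                                           end_date='2017-12-31',
                                           frequency='60b',
                                           universe=Universe('zz500'),
                                           benchmark=905,
                                           warm_start=1)
# return_df holds trade_date/code/dx; factor_df holds weight, isOpen,
# industry and the requested factor columns.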
Example #4
def fetch_train_data(self, ref_date, alpha_model: ModelBase):
    return fetch_train_phase(SqlEngine(self.data_source),
                             alpha_model.formulas,
                             ref_date,
                             self.freq,
                             self.universe,
                             self.batch,
                             self.neutralized_risk,
                             self.risk_model,
                             self.pre_process,
                             self.post_process,
                             self.warm_start,
                             fit_target=alpha_model.fit_target)
Example #5
def __init__(self,
             freq: str,
             universe: Universe,
             batch: int,
             neutralized_risk: Iterable[str] = None,
             risk_model: str = 'short',
             pre_process: Iterable[object] = None,
             post_process: Iterable[object] = None,
             warm_start: int = 0,
             data_source: str = None):
    self.data_source = data_source
    self.engine = SqlEngine(self.data_source)
    self.freq = freq
    self.universe = universe
    self.batch = batch
    self.neutralized_risk = neutralized_risk
    self.risk_model = risk_model
    self.pre_process = _map_process(pre_process)
    self.post_process = _map_process(post_process)
    self.warm_start = warm_start
Example #6
def setUp(self):
    self.engine = SqlEngine(DATA_ENGINE_URI)
    dates_list = bizDatesList('china.sse', '2010-10-01', '2018-04-27')
    self.ref_date = random.choice(dates_list).strftime('%Y-%m-%d')
    alpha_logger.info("Test date: {0}".format(self.ref_date))
Example #7
class TestSqlEngine(unittest.TestCase):
    def setUp(self):
        self.engine = SqlEngine(DATA_ENGINE_URI)
        dates_list = bizDatesList('china.sse', '2010-10-01', '2018-04-27')
        self.ref_date = random.choice(dates_list).strftime('%Y-%m-%d')
        alpha_logger.info("Test date: {0}".format(self.ref_date))

    def test_sql_engine_fetch_codes(self):
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        query = select([UniverseTable.code]).where(
            and_(UniverseTable.trade_date == ref_date,
                 or_(UniverseTable.zz500 == 1, UniverseTable.zz1000 == 1)))

        df = pd.read_sql(query, con=self.engine.engine).sort_values('code')
        self.assertListEqual(codes, list(df.code.values))

    def test_sql_engine_fetch_codes_range(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)

        query = select([UniverseTable.trade_date, UniverseTable.code]).where(
            and_(UniverseTable.trade_date.in_(ref_dates),
                 or_(UniverseTable.zz500 == 1, UniverseTable.zz1000 == 1)))

        df = pd.read_sql(query, con=self.engine.engine).sort_values('code')

        for ref_date in ref_dates:
            calculated_codes = list(
                sorted(codes[codes.trade_date == ref_date].code.values))
            expected_codes = list(
                sorted(df[df.trade_date == ref_date].code.values))
            self.assertListEqual(calculated_codes, expected_codes)

    def test_sdl_engine_fetch_codes_with_exclude_universe(self):
        ref_date = self.ref_date
        universe = Universe('zz500') - Universe('cyb')
        codes = self.engine.fetch_codes(ref_date, universe)

        query = select([UniverseTable.code]).where(
            and_(UniverseTable.trade_date == ref_date,
                 UniverseTable.zz500 == 1, UniverseTable.cyb == 0))

        df = pd.read_sql(query, con=self.engine.engine).sort_values('code')
        self.assertListEqual(codes, list(df.code.values))

    def test_sql_engine_fetch_dx_return(self):
        horizon = 4
        offset = 1
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        dx_return = self.engine.fetch_dx_return(ref_date,
                                                codes,
                                                horizon=horizon,
                                                offset=offset)
        start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
        end_date = advanceDateByCalendar('china.sse', ref_date, '6b')

        query = select([Market.code, Market.chgPct]).where(
            and_(Market.trade_date.between(start_date, end_date),
                 Market.code.in_(dx_return.code.unique().tolist())))

        df = pd.read_sql(query, con=self.engine.engine)
        res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())
        np.testing.assert_array_almost_equal(dx_return.dx.values,
                                             res.chgPct.values)

        horizon = 4
        offset = 0
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        dx_return = self.engine.fetch_dx_return(ref_date,
                                                codes,
                                                horizon=horizon,
                                                offset=offset)
        start_date = advanceDateByCalendar('china.sse', ref_date, '1b')
        end_date = advanceDateByCalendar('china.sse', ref_date, '5b')

        query = select([Market.code, Market.chgPct]).where(
            and_(Market.trade_date.between(start_date, end_date),
                 Market.code.in_(dx_return.code.unique().tolist())))

        df = pd.read_sql(query, con=self.engine.engine)
        res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())
        np.testing.assert_array_almost_equal(dx_return.dx.values,
                                             res.chgPct.values)

    def test_sql_engine_fetch_dx_return_with_benchmark(self):
        horizon = 4
        offset = 1
        benchmark = 300
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        dx_return = self.engine.fetch_dx_return(ref_date,
                                                codes,
                                                horizon=horizon,
                                                offset=offset,
                                                benchmark=benchmark)
        start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
        end_date = advanceDateByCalendar('china.sse', ref_date, '6b')

        query = select([Market.code, Market.chgPct]).where(
            and_(Market.trade_date.between(start_date, end_date),
                 Market.code.in_(dx_return.code.unique().tolist())))

        df = pd.read_sql(query, con=self.engine.engine)
        res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())

        query = select(
            [IndexMarket.indexCode.label('code'), IndexMarket.chgPct]).where(
                and_(IndexMarket.trade_date.between(start_date, end_date),
                     IndexMarket.indexCode == benchmark))

        df = pd.read_sql(query, con=self.engine.engine)
        b_res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())
        np.testing.assert_array_almost_equal(
            dx_return.dx.values, res.chgPct.values - b_res.chgPct.values)

        horizon = 4
        offset = 0
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        dx_return = self.engine.fetch_dx_return(ref_date,
                                                codes,
                                                horizon=horizon,
                                                offset=offset,
                                                benchmark=benchmark)
        start_date = advanceDateByCalendar('china.sse', ref_date, '1b')
        end_date = advanceDateByCalendar('china.sse', ref_date, '5b')

        query = select([Market.code, Market.chgPct]).where(
            and_(Market.trade_date.between(start_date, end_date),
                 Market.code.in_(dx_return.code.unique().tolist())))

        df = pd.read_sql(query, con=self.engine.engine)
        res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())

        query = select(
            [IndexMarket.indexCode.label('code'), IndexMarket.chgPct]).where(
                and_(IndexMarket.trade_date.between(start_date, end_date),
                     IndexMarket.indexCode == benchmark))

        df = pd.read_sql(query, con=self.engine.engine)
        b_res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())
        np.testing.assert_array_almost_equal(
            dx_return.dx.values, res.chgPct.values - b_res.chgPct.values)

    def test_sql_engine_fetch_dx_return_range(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        universe = Universe('zz500') + Universe('zz1000')

        dx_return = self.engine.fetch_dx_return_range(universe,
                                                      dates=ref_dates,
                                                      horizon=4,
                                                      offset=1)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
        groups = codes.groupby('trade_date')

        for ref_date, g in groups:
            start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
            end_date = advanceDateByCalendar('china.sse', ref_date, '6b')

            query = select([Market.code, Market.chgPct]).where(
                and_(Market.trade_date.between(start_date, end_date),
                     Market.code.in_(g.code.unique().tolist())))

            df = pd.read_sql(query, con=self.engine.engine)
            res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())
            calculated_return = dx_return[dx_return.trade_date == ref_date]
            np.testing.assert_array_almost_equal(calculated_return.dx.values,
                                                 res.chgPct.values)

    def test_sql_engine_fetch_dx_return_range_with_benchmark(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        universe = Universe('zz500') + Universe('zz1000')
        benchmark = 300

        dx_return = self.engine.fetch_dx_return_range(universe,
                                                      dates=ref_dates,
                                                      horizon=4,
                                                      offset=1,
                                                      benchmark=benchmark)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
        groups = codes.groupby('trade_date')

        for ref_date, g in groups:
            start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
            end_date = advanceDateByCalendar('china.sse', ref_date, '6b')

            query = select([Market.code, Market.chgPct]).where(
                and_(Market.trade_date.between(start_date, end_date),
                     Market.code.in_(g.code.unique().tolist())))

            df = pd.read_sql(query, con=self.engine.engine)
            res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())

            query = select(
                [IndexMarket.indexCode.label('code'),
                 IndexMarket.chgPct]).where(
                     and_(IndexMarket.trade_date.between(start_date, end_date),
                          IndexMarket.indexCode == benchmark))

            df = pd.read_sql(query, con=self.engine.engine)
            b_res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())

            calculated_return = dx_return[dx_return.trade_date == ref_date]
            np.testing.assert_array_almost_equal(
                calculated_return.dx.values,
                res.chgPct.values - b_res.chgPct.values)

    def test_sql_engine_fetch_dx_return_with_universe_adjustment(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', '2017-01-26', '-6m'),
            '2017-01-26', '60b', 'china.sse')

        universe = Universe('zz500')
        dx_return = self.engine.fetch_dx_return_range(universe,
                                                      dates=ref_dates,
                                                      horizon=4,
                                                      offset=1)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
        groups = codes.groupby('trade_date')

        for ref_date, g in groups:
            start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
            end_date = advanceDateByCalendar('china.sse', ref_date, '6b')

            query = select([Market.code, Market.chgPct]).where(
                and_(Market.trade_date.between(start_date, end_date),
                     Market.code.in_(g.code.unique().tolist())))

            df = pd.read_sql(query, con=self.engine.engine)
            res = df.groupby('code').apply(lambda x: np.log(1. + x).sum())
            calculated_return = dx_return[dx_return.trade_date == ref_date]
            np.testing.assert_array_almost_equal(calculated_return.dx.values,
                                                 res.chgPct.values)

    def test_sql_engine_fetch_dx_return_index(self):
        horizon = 4
        offset = 1
        ref_date = self.ref_date
        dx_return = self.engine.fetch_dx_return_index(ref_date,
                                                      905,
                                                      horizon=horizon,
                                                      offset=offset)

        start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
        end_date = advanceDateByCalendar('china.sse', ref_date, '6b')

        query = select([IndexMarket.indexCode, IndexMarket.chgPct]).where(
            and_(IndexMarket.trade_date.between(start_date, end_date),
                 IndexMarket.indexCode == 905))

        df = pd.read_sql(query, con=self.engine.engine)
        res = df.groupby('indexCode').apply(lambda x: np.log(1. + x).sum())
        np.testing.assert_array_almost_equal(dx_return.dx.values,
                                             res.chgPct.values)

    def test_sql_engine_fetch_dx_return_index_range(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        index_code = 906

        dx_return = self.engine.fetch_dx_return_index_range(index_code,
                                                            dates=ref_dates,
                                                            horizon=4,
                                                            offset=1)

        for ref_date in ref_dates:
            start_date = advanceDateByCalendar('china.sse', ref_date, '2b')
            end_date = advanceDateByCalendar('china.sse', ref_date, '6b')

            query = select([IndexMarket.indexCode, IndexMarket.chgPct]).where(
                and_(IndexMarket.trade_date.between(start_date, end_date),
                     IndexMarket.indexCode == index_code))

            df = pd.read_sql(query, con=self.engine.engine)
            res = df.groupby('indexCode').apply(lambda x: np.log(1. + x).sum())
            calculated_return = dx_return[dx_return.trade_date == ref_date]
            np.testing.assert_array_almost_equal(calculated_return.dx.values,
                                                 res.chgPct.values)

    def test_sql_engine_fetch_factor(self):
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)
        factor = 'ROE'

        factor_data = self.engine.fetch_factor(ref_date, factor, codes)

        query = select([Uqer.code, Uqer.ROE]).where(
            and_(Uqer.trade_date == ref_date, Uqer.code.in_(codes)))

        df = pd.read_sql(query, con=self.engine.engine).sort_values('code')
        np.testing.assert_array_almost_equal(factor_data.ROE.values,
                                             df.ROE.values)

    def test_sql_engine_fetch_factor_range(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        universe = Universe('zz500') + Universe('zz1000')
        factor = 'ROE'

        factor_data = self.engine.fetch_factor_range(universe,
                                                     factor,
                                                     dates=ref_dates)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)
        groups = codes.groupby('trade_date')

        for ref_date, g in groups:
            query = select([Uqer.code, Uqer.ROE]).where(
                and_(Uqer.trade_date == ref_date,
                     Uqer.code.in_(g.code.unique().tolist())))

            df = pd.read_sql(query, con=self.engine.engine)
            calculated_factor = factor_data[factor_data.trade_date == ref_date]
            np.testing.assert_array_almost_equal(calculated_factor.ROE.values,
                                                 df.ROE.values)

    def test_sql_engine_fetch_factor_range_forward(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-6m'),
            self.ref_date, '60b', 'china.sse')
        ref_dates = ref_dates + [
            advanceDateByCalendar('china.sse', ref_dates[-1],
                                  '60b').strftime('%Y-%m-%d')
        ]
        universe = Universe('zz500') + Universe('zz1000')
        factor = 'ROE'

        factor_data = self.engine.fetch_factor_range_forward(universe,
                                                             factor,
                                                             dates=ref_dates)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates[:-1])
        groups = codes.groupby('trade_date')

        for ref_date, g in groups:
            forward_ref_date = advanceDateByCalendar(
                'china.sse', ref_date, '60b').strftime('%Y-%m-%d')
            query = select([Uqer.code, Uqer.ROE]).where(
                and_(Uqer.trade_date == forward_ref_date,
                     Uqer.code.in_(g.code.unique().tolist())))

            df = pd.read_sql(query, con=self.engine.engine)
            calculated_factor = factor_data[factor_data.trade_date == ref_date]
            calculated_factor.set_index('code', inplace=True)
            calculated_factor = calculated_factor.loc[df.code]
            np.testing.assert_array_almost_equal(calculated_factor.dx.values,
                                                 df.ROE.values)

    def test_sql_engine_fetch_benchmark(self):
        ref_date = self.ref_date
        benchmark = 906

        index_data = self.engine.fetch_benchmark(ref_date, benchmark)

        query = select([
            IndexComponent.code, (IndexComponent.weight / 100.).label('weight')
        ]).where(
            and_(IndexComponent.trade_date == ref_date,
                 IndexComponent.indexCode == benchmark))

        df = pd.read_sql(query, con=self.engine.engine)
        np.testing.assert_array_almost_equal(df.weight.values,
                                             index_data.weight.values)

    def test_sql_engine_fetch_benchmark_range(self):
        ref_dates = makeSchedule(
            advanceDateByCalendar('china.sse', self.ref_date, '-9m'),
            self.ref_date, '60b', 'china.sse')
        benchmark = 906
        index_data = self.engine.fetch_benchmark_range(benchmark,
                                                       dates=ref_dates)

        query = select([
            IndexComponent.trade_date, IndexComponent.code,
            (IndexComponent.weight / 100.).label('weight')
        ]).where(
            and_(IndexComponent.trade_date.in_(ref_dates),
                 IndexComponent.indexCode == benchmark))

        df = pd.read_sql(query, con=self.engine.engine)
        for ref_date in ref_dates:
            calculated_data = index_data[index_data.trade_date == ref_date]
            expected_data = df[df.trade_date == ref_date]
            np.testing.assert_array_almost_equal(calculated_data.weight.values,
                                                 expected_data.weight.values)

    def test_sql_engine_fetch_risk_model(self):
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        risk_cov, risk_exp = self.engine.fetch_risk_model(ref_date,
                                                          codes,
                                                          risk_model='short')
        self.assertListEqual(risk_cov.Factor.tolist(),
                             risk_cov.columns[2:].tolist())

        query = select([RiskCovShort
                        ]).where(RiskCovShort.trade_date == ref_date)

        cov_df = pd.read_sql(query,
                             con=self.engine.engine).sort_values('FactorID')
        factors = cov_df.Factor.tolist()
        np.testing.assert_array_almost_equal(risk_cov[factors].values,
                                             cov_df[factors].values)

        query = select([RiskExposure]).where(
            and_(RiskExposure.trade_date == ref_date,
                 RiskExposure.code.in_(codes)))

        exp_df = pd.read_sql(query, con=self.engine.engine)
        np.testing.assert_array_almost_equal(exp_df[factors].values,
                                             risk_exp[factors].values)

    def test_sql_engine_fetch_industry_matrix(self):
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        ind_matrix = self.engine.fetch_industry_matrix(ref_date, codes, 'sw',
                                                       1)

        query = select([Industry.code, Industry.industryName1]).where(
            and_(Industry.trade_date == ref_date,
                 Industry.industry == '申万行业分类', Industry.code.in_(codes)))

        df = pd.read_sql(query, con=self.engine.engine)
        df = pd.get_dummies(df, prefix="", prefix_sep="")

        self.assertEqual(len(ind_matrix), len(df))
        np.testing.assert_array_almost_equal(df[ind_matrix.columns[2:]].values,
                                             ind_matrix.iloc[:, 2:].values)

    def test_sql_engine_fetch_factor_by_categories(self):
        ref_date = self.ref_date
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes(ref_date, universe)

        factor1 = {'f': CSRank('ROE', groups='sw1')}
        factor2 = {'f': CSQuantiles('ROE', groups='sw1')}
        raw_factor = 'ROE'

        df1 = self.engine.fetch_factor(ref_date, factor1, codes)
        df2 = self.engine.fetch_factor(ref_date, factor2, codes)
        df3 = self.engine.fetch_factor(ref_date, raw_factor, codes)

        ind_matrix = self.engine.fetch_industry_matrix(ref_date, codes, 'sw',
                                                       1)
        cols = sorted(ind_matrix.columns[2:].tolist())

        series = (ind_matrix[cols] *
                  np.array(range(1,
                                 len(cols) + 1))).sum(axis=1)
        df3['cat'] = series.values

        expected_rank = df3[[
            'ROE', 'cat'
        ]].groupby('cat').transform(lambda x: rankdata(x.values))
        expected_rank[np.isnan(df3.ROE)] = np.nan
        df3['rank'] = expected_rank['ROE'].values
        np.testing.assert_array_almost_equal(df3['rank'].values,
                                             df1['f'].values)

        expected_quantile = df3[['ROE', 'cat']].groupby('cat').transform(
            lambda x: rankdata(x.values) / (len(x) + 1))
        expected_quantile[np.isnan(df3.ROE)] = np.nan
        df3['quantile'] = expected_quantile['ROE'].values
        np.testing.assert_array_almost_equal(df3['quantile'].values,
                                             df2['f'].values)
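
All of the dx tests above verify the same return convention: with offset o and horizon h, the daily chgPct window runs from o+1 to o+h+1 business days after ref_date (e.g. offset=1, horizon=4 gives the 2b..6b window queried above), and dx is the sum of daily log returns over that window. A toy check of the identity the assertions rely on (the chgPct values are made up):

import numpy as np

daily_chg = np.array([0.010, -0.020, 0.005, 0.000, 0.012])  # hypothetical chgPct values
dx = np.log(1. + daily_chg).sum()                           # what fetch_dx_return reports
# summed log returns compound back to the simple holding-period return
assert np.isclose(np.exp(dx) - 1., np.prod(1. + daily_chg) - 1.)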
Example #8
                               fetch_data_package,
                               map_freq)
    from PyFin.api import LAST, SHIFT

    freq = '60b'
    universe = Universe('custom', ['ashare_ex'])
    batch = 1
    neutralized_risk = industry_styles
    risk_model = 'short'
    pre_process = [winsorize_normal, standardize]
    post_process = [standardize]
    warm_start = 3
    data_source = None
    horizon = map_freq(freq)

    engine = SqlEngine(data_source)

    fit_intercept = True
    kernal_feature = 'roe_q'
    regress_features = {kernal_feature: LAST(kernal_feature),
                        kernal_feature + '_l1': SHIFT(kernal_feature, 1),
                        kernal_feature + '_l2': SHIFT(kernal_feature, 2),
                        kernal_feature + '_l3': SHIFT(kernal_feature, 3)
                        }
    const_features = {kernal_feature: LAST(kernal_feature)}
    fit_target = [kernal_feature]

    data_meta = DataMeta(freq=freq,
                         universe=universe,
                         batch=batch,
                         neutralized_risk=neutralized_risk,
Example #9
                    base_universe=self.base_universe,
                    exclude_universe=self.exclude_universe,
                    special_codes=self.special_codes,
                    filter_cond=encode(self.filter_cond))

    @classmethod
    def load(cls, universe_desc: dict):
        name = universe_desc['name']
        base_universe = universe_desc['base_universe']
        exclude_universe = universe_desc['exclude_universe']
        special_codes = universe_desc['special_codes']
        filter_cond = decode(universe_desc['filter_cond'])

        return cls(name=name,
                   base_universe=base_universe,
                   exclude_universe=exclude_universe,
                   special_codes=special_codes,
                   filter_cond=filter_cond)


if __name__ == '__main__':
    from PyFin.api import *
    from alphamind.data.engines.sqlengine import SqlEngine

    engine = SqlEngine()
    universe = Universe('custom', ['zz800'], exclude_universe=['Bank'])
    print(
        universe.query(engine, start_date='2018-04-26', end_date='2018-04-26'))

    print(universe.query(engine, dates=['2017-12-21', '2017-12-25']))
Example #10
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0,
                 fit_target: Union[Transformer, object] = None):
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date,
                                           p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)

    dates = [d.strftime('%Y-%m-%d') for d in dates]

    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(
                                              ['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1],
                                              frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates +
                                                      [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))
    alpha_logger.info("fit target data loading finished")

    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)
    df.dropna(inplace=True)

    return dates, df[[
        'trade_date', 'code', 'dx'
    ]], df[['trade_date', 'code', 'weight', 'industry_code', 'industry'] +
           transformer.names]
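
Compared with Example #3, this variant accepts a fit_target expression, which replaces the default dx return as the modeling target. A hedged sketch of that branch; the LAST expression and the 'roe_q' field follow Example #8, and DATA_ENGINE_URI is a placeholder:

from PyFin.api import LAST

engine = SqlEngine(DATA_ENGINE_URI)
dates, target_df, factor_df = prepare_data(engine,
                                           factors=['ROE'],
                                           start_date='2017-01-01',
                                           end_date='2017-12-31',
                                           frequency='60b',
                                           universe=Universe('zz500'),
                                           benchmark=905,
                                           fit_target=LAST('roe_q'))  # next period's roe_q fills the 'dx' column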
Example #11
class Strategy(object):
    def __init__(self,
                 alpha_model,
                 data_meta,
                 universe,
                 start_date,
                 end_date,
                 freq,
                 benchmark=905,
                 industry_cat='sw_adj',
                 industry_level=1,
                 dask_client=None):
        self.alpha_model = alpha_model
        self.data_meta = data_meta
        self.universe = universe
        self.benchmark = benchmark
        self.dates = makeSchedule(start_date, end_date, freq, 'china.sse')
        self.dates = [d.strftime('%Y-%m-%d') for d in self.dates]
        self.industry_cat = industry_cat
        self.industry_level = industry_level
        self.freq = freq
        self.horizon = map_freq(freq)
        self.engine = SqlEngine(self.data_meta.data_source)
        self.dask_client = dask_client

    def prepare_backtest_data(self):
        total_factors = self.engine.fetch_factor_range(
            self.universe, self.alpha_model.formulas, dates=self.dates)
        alpha_logger.info("alpha factor data loading finished ...")

        total_industry = self.engine.fetch_industry_matrix_range(
            self.universe,
            dates=self.dates,
            category=self.industry_cat,
            level=self.industry_level)
        alpha_logger.info("industry data loading finished ...")

        total_benchmark = self.engine.fetch_benchmark_range(
            dates=self.dates, benchmark=self.benchmark)
        alpha_logger.info("benchmark data loading finished ...")

        total_risk_cov, total_risk_exposure = self.engine.fetch_risk_model_range(
            self.universe,
            dates=self.dates,
            risk_model=self.data_meta.risk_model)
        alpha_logger.info("risk_model data loading finished ...")

        total_returns = self.engine.fetch_dx_return_range(self.universe,
                                                          dates=self.dates,
                                                          horizon=self.horizon,
                                                          offset=1)
        alpha_logger.info("returns data loading finished ...")

        total_data = pd.merge(total_factors,
                              total_industry,
                              on=['trade_date', 'code'])
        total_data = pd.merge(total_data,
                              total_benchmark,
                              on=['trade_date', 'code'],
                              how='left')
        total_data.fillna({'weight': 0.}, inplace=True)
        total_data = pd.merge(total_data,
                              total_returns,
                              on=['trade_date', 'code'])
        total_data = pd.merge(total_data,
                              total_risk_exposure,
                              on=['trade_date', 'code'])

        is_in_benchmark = (total_data.weight >
                           0.).astype(float).values.reshape((-1, 1))
        total_data.loc[:, 'benchmark'] = is_in_benchmark
        total_data.loc[:, 'total'] = np.ones_like(is_in_benchmark)
        total_data.sort_values(['trade_date', 'code'], inplace=True)
        self.index_return = self.engine.fetch_dx_return_index_range(
            self.benchmark, dates=self.dates, horizon=self.horizon,
            offset=1).set_index('trade_date')
        self.total_data = total_data
        self.total_risk_cov = total_risk_cov

    def _create_lu_bounds(self, running_setting, codes, benchmark_w):

        codes = np.array(codes)

        if running_setting.weights_bandwidth:
            lbound = np.maximum(
                0., benchmark_w - running_setting.weights_bandwidth)
            ubound = running_setting.weights_bandwidth + benchmark_w

        lb = running_setting.lbound
        ub = running_setting.ubound

        if lb or ub:
            if not isinstance(lb, dict):
                lbound = np.ones_like(benchmark_w) * lb
            else:
                lbound = np.zeros_like(benchmark_w)
                for c in lb:
                    lbound[codes == c] = lb[c]

                if 'other' in lb:
                    for i, c in enumerate(codes):
                        if c not in lb:
                            lbound[i] = lb['other']
            if not isinstance(ub, dict):
                ubound = np.ones_like(benchmark_w) * ub
            else:
                ubound = np.ones_like(benchmark_w)
                for c in ub:
                    ubound[codes == c] = ub[c]

                if 'other' in ub:
                    for i, c in enumerate(codes):
                        if c not in ub:
                            ubound[i] = ub['other']
        return lbound, ubound

    def run(self, running_setting):
        alpha_logger.info("starting backting ...")
        total_data_groups = self.total_data.groupby('trade_date')

        rets = []
        turn_overs = []
        leverags = []
        previous_pos = pd.DataFrame()
        executor = copy.deepcopy(running_setting.executor)
        positions = pd.DataFrame()

        if self.dask_client is None:
            models = {}
            for ref_date, _ in total_data_groups:
                models[ref_date] = train_model(ref_date.strftime('%Y-%m-%d'),
                                               self.alpha_model,
                                               self.data_meta)
        else:

            def worker(parameters):
                new_model = train_model(parameters[0].strftime('%Y-%m-%d'),
                                        parameters[1], parameters[2])
                return parameters[0], new_model

            l = self.dask_client.map(worker,
                                     [(d[0], self.alpha_model, self.data_meta)
                                      for d in total_data_groups])
            results = self.dask_client.gather(l)
            models = dict(results)

        for ref_date, this_data in total_data_groups:
            new_model = models[ref_date]
            codes = this_data.code.values.tolist()

            if previous_pos.empty:
                current_position = None
            else:
                previous_pos.set_index('code', inplace=True)
                remained_pos = previous_pos.loc[codes]

                remained_pos.fillna(0., inplace=True)
                current_position = remained_pos.weight.values

            if running_setting.rebalance_method == 'tv':
                risk_cov = self.total_risk_cov[self.total_risk_cov.trade_date
                                               == ref_date]
                sec_cov = self._generate_sec_cov(this_data, risk_cov)
            else:
                sec_cov = None

            benchmark_w = this_data.weight.values
            constraints = LinearConstraints(running_setting.bounds, this_data,
                                            benchmark_w)

            lbound, ubound = self._create_lu_bounds(running_setting, codes,
                                                    benchmark_w)

            features = new_model.features
            dfs = []
            for name in features:
                data_cleaned = this_data.dropna(subset=[name])
                raw_factors = data_cleaned[[name]].values
                new_factors = factor_processing(
                    raw_factors,
                    pre_process=self.data_meta.pre_process,
                    risk_factors=data_cleaned[
                        self.data_meta.neutralized_risk].values.astype(float)
                    if self.data_meta.neutralized_risk else None,
                    post_process=self.data_meta.post_process)
                df = pd.DataFrame(new_factors,
                                  columns=[name],
                                  index=data_cleaned.code)
                dfs.append(df)

            new_factors = pd.concat(dfs, axis=1)
            new_factors = new_factors.loc[codes].fillna(new_factors.median())
            er = new_model.predict(new_factors).astype(float)

            alpha_logger.info('{0} re-balance: {1} codes'.format(
                ref_date, len(er)))
            target_pos = self._calculate_pos(running_setting,
                                             er,
                                             this_data,
                                             constraints,
                                             benchmark_w,
                                             lbound,
                                             ubound,
                                             sec_cov=sec_cov,
                                             current_position=current_position)

            target_pos['code'] = codes
            target_pos['trade_date'] = ref_date

            turn_over, executed_pos = executor.execute(target_pos=target_pos)
            leverage = executed_pos.weight.abs().sum()

            ret = executed_pos.weight.values @ (np.exp(this_data.dx.values) -
                                                1.)
            rets.append(np.log(1. + ret))
            executor.set_current(executed_pos)
            turn_overs.append(turn_over)
            leverags.append(leverage)
            positions = positions.append(executed_pos)
            previous_pos = executed_pos

        positions['benchmark_weight'] = self.total_data['weight'].values
        positions['dx'] = self.total_data.dx.values

        trade_dates = positions.trade_date.unique()
        ret_df = pd.DataFrame(
            {
                'returns': rets,
                'turn_over': turn_overs,
                'leverage': leverags
            },
            index=trade_dates)

        ret_df['benchmark_returns'] = self.index_return['dx']
        ret_df.loc[advanceDateByCalendar('china.sse', ret_df.index[-1],
                                         self.freq)] = 0.
        ret_df = ret_df.shift(1)
        ret_df.iloc[0] = 0.
        ret_df['excess_return'] = ret_df[
            'returns'] - ret_df['benchmark_returns'] * ret_df['leverage']

        return ret_df, positions

    @staticmethod
    def _generate_sec_cov(current_data, risk_cov):
        risk_exposure = current_data[all_styles].values
        risk_cov = risk_cov[all_styles].values
        special_risk = current_data['srisk'].values
        sec_cov = risk_exposure @ risk_cov @ risk_exposure.T / 10000 + np.diag(
            special_risk**2) / 10000
        return sec_cov

    def _calculate_pos(self, running_setting, er, data, constraints,
                       benchmark_w, lbound, ubound, sec_cov, current_position):
        more_opts = running_setting.more_opts
        target_pos, _ = er_portfolio_analysis(
            er,
            industry=data.industry_name.values,
            dx_return=None,
            constraints=constraints,
            detail_analysis=False,
            benchmark=benchmark_w,
            method=running_setting.rebalance_method,
            lbound=lbound,
            ubound=ubound,
            current_position=current_position,
            target_vol=more_opts.get('target_vol'),
            cov=sec_cov,
            turn_over_target=more_opts.get('turn_over_target'))
        return target_pos
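
run() and _create_lu_bounds read a fixed set of attributes off running_setting. A stand-in listing exactly the attributes referenced above; the real alpha-mind RunningSetting class may differ, NaiveExecutor is an assumed executor exposing the execute/set_current calls used in the loop, and 'risk_neutral' is an assumed rebalance method ('tv' would additionally exercise the risk covariance path):

class RunningSettingStub:
    weights_bandwidth = 0.01          # consumed by _create_lu_bounds
    lbound = None                     # scalar or {code: bound, 'other': bound}
    ubound = None
    rebalance_method = 'risk_neutral'
    bounds = {}                       # handed to LinearConstraints
    executor = NaiveExecutor()        # assumed; must implement execute/set_current
    more_opts = {}                    # e.g. target_vol, turn_over_target

strategy.prepare_backtest_data()      # strategy as built in the Example #1 sketch
ret_df, positions = strategy.run(RunningSettingStub())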
Example #12
class Strategy(object):
    def __init__(self,
                 alpha_model,
                 data_meta,
                 running_setting,
                 dask_client=None):
        self.alpha_model = alpha_model
        self.data_meta = data_meta
        self.running_setting = running_setting
        self.engine = SqlEngine(self.data_meta.data_source)
        self.dask_client = dask_client

    def run(self):
        alpha_logger.info("starting backting ...")

        total_factors = self.engine.fetch_factor_range(
            self.running_setting.universe,
            self.alpha_model.formulas,
            dates=self.running_setting.dates)
        alpha_logger.info("alpha factor data loading finished ...")

        total_industry = self.engine.fetch_industry_matrix_range(
            self.running_setting.universe,
            dates=self.running_setting.dates,
            category=self.running_setting.industry_cat,
            level=self.running_setting.industry_level)
        alpha_logger.info("industry data loading finished ...")

        total_benchmark = self.engine.fetch_benchmark_range(
            dates=self.running_setting.dates,
            benchmark=self.running_setting.benchmark)
        alpha_logger.info("benchmark data loading finished ...")

        total_risk_cov, total_risk_exposure = self.engine.fetch_risk_model_range(
            self.running_setting.universe,
            dates=self.running_setting.dates,
            risk_model=self.data_meta.risk_model)
        alpha_logger.info("risk_model data loading finished ...")

        total_returns = self.engine.fetch_dx_return_range(
            self.running_setting.universe,
            dates=self.running_setting.dates,
            horizon=self.running_setting.horizon,
            offset=1)
        alpha_logger.info("returns data loading finished ...")

        total_data = pd.merge(total_factors,
                              total_industry,
                              on=['trade_date', 'code'])
        total_data = pd.merge(total_data,
                              total_benchmark,
                              on=['trade_date', 'code'],
                              how='left')
        total_data.fillna({'weight': 0.}, inplace=True)
        total_data = pd.merge(total_data,
                              total_returns,
                              on=['trade_date', 'code'])
        total_data = pd.merge(total_data,
                              total_risk_exposure,
                              on=['trade_date', 'code'])

        is_in_benchmark = (total_data.weight >
                           0.).astype(float).values.reshape((-1, 1))
        total_data.loc[:, 'benchmark'] = is_in_benchmark
        total_data.loc[:, 'total'] = np.ones_like(is_in_benchmark)
        total_data.sort_values(['trade_date', 'code'], inplace=True)
        total_data_groups = total_data.groupby('trade_date')

        rets = []
        turn_overs = []
        leverags = []
        previous_pos = pd.DataFrame()
        executor = copy.deepcopy(self.running_setting.executor)
        positions = pd.DataFrame()

        if self.dask_client is None:
            models = {}
            for ref_date, _ in total_data_groups:
                models[ref_date] = train_model(ref_date.strftime('%Y-%m-%d'),
                                               self.alpha_model,
                                               self.data_meta)
        else:

            def worker(parameters):
                new_model = train_model(parameters[0].strftime('%Y-%m-%d'),
                                        parameters[1], parameters[2])
                return parameters[0], new_model

            l = self.dask_client.map(worker,
                                     [(d[0], self.alpha_model, self.data_meta)
                                      for d in total_data_groups])
            results = self.dask_client.gather(l)
            models = dict(results)

        for ref_date, this_data in total_data_groups:
            new_model = models[ref_date]

            this_data = this_data.fillna(
                this_data[new_model.features].median())
            codes = this_data.code.values.tolist()

            if self.running_setting.rebalance_method == 'tv':
                risk_cov = total_risk_cov[total_risk_cov.trade_date ==
                                          ref_date]
                sec_cov = self._generate_sec_cov(this_data, risk_cov)
            else:
                sec_cov = None

            benchmark_w = this_data.weight.values
            constraints = LinearConstraints(self.running_setting.bounds,
                                            this_data, benchmark_w)

            lbound = np.maximum(
                0., benchmark_w - self.running_setting.weights_bandwidth)
            ubound = self.running_setting.weights_bandwidth + benchmark_w

            if previous_pos.empty:
                current_position = None
            else:
                previous_pos.set_index('code', inplace=True)
                remained_pos = previous_pos.loc[codes]

                remained_pos.fillna(0., inplace=True)
                current_position = remained_pos.weight.values

            features = new_model.features
            raw_factors = this_data[features].values
            new_factors = factor_processing(
                raw_factors,
                pre_process=self.data_meta.pre_process,
                risk_factors=this_data[self.data_meta.neutralized_risk].values.
                astype(float) if self.data_meta.neutralized_risk else None,
                post_process=self.data_meta.post_process)

            er = new_model.predict(pd.DataFrame(
                new_factors, columns=features)).astype(float)

            alpha_logger.info('{0} re-balance: {1} codes'.format(
                ref_date, len(er)))
            target_pos = self._calculate_pos(er,
                                             this_data,
                                             constraints,
                                             benchmark_w,
                                             lbound,
                                             ubound,
                                             sec_cov=sec_cov,
                                             current_position=current_position,
                                             **self.running_setting.more_opts)

            target_pos['code'] = codes
            target_pos['trade_date'] = ref_date

            turn_over, executed_pos = executor.execute(target_pos=target_pos)
            leverage = executed_pos.weight.abs().sum()

            ret = executed_pos.weight.values @ (np.exp(this_data.dx.values) -
                                                1.)
            rets.append(np.log(1. + ret))
            executor.set_current(executed_pos)
            turn_overs.append(turn_over)
            leverags.append(leverage)
            positions = positions.append(executed_pos)
            previous_pos = executed_pos

        positions['benchmark_weight'] = total_data['weight'].values
        positions['dx'] = total_data.dx.values

        trade_dates = positions.trade_date.unique()
        ret_df = pd.DataFrame(
            {
                'returns': rets,
                'turn_over': turn_overs,
                'leverage': leverags
            },
            index=trade_dates)

        index_return = self.engine.fetch_dx_return_index_range(
            self.running_setting.benchmark,
            dates=self.running_setting.dates,
            horizon=self.running_setting.horizon,
            offset=1).set_index('trade_date')
        ret_df['benchmark_returns'] = index_return['dx']
        ret_df.loc[advanceDateByCalendar('china.sse', ret_df.index[-1],
                                         self.running_setting.freq)] = 0.
        ret_df = ret_df.shift(1)
        ret_df.iloc[0] = 0.
        ret_df['excess_return'] = ret_df[
            'returns'] - ret_df['benchmark_returns'] * ret_df['leverage']

        return ret_df, positions

    @staticmethod
    def _generate_sec_cov(current_data, risk_cov):
        risk_exposure = current_data[all_styles].values
        risk_cov = risk_cov[all_styles].values
        special_risk = current_data['srisk'].values
        sec_cov = risk_exposure @ risk_cov @ risk_exposure.T / 10000 + np.diag(
            special_risk**2) / 10000
        return sec_cov

    def _calculate_pos(self, er, data, constraints, benchmark_w, lbound,
                       ubound, **kwargs):
        target_pos, _ = er_portfolio_analysis(
            er,
            industry=data.industry_name.values,
            dx_return=None,
            constraints=constraints,
            detail_analysis=False,
            benchmark=benchmark_w,
            method=self.running_setting.rebalance_method,
            lbound=lbound,
            ubound=ubound,
            current_position=kwargs.get('current_position'),
            target_vol=kwargs.get('target_vol'),
            cov=kwargs.get('sec_cov'),
            turn_over_target=kwargs.get('turn_over_target'))
        return target_pos
Example #13
        right_index = bisect.bisect_right(date_label, end)

        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {'x': ne_x, 'code': codes}

    return ret


if __name__ == '__main__':
    engine = SqlEngine(
        'postgresql+psycopg2://postgres:[email protected]/alpha')
    universe = Universe('zz500', ['ashare_ex'])
    neutralized_risk = ['SIZE']
    res = fetch_train_phase(engine, ['EPS', 'CFinc1'],
                            '2017-09-04',
                            '2w',
                            universe,
                            4,
                            warm_start=1,
                            neutralized_risk=neutralized_risk)

    print(res)

    res = fetch_predict_phase(engine, ['EPS', 'CFinc1'],
                              '2017-09-04',
                              '2w',