Beispiel #1
0
    def __init__(self, **kwargs):
        """Initialize the data center: load minute bars chunk by chunk over
        the schedule and optionally load the benchmark series.

        Expected kwargs: logger, symbolList, startDate, endDate, freq,
        benchmark.

        Raises:
            ValueError: when no valid data exists anywhere in the range.
        """
        super(DXDataCenter, self).__init__(kwargs['logger'],
                                           kwargs['symbolList'])
        self.fields = [
            'productID', 'instrumentID', 'tradingDate', 'tradingTime',
            'openPrice', 'highPrice', 'lowPrice', 'closePrice', 'volume',
            'multiplier', 'openInterest'
        ]
        self.startDate = kwargs['startDate']
        self.endDate = kwargs['endDate']
        self._freq = kwargs['freq']
        # Roll the base date back to the preceding SSE business day.
        self.baseDate = adjustDateByCalendar('China.SSE', self.endDate,
                                             BizDayConventions.Preceding)

        # A disabled cache forces a refresh of the underlying data.
        self.forceUpdate = not Settings.usingCache

        # Chunk the loading window: monthly chunks for 5-minute bars,
        # weekly for 1-minute bars, otherwise the whole range at once.
        if self._freq == FreqType.MIN5:
            self.loadSchedule = makeSchedule(self.startDate, self.endDate,
                                             '1m')
        elif self._freq == FreqType.MIN1:
            self.loadSchedule = makeSchedule(self.startDate, self.endDate,
                                             '7d')
        else:
            self.loadSchedule = [self.startDate, self.endDate]

        self.schCurrEnd = 1

        # Walk the schedule until the first chunk that yields any data.
        while True:
            self._getMinutesBars(
                startDate=self.loadSchedule[self.schCurrEnd -
                                            1].strftime("%Y-%m-%d"),
                endDate=self.loadSchedule[self.schCurrEnd].strftime(
                    "%Y-%m-%d"),
                freq=self._freq)
            if self.symbolList:
                break
            self.logger.warning(
                'There is no any valid data in the back-testing data range ({0} - {1})'
                .format(self.loadSchedule[self.schCurrEnd - 1],
                        self.loadSchedule[self.schCurrEnd]))
            if self.schCurrEnd == len(self.loadSchedule) - 1:
                break
            self.schCurrEnd += 1

        if not self.symbolList:
            raise ValueError(
                'There is no any valid data in the back-testing whole data range'
            )

        if kwargs['benchmark']:
            self._getBenchmarkData(kwargs['benchmark'],
                                   self.startDate.strftime("%Y-%m-%d"),
                                   self.endDate.strftime("%Y-%m-%d"))
    def __init__(self, **kwargs):
        """Initialize the data center from keyword configuration.

        Expected kwargs: logger, symbolList, startDate, endDate, freq,
        benchmark.

        Raises:
            ValueError: when no chunk of the schedule yields any data.
        """
        super(DXDataCenter, self).__init__(kwargs['logger'], kwargs['symbolList'])
        self.fields = ['productID',
                       'instrumentID',
                       'tradingDate',
                       'tradingTime',
                       'openPrice',
                       'highPrice',
                       'lowPrice',
                       'closePrice',
                       'volume',
                       'multiplier',
                       'openInterest']
        self.startDate = kwargs['startDate']
        self.endDate = kwargs['endDate']
        self._freq = kwargs['freq']
        # Roll the base date back to the preceding SSE business day.
        self.baseDate = adjustDateByCalendar('China.SSE', self.endDate, BizDayConventions.Preceding)

        # A disabled cache forces a refresh of the underlying data.
        self.forceUpdate = not Settings.usingCache

        # Chunk the loading window according to bar frequency.
        if self._freq == FreqType.MIN5:
            self.loadSchedule = makeSchedule(self.startDate, self.endDate, '1m')
        elif self._freq == FreqType.MIN1:
            self.loadSchedule = makeSchedule(self.startDate, self.endDate, '7d')
        else:
            self.loadSchedule = [self.startDate, self.endDate]

        self.schCurrEnd = 1

        # Walk the schedule until a chunk produces data or it is exhausted.
        while True:
            self._getMinutesBars(startDate=self.loadSchedule[self.schCurrEnd - 1].strftime("%Y-%m-%d"),
                                 endDate=self.loadSchedule[self.schCurrEnd].strftime("%Y-%m-%d"),
                                 freq=self._freq)
            if self.symbolList:
                break
            self.logger.warning('There is no any valid data in the back-testing data range ({0} - {1})' \
                                .format(self.loadSchedule[self.schCurrEnd - 1], self.loadSchedule[self.schCurrEnd]))
            if self.schCurrEnd == len(self.loadSchedule) - 1:
                break
            self.schCurrEnd += 1

        if not self.symbolList:
            raise ValueError('There is no any valid data in the back-testing whole data range')

        if kwargs['benchmark']:
            self._getBenchmarkData(kwargs['benchmark'],
                                   self.startDate.strftime("%Y-%m-%d"),
                                   self.endDate.strftime("%Y-%m-%d"))
Beispiel #3
0
 def __init__(self,
              alpha_model,
              data_meta,
              universe,
              start_date,
              end_date,
              freq,
              benchmark=905,
              industry_cat='sw_adj',
              industry_level=1,
              dask_client=None):
     """Store the strategy configuration and pre-compute rebalance dates."""
     self.alpha_model = alpha_model
     self.data_meta = data_meta
     self.universe = universe
     self.benchmark = benchmark
     # Rebalance dates on the SSE calendar, normalised to ISO date strings.
     schedule = makeSchedule(start_date, end_date, freq, 'china.sse')
     self.dates = [day.strftime('%Y-%m-%d') for day in schedule]
     self.freq = freq
     self.horizon = map_freq(freq)
     self.industry_cat = industry_cat
     self.industry_level = industry_level
     self.engine = SqlEngine(self.data_meta.data_source)
     self.dask_client = dask_client
     # Result holders populated lazily by later pipeline steps.
     self.total_data = None
     self.index_return = None
     self.risk_models = None
     self.alpha_models = None
Beispiel #4
0
 def __init__(self,
              universe,
              start_date,
              end_date,
              freq,
              benchmark=905,
              weights_bandwidth=0.02,
              industry_cat='sw_adj',
              industry_level=1,
              rebalance_method='risk_neutral',
              bounds=None,
              **kwargs):
     """Store the back-test configuration and build the rebalance calendar."""
     self.universe = universe
     self.benchmark = benchmark
     self.weights_bandwidth = weights_bandwidth
     # Rebalance dates on the SSE calendar, normalised to ISO date strings.
     schedule = makeSchedule(start_date, end_date, freq, 'china.sse')
     self.dates = [day.strftime('%Y-%m-%d') for day in schedule]
     self.freq = freq
     self.horizon = map_freq(freq)
     self.executor = NaiveExecutor()
     self.industry_cat = industry_cat
     self.industry_level = industry_level
     self.rebalance_method = rebalance_method
     self.bounds = bounds
     # Any remaining keyword options are forwarded to the rebalancer.
     self.more_opts = kwargs
Beispiel #5
0
 def factor(self,
            factor_category,
            begin_date,
            end_date,
            factor_name=None,
            freq=None):
     """Fetch factor rows from the table named ``factor_category``.

     When ``factor_name`` is None, delegate to ``self.base`` for all
     columns; otherwise select only id/security_code/trade_date plus the
     requested factor columns.  With ``freq`` set, restrict rows to the
     rebalance schedule; otherwise use the closed date interval.
     """
     # Hoisted: both branches need the same table lookup (was duplicated).
     table = getattr(importlib.import_module('data.factor_model'),
                     factor_category)
     if factor_name is None:
         return self.base(table, begin_date, end_date, freq)

     key_sets = ['id', 'security_code', 'trade_date'] + factor_name
     db_columns = [table.__dict__[key] for key in key_sets]
     if freq is None:
         query = select(db_columns).where(
             and_(
                 table.trade_date >= begin_date,
                 table.trade_date <= end_date,
             ))
     else:
         rebalance_dates = makeSchedule(begin_date, end_date, freq,
                                        'china.sse',
                                        BizDayConventions.Preceding)
         query = select(db_columns).where(
             table.trade_date.in_(rebalance_dates))
     # 'id' is bookkeeping only; drop it from the returned frame.
     return pd.read_sql(query, self._engine.sql_engine()).drop(['id'],
                                                               axis=1)
Beispiel #6
0
def factor_combination(engine, factors, universe_name_list, start_date,
                       end_date, freq):
    """Combine factor values with risk exposures, negative market value,
    forward returns and industry labels into one aligned DataFrame.

    The union of the named universes is used.  Rows containing any missing
    value are dropped.
    """
    # Build the union of all requested universes.
    universe = None
    for name in universe_name_list:
        if universe is None:
            universe = Universe(name)
        else:
            universe += Universe(name)

    dates = makeSchedule(start_date, end_date, freq, calendar='china.sse')
    factor_negMkt = engine.fetch_factor_range(universe,
                                              "negMarketValue",
                                              dates=dates)
    risk_cov, risk_factors = engine.fetch_risk_model_range(universe,
                                                           dates=dates)
    dx_returns = engine.fetch_dx_return_range(universe,
                                              dates=dates,
                                              horizon=map_freq(freq))

    # Inner-join everything on (trade_date, code).
    total_data = pd.merge(factors, risk_factors, on=['trade_date', 'code'])
    total_data = pd.merge(total_data, factor_negMkt, on=['trade_date', 'code'])
    total_data = pd.merge(total_data, dx_returns, on=['trade_date', 'code'])

    industry_category = engine.fetch_industry_range(universe, dates=dates)
    # Single dropna suffices — the original dropped NA rows twice in a row.
    total_data = pd.merge(total_data,
                          industry_category,
                          on=['trade_date', 'code']).dropna()
    return total_data
Beispiel #7
0
    def test_sql_engine_fetch_dx_return_with_universe_adjustment(self):
        """Cross-check fetch_dx_return_range against a direct market query."""
        begin = advanceDateByCalendar('china.sse', '2017-01-26', '-6m')
        ref_dates = makeSchedule(begin, '2017-01-26', '60b', 'china.sse')

        universe = Universe('zz500')
        dx_return = self.engine.fetch_dx_return_range(universe,
                                                      dates=ref_dates,
                                                      horizon=4,
                                                      offset=1)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)

        for ref_date, members in codes.groupby('trade_date'):
            # horizon=4 / offset=1 window: business days 2..6 after ref_date.
            window_start = advanceDateByCalendar('china.sse', ref_date, '2b')
            window_end = advanceDateByCalendar('china.sse', ref_date, '6b')

            query = select([Market.code, Market.chgPct]).where(
                and_(Market.trade_date.between(window_start, window_end),
                     Market.code.in_(members.code.unique().tolist())))
            market_df = pd.read_sql(query, con=self.engine.engine)

            # Compound the daily percentage changes via log returns.
            expected = market_df.groupby('code').apply(
                lambda x: np.log(1. + x).sum())
            computed = dx_return[dx_return.trade_date == ref_date]
            np.testing.assert_array_almost_equal(computed.dx.values,
                                                 expected.chgPct.values)
Beispiel #8
0
    def test_sql_engine_fetch_factor_range_forward(self):
        """Forward factor fetch must equal the factor at the next schedule date."""
        begin = advanceDateByCalendar('china.sse', self.ref_date, '-6m')
        ref_dates = makeSchedule(begin, self.ref_date, '60b', 'china.sse')
        # Append one extra period so every date has a "forward" counterpart.
        extra_date = advanceDateByCalendar('china.sse', ref_dates[-1],
                                           '60b').strftime('%Y-%m-%d')
        ref_dates = ref_dates + [extra_date]
        universe = Universe('zz500') + Universe('zz1000')
        factor = 'ROE'

        factor_data = self.engine.fetch_factor_range_forward(universe,
                                                             factor,
                                                             dates=ref_dates)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates[:-1])

        for ref_date, members in codes.groupby('trade_date'):
            forward_date = advanceDateByCalendar(
                'china.sse', ref_date, '60b').strftime('%Y-%m-%d')
            query = select([Uqer.code, Uqer.ROE]).where(
                and_(Uqer.trade_date == forward_date,
                     Uqer.code.in_(members.code.unique().tolist())))
            expected = pd.read_sql(query, con=self.engine.engine)

            computed = factor_data[factor_data.trade_date == ref_date]
            computed.set_index('code', inplace=True)
            # Align on the codes returned by the direct query.
            computed = computed.loc[expected.code]
            np.testing.assert_array_almost_equal(computed.dx.values,
                                                 expected.ROE.values)
Beispiel #9
0
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0):
    """Load and align factor, forward-return, industry and benchmark data
    over ``[start_date, end_date]`` at the given rebalance ``frequency``.

    Returns a tuple ``(dates, return_df, factor_df)`` where both frames are
    aligned on ('trade_date', 'code').
    """
    # Shift the start back by `warm_start` periods so the first usable date
    # still has enough history behind it.
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date,
                                           p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)

    # Normalise the schedule to ISO date strings.
    dates = [d.strftime('%Y-%m-%d') for d in dates]

    # Forward-return horizon implied by the rebalance frequency.
    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(
                                              ['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")
    return_df = engine.fetch_dx_return_range(universe,
                                             dates=dates,
                                             horizon=horizon)
    alpha_logger.info("return data loading finished")
    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    # Inner-join factors with returns (dropping incomplete rows), then
    # attach benchmark weights (left join: missing weight -> 0) and
    # industry labels.
    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)

    return dates, df[['trade_date', 'code', 'dx']], df[[
        'trade_date', 'code', 'weight', 'isOpen', 'industry_code', 'industry'
    ] + transformer.names]
Beispiel #10
0
 def base(self, table_name, begin_date, end_date, freq=None):
     """Query every column of *table_name* between two dates.

     Without ``freq`` the closed interval [begin_date, end_date] is used;
     with ``freq`` only the rebalance schedule dates are kept.
     """
     if freq is None:
         condition = and_(
             table_name.trade_date >= begin_date,
             table_name.trade_date <= end_date,
         )
     else:
         schedule = makeSchedule(begin_date, end_date, freq,
                                 'china.sse',
                                 BizDayConventions.Preceding)
         condition = table_name.trade_date.in_(schedule)
     query = select([table_name]).where(condition)
     return pd.read_sql(query, self._engine.sql_engine())
Beispiel #11
0
 def index(self, benchmark, begin_date, end_date, freq=None):
     """Query index rows for the given benchmark symbols.

     Without ``freq`` the closed interval [begin_date, end_date] is used;
     with ``freq`` only the rebalance schedule dates are kept.  The
     bookkeeping 'id' column is dropped from the result.
     """
     table = importlib.import_module('data.rl_model').Index
     if freq is None:
         date_filter = and_(table.trade_date >= begin_date,
                            table.trade_date <= end_date)
     else:
         schedule = makeSchedule(begin_date, end_date, freq,
                                 'china.sse',
                                 BizDayConventions.Preceding)
         date_filter = table.trade_date.in_(schedule)
     query = select([table]).where(
         and_(date_filter, table.isymbol.in_(benchmark)))
     return pd.read_sql(query, self._engine.sql_engine()).drop(['id'],
                                                               axis=1)
Beispiel #12
0
    def test_sql_engine_fetch_codes_range(self):
        """fetch_codes_range must match a direct universe-table query."""
        begin = advanceDateByCalendar('china.sse', self.ref_date, '-6m')
        ref_dates = makeSchedule(begin, self.ref_date, '60b', 'china.sse')
        universe = Universe('zz500') + Universe('zz1000')
        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)

        query = select([UniverseTable.trade_date, UniverseTable.code]).where(
            and_(UniverseTable.trade_date.in_(ref_dates),
                 or_(UniverseTable.zz500 == 1, UniverseTable.zz1000 == 1)))
        expected_df = pd.read_sql(query,
                                  con=self.engine.engine).sort_values('code')

        for ref_date in ref_dates:
            calculated_codes = sorted(
                codes[codes.trade_date == ref_date].code.values)
            expected_codes = sorted(
                expected_df[expected_df.trade_date == ref_date].code.values)
            self.assertListEqual(calculated_codes, expected_codes)
Beispiel #13
0
    def test_sql_engine_fetch_benchmark_range(self):
        """Benchmark weights from the engine must match the raw index table."""
        begin = advanceDateByCalendar('china.sse', self.ref_date, '-9m')
        ref_dates = makeSchedule(begin, self.ref_date, '60b', 'china.sse')
        benchmark = 906
        index_data = self.engine.fetch_benchmark_range(benchmark,
                                                       dates=ref_dates)

        # The raw table stores weights in percent; normalise to fractions.
        query = select([
            IndexComponent.trade_date, IndexComponent.code,
            (IndexComponent.weight / 100.).label('weight')
        ]).where(
            and_(IndexComponent.trade_date.in_(ref_dates),
                 IndexComponent.indexCode == benchmark))
        expected_df = pd.read_sql(query, con=self.engine.engine)

        for ref_date in ref_dates:
            computed = index_data[index_data.trade_date == ref_date]
            expected = expected_df[expected_df.trade_date == ref_date]
            np.testing.assert_array_almost_equal(computed.weight.values,
                                                 expected.weight.values)
Beispiel #14
0
    def fetch_factors(self, begin_date, end_date, freq=None):
        """Fetch all factor rows from ``self._table`` between two dates.

        Without ``freq`` the closed interval [begin_date, end_date] is used;
        with ``freq`` only the rebalance schedule dates are kept.
        Bookkeeping columns ('id', 'creat_time', 'update_time') are dropped
        when present.
        """
        if freq is None:
            query = select([self._table]).where(
                and_(
                    self._table.trade_date >= begin_date,
                    self._table.trade_date <= end_date,
                ))
        else:
            rebalance_dates = makeSchedule(begin_date, end_date, freq,
                                           'china.sse',
                                           BizDayConventions.Preceding)
            # Single condition: the and_() wrapper was redundant.
            query = select([self._table]).where(
                self._table.trade_date.in_(rebalance_dates))

        data = pd.read_sql(query, self._engine)

        # Drop bookkeeping columns in one pass; ignore any that are absent.
        return data.drop(columns=['id', 'creat_time', 'update_time'],
                         errors='ignore')
Beispiel #15
0
    def test_sql_engine_fetch_factor_range(self):
        """fetch_factor_range must agree with a direct per-date factor query."""
        begin = advanceDateByCalendar('china.sse', self.ref_date, '-6m')
        ref_dates = makeSchedule(begin, self.ref_date, '60b', 'china.sse')
        universe = Universe('zz500') + Universe('zz1000')
        factor = 'ROE'

        factor_data = self.engine.fetch_factor_range(universe,
                                                     factor,
                                                     dates=ref_dates)

        codes = self.engine.fetch_codes_range(universe, dates=ref_dates)

        for ref_date, members in codes.groupby('trade_date'):
            query = select([Uqer.code, Uqer.ROE]).where(
                and_(Uqer.trade_date == ref_date,
                     Uqer.code.in_(members.code.unique().tolist())))
            expected = pd.read_sql(query, con=self.engine.engine)

            computed = factor_data[factor_data.trade_date == ref_date]
            np.testing.assert_array_almost_equal(computed.ROE.values,
                                                 expected.ROE.values)
Beispiel #16
0
    def test_sql_engine_fetch_dx_return_index_range(self):
        """Index forward returns must equal compounded daily index moves."""
        begin = advanceDateByCalendar('china.sse', self.ref_date, '-6m')
        ref_dates = makeSchedule(begin, self.ref_date, '60b', 'china.sse')
        index_code = 906

        dx_return = self.engine.fetch_dx_return_index_range(index_code,
                                                            dates=ref_dates,
                                                            horizon=4,
                                                            offset=1)

        for ref_date in ref_dates:
            # horizon=4 / offset=1 window: business days 2..6 after ref_date.
            window_start = advanceDateByCalendar('china.sse', ref_date, '2b')
            window_end = advanceDateByCalendar('china.sse', ref_date, '6b')

            query = select([IndexMarket.indexCode, IndexMarket.chgPct]).where(
                and_(IndexMarket.trade_date.between(window_start, window_end),
                     IndexMarket.indexCode == index_code))
            market_df = pd.read_sql(query, con=self.engine.engine)

            # Compound the daily percentage changes via log returns.
            expected = market_df.groupby('indexCode').apply(
                lambda x: np.log(1. + x).sum())
            computed = dx_return[dx_return.trade_date == ref_date]
            np.testing.assert_array_almost_equal(computed.dx.values,
                                                 expected.chgPct.values)
Beispiel #17
0
def fetch_predict_phase(engine,
                        alpha_factors: Iterable[object],
                        ref_date,
                        frequency,
                        universe,
                        batch,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0):
    """Assemble the processed factor cross-section used for prediction at
    ``ref_date``.

    Returns a dict with 'x_names' (the factor names) and 'predict'
    ({'x': processed factor matrix or None, 'code': codes or None}); the
    'predict' entries are None when no data exists at ``ref_date`` itself.
    """
    transformer = Transformer(alpha_factors)

    # Look back (warm_start + batch) periods from ref_date.
    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).dropna()

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe,
                                                dates=dates,
                                                risk_model=risk_model)[1]
        # Risk factors that are also alpha factors are already in the frame.
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] +
                          used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = factor_df.copy()
        risk_exp = None
    # BUG FIX: x_values was previously assigned only inside the
    # neutralized_risk branch, raising NameError below whenever no risk
    # neutralization was requested.
    x_values = train_x[names].values.astype(float)

    date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch]

        # Slice the rows belonging to the last `batch` dates.
        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        # Keep only the rows belonging to the final date (ref_date itself).
        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)

        ne_x = ne_x[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)

        codes = train_x.code.values[left_index:right_index]
    else:
        # No factor rows at ref_date: nothing to predict on.
        ne_x = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {'x': ne_x, 'code': codes}

    return ret
Beispiel #18
0
def fetch_train_phase(engine,
                      alpha_factors: Iterable[object],
                      ref_date,
                      frequency,
                      universe,
                      batch,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0) -> dict:
    """Assemble the training window (x, y) ending strictly before ``ref_date``.

    Returns a dict with 'x_names' (the factor names) and 'train'
    ({'x': processed factor matrix, 'y': processed forward returns}).
    """
    transformer = Transformer(alpha_factors)

    # Look back (warm_start + batch + 1) periods: one extra period so the
    # training window can end strictly before ref_date.
    p = Period(frequency)
    p = Period(length=-(warm_start + batch + 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = _map_horizon(frequency)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)
    return_df = engine.fetch_dx_return_range(universe,
                                             dates=dates,
                                             horizon=horizon)

    # Inner-join factors with forward returns; drop incomplete rows.
    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()

    return_df, factor_df = df[['trade_date', 'code',
                               'dx']], df[['trade_date', 'code', 'isOpen'] +
                                          transformer.names]

    # _merge_df attaches risk exposures (when requested) and flattens the
    # frames into aligned arrays — see its definition for the exact contract.
    return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
        _merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)

    # If data exists at ref_date itself, exclude it from training (it is the
    # prediction date); otherwise train up to the last available date.
    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-2]
        start = dates[-batch - 1]
    else:
        end = dates[-1]
        start = dates[-batch]

    # Boolean mask selecting rows whose date falls inside [start, end].
    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    # Apply the same preprocessing pipeline to features and targets.
    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {'x': ne_x, 'y': ne_y}

    return ret
Beispiel #19
0

if __name__ == "__main__":
    from PyFin.api import makeSchedule
    # db_url = "mysql+mysqldb://reader:Reader#[email protected]:13317/vision?charset=utf8"
    db_url = "mysql+mysqldb://dxrw:[email protected]:13317/dxtest?charset=utf8"

    sql_engine = SqlEngine(db_url=db_url, factor_tables=["factor_momentum"])

    universe = Universe("hs300")
    start_date = '2020-01-02'
    end_date = '2020-02-21'
    frequency = "10b"
    benchmark = 300
    factors = ["EMA5D", "EMV6D"]
    ref_dates = makeSchedule(start_date, end_date, frequency, 'china.sse')
    print(ref_dates)
    df = sql_engine.fetch_factor("2020-02-21",
                                 factors=factors,
                                 codes=["2010031963"])
    print(df)
    df = sql_engine.fetch_factor_range(universe=universe,
                                       dates=ref_dates,
                                       factors=factors)
    print(df)
    df = sql_engine.fetch_codes_range(start_date=start_date,
                                      end_date=end_date,
                                      universe=Universe("hs300"))
    print(df)
    df = sql_engine.fetch_dx_return("2020-10-09",
                                    codes=["2010031963"],
Beispiel #20
0
def prepare_data(engine: SqlEngine,
                 factors: Union[Transformer, Iterable[object]],
                 start_date: str,
                 end_date: str,
                 frequency: str,
                 universe: Universe,
                 benchmark: int,
                 warm_start: int = 0,
                 fit_target: Union[Transformer, object] = None):
    """Load and align factor, fit-target, industry and benchmark data.

    When ``fit_target`` is None the target is the dx forward return;
    otherwise the forward value of ``fit_target`` itself is used.  Returns
    ``(dates, target_df, factor_df)`` aligned on ('trade_date', 'code').
    """
    # Shift the start back by `warm_start` periods so the first usable date
    # still has enough history behind it.
    if warm_start > 0:
        p = Period(frequency)
        p = Period(length=-warm_start * p.length(), units=p.units())
        start_date = advanceDateByCalendar('china.sse', start_date,
                                           p).strftime('%Y-%m-%d')

    dates = makeSchedule(start_date,
                         end_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Forward)

    # Normalise the schedule to ISO date strings.
    dates = [d.strftime('%Y-%m-%d') for d in dates]

    # Forward-return horizon implied by the rebalance frequency.
    horizon = map_freq(frequency)

    if isinstance(factors, Transformer):
        transformer = factors
    else:
        transformer = Transformer(factors)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).sort_values(
                                              ['trade_date', 'code'])
    alpha_logger.info("factor data loading finished")

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        # Need one extra period so the last date has a forward observation.
        one_more_date = advanceDateByCalendar('china.sse', dates[-1],
                                              frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates +
                                                      [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        # Forward-fill gaps per code so every date carries the latest value.
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))
    alpha_logger.info("fit target data loading finished")

    industry_df = engine.fetch_industry_range(universe, dates=dates)
    alpha_logger.info("industry data loading finished")
    benchmark_df = engine.fetch_benchmark_range(benchmark, dates=dates)
    alpha_logger.info("benchmark data loading finished")

    # Inner-join factors with targets (drop incomplete rows), then attach
    # benchmark weights (left join: missing weight -> 0) and industries.
    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()
    df = pd.merge(df, benchmark_df, on=['trade_date', 'code'], how='left')
    df = pd.merge(df, industry_df, on=['trade_date', 'code'])
    df['weight'] = df['weight'].fillna(0.)
    df.dropna(inplace=True)

    return dates, df[[
        'trade_date', 'code', 'dx'
    ]], df[['trade_date', 'code', 'weight', 'industry_code', 'industry'] +
           transformer.names]
Beispiel #21
0
def fetch_predict_phase(engine,
                        alpha_factors: Union[Transformer, Iterable[object]],
                        ref_date,
                        frequency,
                        universe,
                        batch=1,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0,
                        fillna: str = None,
                        fit_target: Union[Transformer, object] = None):
    """Assemble the cross-sectional data needed to run a model prediction on
    ``ref_date``.

    Factor exposures (and optionally risk exposures) are fetched over a
    backward-generated schedule ending at ``ref_date``, neutralized /
    processed, and the slice belonging to ``ref_date`` itself is returned.

    :param engine: data engine exposing ``fetch_factor_range``,
        ``fetch_dx_return_range``, ``fetch_factor_range_forward`` and
        ``fetch_risk_model_range``.
    :param alpha_factors: factor expressions, either already wrapped in a
        ``Transformer`` or an iterable to wrap.
    :param ref_date: prediction date as a ``'%Y-%m-%d'`` string.
    :param frequency: rebalance frequency string (e.g. ``'1w'``), also used
        to derive the return horizon.
    :param universe: security universe passed through to the engine.
    :param batch: number of scheduled dates kept in the look-back window.
    :param neutralized_risk: risk factor names to neutralize against, or None.
    :param risk_model: risk model flavour used when fetching risk exposures.
    :param pre_process: processing steps applied before neutralization.
    :param post_process: processing steps applied after neutralization.
    :param warm_start: extra periods added in front of the schedule.
    :param fillna: if truthy, missing factor values are filled with the
        cross-sectional median per trade date instead of being dropped.
    :param fit_target: optional explicit fit target; when None the dx return
        over ``horizon`` is used.
    :return: dict with ``x_names`` and a ``predict`` sub-dict holding the
        feature frame ``x``, the security ``code`` array and target ``y``
        (``y`` / ``code`` are None when ``ref_date`` is not the last
        scheduled trade date).
    """
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    # Negative period: walk (warm_start + batch - 1) periods back from ref_date.
    p = Period(frequency)
    p = Period(length=-(warm_start + batch - 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)

    if fillna:
        # Impute per trade date with the cross-sectional median, then drop
        # whatever is still missing (e.g. dates with no data at all).
        factor_df = factor_df.groupby('trade_date').apply(
            lambda x: x.fillna(x.median())).reset_index(drop=True).dropna()
    else:
        factor_df = factor_df.dropna()

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        # Need one extra period beyond the schedule so the forward values on
        # the last scheduled date are defined; pad-fill per security.
        one_more_date = advanceDateByCalendar('china.sse', dates[-1],
                                              frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates +
                                                      [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe,
                                                dates=dates,
                                                risk_model=risk_model)[1]
        # Risk factors that double as alpha factors are already in factor_df.
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] +
                          used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        train_x = pd.merge(train_x,
                           target_df,
                           on=['trade_date', 'code'],
                           how='left')
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = pd.merge(factor_df,
                           target_df,
                           on=['trade_date', 'code'],
                           how='left')
        risk_exp = None

    # The target ('dx', last column) may legitimately be NaN on ref_date;
    # only require the feature columns to be complete.
    train_x.dropna(inplace=True, subset=train_x.columns[:-1])
    x_values = train_x[names].values.astype(float)
    y_values = train_x[['dx']].values.astype(float)

    date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        this_raw_y = y_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        ne_y = factor_processing(this_raw_y,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        # Keep only the rows that belong to ref_date itself.
        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)

        ne_x = ne_x[inner_left_index:inner_right_index]
        ne_y = ne_y[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)

        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        ne_y = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {
        'x': pd.DataFrame(ne_x, columns=transformer.names),
        'code': codes,
        # Bug fix: ne_y is None when ref_date has no data in the schedule;
        # calling .flatten() unconditionally raised AttributeError there.
        'y': ne_y.flatten() if ne_y is not None else None
    }

    return ret
Beispiel #22
0
def fetch_train_phase(engine,
                      alpha_factors: Union[Transformer, Iterable[object]],
                      ref_date,
                      frequency,
                      universe,
                      batch=1,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0,
                      fit_target: Union[Transformer, object] = None) -> dict:
    """Build the neutralized factor / target training panel ending at
    ``ref_date``.

    A backward schedule of ``warm_start + batch`` periods is generated, factor
    exposures and the fit target are fetched and merged, risk exposures are
    attached via ``_merge_df``, and the most recent ``batch`` dates strictly
    before the prediction date are processed into training arrays.

    :param engine: data engine providing the ``fetch_*_range`` methods.
    :param alpha_factors: factor expressions, a ``Transformer`` or an iterable
        to wrap into one.
    :param ref_date: training reference date as a ``'%Y-%m-%d'`` string.
    :param frequency: rebalance frequency, also fixes the return horizon.
    :param universe: security universe passed through to the engine.
    :param batch: number of scheduled dates included in the training window.
    :param neutralized_risk: risk factor names to neutralize against, or None.
    :param risk_model: risk model flavour for the risk exposure fetch.
    :param pre_process: processing steps applied before neutralization.
    :param post_process: processing steps applied after neutralization.
    :param warm_start: extra periods prepended to the schedule.
    :param fit_target: explicit fit target; dx return over the horizon when
        None.
    :return: dict with ``x_names`` and a ``train`` sub-dict holding ``x``
        (feature frame), ``y`` (processed target) and ``code``.
    :raises ValueError: when ``ref_date`` is the last scheduled date but no
        earlier date exists to train on.
    """
    transformer = alpha_factors if isinstance(alpha_factors, Transformer) \
        else Transformer(alpha_factors)

    # Negative period: step (warm_start + batch) periods back from ref_date.
    base_period = Period(frequency)
    look_back = Period(length=-(warm_start + batch) * base_period.length(),
                       units=base_period.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, look_back,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        # One extra period so forward values exist on the final scheduled
        # date; pad-fill gaps per security afterwards.
        one_more_date = advanceDateByCalendar('china.sse', dates[-1],
                                              frequency)
        target_df = engine.fetch_factor_range_forward(
            universe, factors=fit_target, dates=dates + [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))

    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()

    target_df = df[['trade_date', 'code', 'dx']]
    factor_df = df[['trade_date', 'code'] + transformer.names]

    target_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
        _merge_df(engine, transformer.names, factor_df, target_df, universe,
                  dates, risk_model, neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        # ref_date itself is held out: train only on strictly earlier dates.
        pyFinAssert(
            len(dates) >= 2, ValueError,
            "No previous data for training for the date {0}".format(ref_date))
        end = dates[-2]
        start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
    else:
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

    window = (date_label >= start) & (date_label <= end)
    raw_x = x_values[window]
    raw_y = y_values[window]
    window_codes = codes[window]
    window_risk = risk_exp[window] if risk_exp is not None else None

    ne_x = factor_processing(raw_x,
                             pre_process=pre_process,
                             risk_factors=window_risk,
                             post_process=post_process)
    ne_y = factor_processing(raw_y,
                             pre_process=pre_process,
                             risk_factors=window_risk,
                             post_process=post_process)

    return {
        'x_names': transformer.names,
        'train': {
            'x': pd.DataFrame(ne_x, columns=transformer.names),
            'y': ne_y,
            'code': window_codes
        }
    }