Example #1
def batch_processing(x_values, y_values, groups, group_label, batch, risk_exp,
                     pre_process, post_process):
    train_x_buckets = {}
    train_y_buckets = {}
    predict_x_buckets = {}
    predict_y_buckets = {}

    for i, start in enumerate(groups[:-batch]):
        end = groups[i + batch]

        left_index = bisect.bisect_left(group_label, start)
        right_index = bisect.bisect_left(group_label, end)

        this_raw_x = x_values[left_index:right_index]
        this_raw_y = y_values[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        train_x_buckets[end] = factor_processing(this_raw_x,
                                                 pre_process=pre_process,
                                                 risk_factors=this_risk_exp,
                                                 post_process=post_process)

        train_y_buckets[end] = factor_processing(this_raw_y,
                                                 pre_process=pre_process,
                                                 risk_factors=this_risk_exp,
                                                 post_process=post_process)

        left_index = bisect.bisect_right(group_label, start)
        right_index = bisect.bisect_right(group_label, end)

        sub_dates = group_label[left_index:right_index]
        this_raw_x = x_values[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)
        predict_x_buckets[end] = ne_x[inner_left_index:inner_right_index]

        this_raw_y = y_values[left_index:right_index]
        if len(this_raw_y) > 0:
            ne_y = factor_processing(this_raw_y,
                                     pre_process=pre_process,
                                     risk_factors=this_risk_exp,
                                     post_process=post_process)
            predict_y_buckets[end] = ne_y[inner_left_index:inner_right_index]

    return train_x_buckets, train_y_buckets, predict_x_buckets, predict_y_buckets
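
The four buckets are keyed by the prediction date end, so a walk-forward loop falls out naturally. A minimal consumption sketch, assuming a scikit-learn style regressor (the model choice and the scoring step are illustrative, not part of the original code):

from sklearn.linear_model import LinearRegression

# Hypothetical walk-forward loop over the buckets returned above: fit on the
# trailing training window, score on the same date's prediction bucket.
scores = {}
for end_date, predict_x in predict_x_buckets.items():
    model = LinearRegression()
    model.fit(train_x_buckets[end_date], train_y_buckets[end_date].ravel())
    if end_date in predict_y_buckets:
        scores[end_date] = model.score(predict_x,
                                       predict_y_buckets[end_date].ravel())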
Example #2
def cs_impl(ref_date, factor_data, factor_name, risk_exposure, constraint_risk,
            industry_matrix, dx_returns):
    total_data = pd.merge(factor_data, risk_exposure, on='code')
    total_data = pd.merge(total_data, industry_matrix, on='code')
    total_data = total_data.replace([np.inf, -np.inf], np.nan).dropna()

    if len(total_data) < 0.33 * len(factor_data):
        alpha_logger.warning(
            f"valid data point({len(total_data)}) "
            f"is less than 33% of the total sample ({len(factor_data)}). Omit this run"
        )
        return np.nan, np.nan, np.nan

    total_risk_exp = total_data[constraint_risk]

    er = total_data[[factor_name]].values.astype(float)
    er = factor_processing(er, [winsorize_normal, standardize],
                           total_risk_exp.values, [standardize]).flatten()
    industry = total_data.industry_name.values

    codes = total_data.code.tolist()
    target_pos = pd.DataFrame({
        'code': codes,
        'weight': er,
        'industry': industry
    })
    target_pos['weight'] = target_pos['weight'] / target_pos['weight'].abs().sum()
    target_pos = pd.merge(target_pos, dx_returns, on=['code'])
    target_pos = pd.merge(target_pos,
                          total_data[['code'] + constraint_risk],
                          on=['code'])
    total_risk_exp = target_pos[constraint_risk]
    activate_weight = target_pos['weight'].values
    excess_return = np.exp(target_pos[['dx']].values) - 1.
    excess_return = factor_processing(
        excess_return, [winsorize_normal, standardize], total_risk_exp.values,
        [winsorize_normal, standardize]).flatten()
    port_ret = np.log(activate_weight @ excess_return + 1.)
    ic = np.corrcoef(excess_return, activate_weight)[0, 1]
    x = sm.add_constant(activate_weight)
    results = sm.OLS(excess_return, x).fit()
    t_stats = results.tvalues[1]

    alpha_logger.info(
        f"{ref_date} is finished with {len(target_pos)} stocks for {factor_name}"
    )
    alpha_logger.info(
        f"{ref_date} risk_exposure: "
        f"{np.sum(np.square(target_pos.weight.values @ target_pos[constraint_risk].values))}"
    )
    return port_ret, ic, t_stats
Example #3
    def test_factor_processing(self):
        new_factor = factor_processing(self.raw_factor)
        np.testing.assert_array_almost_equal(new_factor, self.raw_factor)

        new_factor = factor_processing(self.raw_factor,
                                       pre_process=[standardize, winsorize_normal])

        np.testing.assert_array_almost_equal(new_factor, winsorize_normal(standardize(self.raw_factor)))

        new_factor = factor_processing(self.raw_factor,
                                       pre_process=[standardize, winsorize_normal],
                                       risk_factors=self.risk_factor)

        np.testing.assert_array_almost_equal(new_factor, neutralize(self.risk_factor,
                                                                    winsorize_normal(standardize(self.raw_factor))))
Example #4
def cs_impl(ref_date,
            factor_data,
            factor_name,
            risk_exposure,
            constraint_risk,
            industry_matrix,
            dx_returns):
    total_data = pd.merge(factor_data, risk_exposure, on='code')
    total_data = pd.merge(total_data, industry_matrix, on='code').dropna()
    total_risk_exp = total_data[constraint_risk]

    er = total_data[factor_name].values.astype(float)
    er = factor_processing(er, [], total_risk_exp.values, []).flatten()
    industry = total_data.industry_name.values

    codes = total_data.code.tolist()
    target_pos = pd.DataFrame({'code': codes,
                               'weight': er,
                               'industry': industry})
    target_pos['weight'] = target_pos['weight'] / target_pos['weight'].abs().sum()
    target_pos = pd.merge(target_pos, dx_returns, on=['code'])
    target_pos = pd.merge(target_pos, total_data[['code'] + constraint_risk], on=['code'])
    activate_weight = target_pos.weight.values
    excess_return = np.exp(target_pos.dx.values) - 1.
    port_ret = np.log(activate_weight @ excess_return + 1.)
    ic = np.corrcoef(excess_return, activate_weight)[0, 1]
    x = sm.add_constant(activate_weight)
    results = sm.OLS(excess_return, x).fit()
    t_stats = results.tvalues[1]

    alpha_logger.info(f"{ref_date} is finished with {len(target_pos)} stocks for {factor_name}")
    alpha_logger.info(f"{ref_date} risk_exposure: "
                      f"{np.sum(np.square(target_pos.weight.values @ target_pos[constraint_risk].values))}")
    return port_ret, ic, t_stats
Example #5
def factor_analysis(factors: pd.DataFrame,
                    factor_weights: np.ndarray,
                    industry: np.ndarray,
                    d1returns: np.ndarray = None,
                    detail_analysis=True,
                    benchmark: Optional[np.ndarray] = None,
                    risk_exp: Optional[np.ndarray] = None,
                    is_tradable: Optional[np.ndarray] = None,
                    constraints: Optional[Constraints] = None,
                    method='risk_neutral',
                    **kwargs) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    if 'pre_process' in kwargs:
        pre_process = kwargs['pre_process']
        del kwargs['pre_process']
    else:
        pre_process = [winsorize_normal, standardize]

    if 'post_process' in kwargs:
        post_process = kwargs['post_process']
        del kwargs['post_process']
    else:
        post_process = [winsorize_normal, standardize]

    er = factor_processing(factors.values, pre_process, risk_exp,
                           post_process) @ factor_weights

    return er_portfolio_analysis(er, industry, d1returns, constraints,
                                 detail_analysis, benchmark, is_tradable,
                                 method, **kwargs)
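
The two if/else blocks are an inlined dict.pop with a default; an equivalent, more compact form would be:

pre_process = kwargs.pop('pre_process', [winsorize_normal, standardize])
post_process = kwargs.pop('post_process', [winsorize_normal, standardize])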
Example #6
def websim_weighted(factor_df, factor_list):
    total_data = factor_df.copy()
    risk_data = total_data[['code', 'trade_date'] + industry_styles + ['SIZE']]
    forward_returns = total_data[['code', 'trade_date', 'ret']]
    factor_data = total_data[factor_list]

    # equal-weight combination
    ndiff_field = [
        i for i in list(set(total_data.columns)) if i not in factor_list
    ]
    # pre-process the data before combining
    alpha_res = []
    grouped = total_data.groupby(['trade_date'])
    for k, g in grouped:
        ret_preprocess = factor_processing(
            g[factor_list].fillna(0).values,
            pre_process=[winsorize_normal, standardize])
        f = pd.DataFrame(ret_preprocess, columns=factor_list)
        for col in ndiff_field:
            f[col] = g[col].values
        alpha_res.append(f)
    alpha_data = pd.concat(alpha_res)
    alpha_data['combine'] = alpha_data[factor_list].mean(axis=1).values
    weight = Weighted()
    stats = weight.run(alpha_data, risk_data, forward_returns, 'combine')
    if abs(stats['fitness']) > 0.554246 and stats['sharpe'] > 1.243449:
        score = abs(stats['fitness'])
    else:
        score = abs(stats['fitness']) / 100
    return abs(score)
Example #7
def prc_win_std(params):
    df = params[0]
    factor_name = params[1]
    ret_preprocess = factor_processing(
        df[[factor_name]].values,
        pre_process=[winsorize_normal, standardize],
    )
    df["prc_factor"] = ret_preprocess
    return df
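
The (df, factor_name) tuple signature suggests this helper is written for a process pool. A hedged driver sketch; the source frame, factor column name, and pool size are assumptions:

from multiprocessing import Pool

import pandas as pd

# total_df is assumed to hold one row per (trade_date, code) plus the factor
# column; on spawn-based platforms this needs an if __name__ == '__main__' guard.
tasks = [(group, 'alpha_factor') for _, group in total_df.groupby('trade_date')]
with Pool(processes=4) as pool:
    processed = pd.concat(pool.map(prc_win_std, tasks))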
Example #8
    def fetch_dx_return(self,
                        ref_date: str,
                        codes: Iterable[int],
                        expiry_date: str = None,
                        horizon: int = 0,
                        offset: int = 0,
                        neutralized_risks: list = None,
                        pre_process=None,
                        post_process=None,
                        benchmark: int = None) -> pd.DataFrame:
        start_date = ref_date

        if not expiry_date:
            end_date = advanceDateByCalendar(
                'china.sse', ref_date,
                str(1 + horizon + offset + DAILY_RETURN_OFFSET) +
                'b').strftime('%Y-%m-%d')
        else:
            end_date = expiry_date

        query = select([
            Market.trade_date,
            Market.code.label("code"),
            Market.chgPct.label("chgPct")
        ]).where(
            and_(Market.trade_date.between(start_date, end_date),
                 Market.code.in_(codes),
                 Market.flag == 1)).order_by(Market.trade_date, Market.code)

        df = pd.read_sql(query, self.session.bind).dropna()
        df = self._create_stats(df, horizon, offset)
        df = df[df.trade_date == ref_date]

        if benchmark:
            benchmark = _map_index_codes[benchmark]
            query = select(
                [IndexMarket.trade_date,
                 IndexMarket.chgPct.label("chgPct")]).where(
                     and_(IndexMarket.trade_date.between(start_date, end_date),
                          IndexMarket.indexCode == benchmark,
                          IndexMarket.flag == 1))
            df2 = pd.read_sql(query, self.session.bind).dropna()
            df2 = self._create_stats(df2, horizon, offset, no_code=True)
            ind_ret = df2[df2.trade_date == ref_date]['dx'].values[0]
            df['dx'] = df['dx'] - ind_ret

        if neutralized_risks:
            _, risk_exp = self.fetch_risk_model(ref_date, codes)
            df = pd.merge(df, risk_exp, on='code').dropna()
            df[['dx']] = factor_processing(
                df[['dx']].values,
                pre_process=pre_process,
                risk_factors=df[neutralized_risks].values,
                post_process=post_process)
        return df[['code', 'dx']]
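
A hedged call sketch against the signature above; the engine instance, date, codes, and risk name are placeholders:

dx = engine.fetch_dx_return('2020-01-02',
                            codes=[600000, 600036],
                            horizon=4,
                            offset=1,
                            neutralized_risks=['SIZE'],
                            pre_process=[winsorize_normal, standardize],
                            post_process=[standardize])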
Example #9
    def test_quantile_analysis_with_factor_processing(self):
        f_df = pd.DataFrame(self.x)
        calculated = quantile_analysis(f_df,
                                       self.x_w,
                                       self.r,
                                       n_bins=self.n_bins,
                                       risk_exp=self.risk_exp,
                                       pre_process=[winsorize_normal, standardize],
                                       post_process=[standardize])

        er = self.x_w @ factor_processing(self.x,
                                          [winsorize_normal, standardize],
                                          self.risk_exp,
                                          [standardize]).T
        expected = er_quantile_analysis(er, self.n_bins, self.r)
        np.testing.assert_array_almost_equal(calculated, expected)
Example #10
    def test_quantile_analysis_with_benchmark(self):
        f_df = pd.DataFrame(self.x)
        calculated = quantile_analysis(f_df,
                                       self.x_w,
                                       self.r,
                                       n_bins=self.n_bins,
                                       do_neutralize=True,
                                       benchmark=self.b_w,
                                       risk_exp=self.risk_exp,
                                       pre_process=[winsorize_normal, standardize],
                                       post_process=[standardize])

        er = self.x_w @ factor_processing(self.x,
                                          [winsorize_normal, standardize],
                                          self.risk_exp,
                                          [standardize]).T
        raw_er = er_quantile_analysis(er, self.n_bins, self.r)
        expected = raw_er * self.b_w.sum() - np.dot(self.b_w, self.r)
        np.testing.assert_array_almost_equal(calculated, expected)
Example #11
def quantile_analysis(factors: pd.DataFrame,
                      factor_weights: np.ndarray,
                      dx_return: np.ndarray,
                      n_bins: int = 5,
                      risk_exp: Optional[np.ndarray] = None,
                      **kwargs):
    if 'pre_process' in kwargs:
        pre_process = kwargs['pre_process']
        del kwargs['pre_process']
    else:
        pre_process = [winsorize_normal, standardize]

    if 'post_process' in kwargs:
        post_process = kwargs['post_process']
        del kwargs['post_process']
    else:
        post_process = [standardize]

    er = factor_processing(factors.values, pre_process, risk_exp, post_process) @ factor_weights
    return er_quantile_analysis(er, n_bins, dx_return, **kwargs)
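
A hedged usage sketch with synthetic inputs, exercising the defaults above:

import numpy as np
import pandas as pd

n, k = 1000, 3
factors = pd.DataFrame(np.random.randn(n, k))  # placeholder factor values
weights = np.ones(k) / k                       # equal factor weights
dx_return = np.random.randn(n)                 # placeholder forward returns

q_ret = quantile_analysis(factors, weights, dx_return, n_bins=5)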
Example #12
    def fetch_dx_return(self,
                        ref_date: str,
                        codes: Iterable[int],
                        expiry_date: str = None,
                        horizon: int = 0,
                        offset: int = 0,
                        neutralized_risks: list = None,
                        pre_process=None,
                        post_process=None) -> pd.DataFrame:
        start_date = ref_date

        if not expiry_date:
            end_date = advanceDateByCalendar('china.sse', ref_date,
                                             str(1 + horizon + offset + DAILY_RETURN_OFFSET) + 'b').strftime('%Y%m%d')
        else:
            end_date = expiry_date

        stats = self._create_stats(Market, horizon, offset)

        query = select([Market.trade_date, Market.code, stats]).where(
            and_(
                Market.trade_date.between(start_date, end_date),
                Market.code.in_(codes)
            )
        )

        df = pd.read_sql(query, self.session.bind).dropna()
        df = df[df.trade_date == ref_date]

        if neutralized_risks:
            _, risk_exp = self.fetch_risk_model(ref_date, codes)
            df = pd.merge(df, risk_exp, on='code').dropna()
            df[['dx']] = factor_processing(df[['dx']].values,
                                           pre_process=pre_process,
                                           risk_factors=df[neutralized_risks].values,
                                           post_process=post_process)

        return df[['code', 'dx']].drop_duplicates(['code'])
Example #13
def equal_combine(factor_df, factor_list):
    factor_df = factor_df.copy()
    ndiff_field = [
        i for i in list(set(factor_df.columns)) if i not in factor_list
    ]
    # pre-process the data before combining
    alpha_res = []
    grouped = factor_df.groupby(['trade_date'])
    for k, g in grouped:
        ret_preprocess = factor_processing(
            g[factor_list].fillna(0).values,
            pre_process=[winsorize_normal, standardize])
        f = pd.DataFrame(ret_preprocess, columns=factor_list)
        for col in ndiff_field:
            f[col] = g[col].values
        alpha_res.append(f)
    total_data = pd.concat(alpha_res)
    total_data['combine'] = total_data[factor_list].mean(axis=1).values
    score = np.corrcoef(total_data['combine'].fillna(0).values,
                        total_data['ret'].fillna(0).values)[0, 1]
    #score = abs(total_data['combine'].mean()) / 100
    return abs(score)
Example #14
    def run(self, running_setting):
        alpha_logger.info("starting backting ...")
        total_data_groups = self.total_data.groupby('trade_date')

        rets = []
        turn_overs = []
        leverags = []
        previous_pos = pd.DataFrame()
        executor = copy.deepcopy(running_setting.executor)
        positions = pd.DataFrame()

        if self.dask_client is None:
            models = {}
            for ref_date, _ in total_data_groups:
                models[ref_date] = train_model(ref_date.strftime('%Y-%m-%d'),
                                               self.alpha_model,
                                               self.data_meta)
        else:

            def worker(parameters):
                new_model = train_model(parameters[0].strftime('%Y-%m-%d'),
                                        parameters[1], parameters[2])
                return parameters[0], new_model

            l = self.dask_client.map(worker,
                                     [(d[0], self.alpha_model, self.data_meta)
                                      for d in total_data_groups])
            results = self.dask_client.gather(l)
            models = dict(results)

        for ref_date, this_data in total_data_groups:
            new_model = models[ref_date]
            codes = this_data.code.values.tolist()

            if previous_pos.empty:
                current_position = None
            else:
                previous_pos.set_index('code', inplace=True)
                remained_pos = previous_pos.reindex(codes)

                remained_pos.fillna(0., inplace=True)
                current_position = remained_pos.weight.values

            if running_setting.rebalance_method == 'tv':
                risk_cov = self.total_risk_cov[self.total_risk_cov.trade_date
                                               == ref_date]
                sec_cov = self._generate_sec_cov(this_data, risk_cov)
            else:
                sec_cov = None

            benchmark_w = this_data.weight.values
            constraints = LinearConstraints(running_setting.bounds, this_data,
                                            benchmark_w)

            lbound, ubound = self._create_lu_bounds(running_setting, codes,
                                                    benchmark_w)

            features = new_model.features
            dfs = []
            for name in features:
                data_cleaned = this_data.dropna(subset=[name])
                raw_factors = data_cleaned[[name]].values
                new_factors = factor_processing(
                    raw_factors,
                    pre_process=self.data_meta.pre_process,
                    risk_factors=data_cleaned[
                        self.data_meta.neutralized_risk].values.astype(float)
                    if self.data_meta.neutralized_risk else None,
                    post_process=self.data_meta.post_process)
                df = pd.DataFrame(new_factors,
                                  columns=[name],
                                  index=data_cleaned.code)
                dfs.append(df)

            new_factors = pd.concat(dfs, axis=1)
            new_factors = new_factors.loc[codes].fillna(new_factors.median())
            er = new_model.predict(new_factors).astype(float)

            alpha_logger.info('{0} re-balance: {1} codes'.format(
                ref_date, len(er)))
            target_pos = self._calculate_pos(running_setting,
                                             er,
                                             this_data,
                                             constraints,
                                             benchmark_w,
                                             lbound,
                                             ubound,
                                             sec_cov=sec_cov,
                                             current_position=current_position)

            target_pos['code'] = codes
            target_pos['trade_date'] = ref_date

            turn_over, executed_pos = executor.execute(target_pos=target_pos)
            leverage = executed_pos.weight.abs().sum()

            ret = executed_pos.weight.values @ (np.exp(this_data.dx.values) -
                                                1.)
            rets.append(np.log(1. + ret))
            executor.set_current(executed_pos)
            turn_overs.append(turn_over)
            leverags.append(leverage)
            positions = pd.concat([positions, executed_pos])
            previous_pos = executed_pos

        positions['benchmark_weight'] = self.total_data['weight'].values
        positions['dx'] = self.total_data.dx.values

        trade_dates = positions.trade_date.unique()
        ret_df = pd.DataFrame(
            {
                'returns': rets,
                'turn_over': turn_overs,
                'leverage': leverags
            },
            index=trade_dates)

        ret_df['benchmark_returns'] = self.index_return['dx']
        ret_df.loc[advanceDateByCalendar('china.sse', ret_df.index[-1],
                                         self.freq)] = 0.
        ret_df = ret_df.shift(1)
        ret_df.iloc[0] = 0.
        ret_df['excess_return'] = ret_df[
            'returns'] - ret_df['benchmark_returns'] * ret_df['leverage']

        return ret_df, positions
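
A hedged driver sketch: strategy is assumed to be the object exposing the run method above, and the annualization factor assumes weekly rebalancing:

import numpy as np

ret_df, positions = strategy.run(running_setting)
info_ratio = (ret_df.excess_return.mean() / ret_df.excess_return.std()
              * np.sqrt(52))  # 52 periods per year assumes a weekly freq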
Example #15
def fetch_train_phase(engine,
                      alpha_factors: Iterable[object],
                      ref_date,
                      frequency,
                      universe,
                      batch,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0) -> dict:
    transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch + 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = _map_horizon(frequency)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)
    return_df = engine.fetch_dx_return_range(universe,
                                             dates=dates,
                                             horizon=horizon)

    df = pd.merge(factor_df, return_df, on=['trade_date', 'code']).dropna()

    return_df, factor_df = df[['trade_date', 'code',
                               'dx']], df[['trade_date', 'code', 'isOpen'] +
                                          transformer.names]

    return_df, dates, date_label, risk_exp, x_values, y_values, _, _ = \
        _merge_df(engine, transformer.names, factor_df, return_df, universe, dates, risk_model, neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-2]
        start = dates[-batch - 1]
    else:
        end = dates[-1]
        start = dates[-batch]

    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {'x': ne_x, 'y': ne_y}

    return ret
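
A hedged call sketch against the signature above; the engine and universe objects, the factor name, and the risk name are placeholders:

train_set = fetch_train_phase(engine,
                              ['EPS'],
                              '2020-01-02',
                              '1w',
                              universe,
                              batch=8,
                              neutralized_risk=['SIZE'],
                              pre_process=[winsorize_normal, standardize],
                              post_process=[standardize])
x_train, y_train = train_set['train']['x'], train_set['train']['y']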
Example #16
def fetch_predict_phase(engine,
                        alpha_factors: Iterable[object],
                        ref_date,
                        frequency,
                        universe,
                        batch,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0):
    transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates).dropna()

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe,
                                                dates=dates,
                                                risk_model=risk_model)[1]
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] +
                          used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        risk_exp = train_x[neutralized_risk].values.astype(float)
        x_values = train_x[names].values.astype(float)
    else:
        train_x = factor_df.copy()
        x_values = train_x[names].values.astype(float)
        risk_exp = None

    date_label = pd.DatetimeIndex(factor_df.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch]

        # index = (date_label >= start) & (date_label <= end)
        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)

        ne_x = ne_x[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)

        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {'x': ne_x, 'code': codes}

    return ret
Example #17
    total_data = pickle.load(file2)

total_data = total_data.sort_values(by=['trade_date', 'code'], ascending=True)

diff_filed = ['trade_date', 'code', 'ret']
factor_columns = [
    i for i in list(set(total_data.columns))
    if i not in ['trade_date', 'code', 'ret']
]

# winsorize and standardize all factor columns
alpha_res = []
grouped = total_data.groupby(['trade_date'])
for k, g in grouped:
    ret_preprocess = factor_processing(
        g[factor_columns].fillna(0).values,
        pre_process=[winsorize_normal, standardize])
    f = pd.DataFrame(ret_preprocess, columns=factor_columns)
    for col in diff_filed:
        f[col] = g[col].values
    alpha_res.append(f)
total_data = pd.concat(alpha_res)

point = int(np.random.uniform(0, len(factor_columns)) / 2)
ori_field = factor_columns[:point]
add_field = factor_columns[point:]

#best_code, best_field
best_code, best_field = mutation_factors.genetic_run(total_data,
                                                     diff_filed=diff_filed,
                                                     strong_field=ori_field,
Example #18
def fetch_predict_phase(engine,
                        alpha_factors: Union[Transformer, Iterable[object]],
                        ref_date,
                        frequency,
                        universe,
                        batch=1,
                        neutralized_risk: Iterable[str] = None,
                        risk_model: str = 'short',
                        pre_process: Iterable[object] = None,
                        post_process: Iterable[object] = None,
                        warm_start: int = 0,
                        fillna: str = None,
                        fit_target: Union[Transformer, object] = None):
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch - 1) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)

    if fillna:
        factor_df = factor_df.groupby('trade_date').apply(
            lambda x: x.fillna(x.median())).reset_index(drop=True).dropna()
    else:
        factor_df = factor_df.dropna()

    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1],
                                              frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates +
                                                      [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))

    names = transformer.names

    if neutralized_risk:
        risk_df = engine.fetch_risk_model_range(universe,
                                                dates=dates,
                                                risk_model=risk_model)[1]
        used_neutralized_risk = list(set(neutralized_risk).difference(names))
        risk_df = risk_df[['trade_date', 'code'] +
                          used_neutralized_risk].dropna()
        train_x = pd.merge(factor_df, risk_df, on=['trade_date', 'code'])
        train_x = pd.merge(train_x,
                           target_df,
                           on=['trade_date', 'code'],
                           how='left')
        risk_exp = train_x[neutralized_risk].values.astype(float)
    else:
        train_x = pd.merge(factor_df,
                           target_df,
                           on=['trade_date', 'code'],
                           how='left')
        risk_exp = None

    train_x.dropna(inplace=True, subset=train_x.columns[:-1])
    x_values = train_x[names].values.astype(float)
    y_values = train_x[['dx']].values.astype(float)

    date_label = pd.DatetimeIndex(train_x.trade_date).to_pydatetime()
    dates = np.unique(date_label)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

        left_index = bisect.bisect_left(date_label, start)
        right_index = bisect.bisect_right(date_label, end)
        this_raw_x = x_values[left_index:right_index]
        this_raw_y = y_values[left_index:right_index]
        sub_dates = date_label[left_index:right_index]

        if risk_exp is not None:
            this_risk_exp = risk_exp[left_index:right_index]
        else:
            this_risk_exp = None

        ne_x = factor_processing(this_raw_x,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        ne_y = factor_processing(this_raw_y,
                                 pre_process=pre_process,
                                 risk_factors=this_risk_exp,
                                 post_process=post_process)

        inner_left_index = bisect.bisect_left(sub_dates, end)
        inner_right_index = bisect.bisect_right(sub_dates, end)

        ne_x = ne_x[inner_left_index:inner_right_index]
        ne_y = ne_y[inner_left_index:inner_right_index]

        left_index = bisect.bisect_left(date_label, end)
        right_index = bisect.bisect_right(date_label, end)

        codes = train_x.code.values[left_index:right_index]
    else:
        ne_x = None
        ne_y = None
        codes = None

    ret = dict()
    ret['x_names'] = transformer.names
    ret['predict'] = {
        'x': pd.DataFrame(ne_x, columns=transformer.names),
        'code': codes,
        'y': ne_y.flatten() if ne_y is not None else None
    }

    return ret
Example #19
def fetch_train_phase(engine,
                      alpha_factors: Union[Transformer, Iterable[object]],
                      ref_date,
                      frequency,
                      universe,
                      batch=1,
                      neutralized_risk: Iterable[str] = None,
                      risk_model: str = 'short',
                      pre_process: Iterable[object] = None,
                      post_process: Iterable[object] = None,
                      warm_start: int = 0,
                      fit_target: Union[Transformer, object] = None) -> dict:
    if isinstance(alpha_factors, Transformer):
        transformer = alpha_factors
    else:
        transformer = Transformer(alpha_factors)

    p = Period(frequency)
    p = Period(length=-(warm_start + batch) * p.length(), units=p.units())

    start_date = advanceDateByCalendar('china.sse', ref_date, p,
                                       BizDayConventions.Following)
    dates = makeSchedule(start_date,
                         ref_date,
                         frequency,
                         calendar='china.sse',
                         dateRule=BizDayConventions.Following,
                         dateGenerationRule=DateGeneration.Backward)

    horizon = map_freq(frequency)

    factor_df = engine.fetch_factor_range(universe,
                                          factors=transformer,
                                          dates=dates)
    if fit_target is None:
        target_df = engine.fetch_dx_return_range(universe,
                                                 dates=dates,
                                                 horizon=horizon)
    else:
        one_more_date = advanceDateByCalendar('china.sse', dates[-1],
                                              frequency)
        target_df = engine.fetch_factor_range_forward(universe,
                                                      factors=fit_target,
                                                      dates=dates +
                                                      [one_more_date])
        target_df = target_df[target_df.trade_date.isin(dates)]
        target_df = target_df.groupby('code').apply(
            lambda x: x.fillna(method='pad'))

    df = pd.merge(factor_df, target_df, on=['trade_date', 'code']).dropna()

    target_df, factor_df = df[['trade_date', 'code',
                               'dx']], df[['trade_date', 'code'] +
                                          transformer.names]

    target_df, dates, date_label, risk_exp, x_values, y_values, _, _, codes = \
        _merge_df(engine, transformer.names, factor_df, target_df, universe, dates, risk_model,
                  neutralized_risk)

    if dates[-1] == dt.datetime.strptime(ref_date, '%Y-%m-%d'):
        pyFinAssert(
            len(dates) >= 2, ValueError,
            "No previous data for training for the date {0}".format(ref_date))
        end = dates[-2]
        start = dates[-batch - 1] if batch <= len(dates) - 1 else dates[0]
    else:
        end = dates[-1]
        start = dates[-batch] if batch <= len(dates) else dates[0]

    index = (date_label >= start) & (date_label <= end)
    this_raw_x = x_values[index]
    this_raw_y = y_values[index]
    this_code = codes[index]
    if risk_exp is not None:
        this_risk_exp = risk_exp[index]
    else:
        this_risk_exp = None

    ne_x = factor_processing(this_raw_x,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ne_y = factor_processing(this_raw_y,
                             pre_process=pre_process,
                             risk_factors=this_risk_exp,
                             post_process=post_process)

    ret = dict()
    ret['x_names'] = transformer.names
    ret['train'] = {
        'x': pd.DataFrame(ne_x, columns=transformer.names),
        'y': ne_y,
        'code': this_code
    }

    return ret
Example #20
    def run(self, running_setting):
        alpha_logger.info("starting backting ...")
        total_data_groups = self.total_data.groupby('trade_date')

        rets = []
        turn_overs = []
        leverags = []
        previous_pos = pd.DataFrame()
        executor = copy.deepcopy(running_setting.executor)
        positions = pd.DataFrame()

        if self.alpha_models is None:
            self.prepare_backtest_models()

        for ref_date, this_data in total_data_groups:
            risk_model = self.risk_models[ref_date]
            new_model = self.alpha_models[ref_date]
            codes = this_data.code.values.tolist()

            if previous_pos.empty:
                current_position = None
            else:
                previous_pos.set_index('code', inplace=True)
                remained_pos = previous_pos.reindex(codes)

                remained_pos.fillna(0., inplace=True)
                current_position = remained_pos.weight.values

            benchmark_w = this_data.weight.values
            constraints = LinearConstraints(running_setting.bounds, this_data,
                                            benchmark_w)

            lbound, ubound = self._create_lu_bounds(running_setting, codes,
                                                    benchmark_w)

            this_data.fillna(0, inplace=True)
            new_factors = factor_processing(
                this_data[new_model.features].values,
                pre_process=self.data_meta.pre_process,
                risk_factors=this_data[self.data_meta.neutralized_risk].values.
                astype(float) if self.data_meta.neutralized_risk else None,
                post_process=self.data_meta.post_process)
            new_factors = pd.DataFrame(new_factors,
                                       columns=new_model.features,
                                       index=codes)
            er = new_model.predict(new_factors).astype(float)

            alpha_logger.info('{0} re-balance: {1} codes'.format(
                ref_date, len(er)))
            target_pos = self._calculate_pos(
                running_setting,
                er,
                this_data,
                constraints,
                benchmark_w,
                lbound,
                ubound,
                risk_model=risk_model.get_risk_profile(codes),
                current_position=current_position)

            target_pos['code'] = codes
            target_pos['trade_date'] = ref_date

            turn_over, executed_pos = executor.execute(target_pos=target_pos)
            leverage = executed_pos.weight.abs().sum()

            ret = executed_pos.weight.values @ (np.exp(this_data.dx.values) -
                                                1.)
            rets.append(np.log(1. + ret))
            executor.set_current(executed_pos)
            turn_overs.append(turn_over)
            leverags.append(leverage)
            positions = pd.concat([positions, executed_pos])
            previous_pos = executed_pos

        positions['benchmark_weight'] = self.total_data['weight'].values
        positions['dx'] = self.total_data.dx.values

        trade_dates = positions.trade_date.unique()
        ret_df = pd.DataFrame(
            {
                'returns': rets,
                'turn_over': turn_overs,
                'leverage': leverags
            },
            index=trade_dates)

        ret_df['benchmark_returns'] = self.index_return['dx']
        ret_df.loc[advanceDateByCalendar('china.sse', ret_df.index[-1],
                                         self.freq)] = 0.
        ret_df = ret_df.shift(1)
        ret_df.iloc[0] = 0.
        ret_df['excess_return'] = ret_df[
            'returns'] - ret_df['benchmark_returns'] * ret_df['leverage']
        return ret_df, positions
Example #21
    def run(self):
        alpha_logger.info("starting backting ...")

        total_factors = self.engine.fetch_factor_range(
            self.running_setting.universe,
            self.alpha_model.formulas,
            dates=self.running_setting.dates)
        alpha_logger.info("alpha factor data loading finished ...")

        total_industry = self.engine.fetch_industry_matrix_range(
            self.running_setting.universe,
            dates=self.running_setting.dates,
            category=self.running_setting.industry_cat,
            level=self.running_setting.industry_level)
        alpha_logger.info("industry data loading finished ...")

        total_benchmark = self.engine.fetch_benchmark_range(
            dates=self.running_setting.dates,
            benchmark=self.running_setting.benchmark)
        alpha_logger.info("benchmark data loading finished ...")

        total_risk_cov, total_risk_exposure = self.engine.fetch_risk_model_range(
            self.running_setting.universe,
            dates=self.running_setting.dates,
            risk_model=self.data_meta.risk_model)
        alpha_logger.info("risk_model data loading finished ...")

        total_returns = self.engine.fetch_dx_return_range(
            self.running_setting.universe,
            dates=self.running_setting.dates,
            horizon=self.running_setting.horizon,
            offset=1)
        alpha_logger.info("returns data loading finished ...")

        total_data = pd.merge(total_factors,
                              total_industry,
                              on=['trade_date', 'code'])
        total_data = pd.merge(total_data,
                              total_benchmark,
                              on=['trade_date', 'code'],
                              how='left')
        total_data.fillna({'weight': 0.}, inplace=True)
        total_data = pd.merge(total_data,
                              total_returns,
                              on=['trade_date', 'code'])
        total_data = pd.merge(total_data,
                              total_risk_exposure,
                              on=['trade_date', 'code'])

        is_in_benchmark = (total_data.weight > 0.).astype(float).values.reshape(
            (-1, 1))
        total_data.loc[:, 'benchmark'] = is_in_benchmark
        total_data.loc[:, 'total'] = np.ones_like(is_in_benchmark)
        total_data.sort_values(['trade_date', 'code'], inplace=True)
        total_data_groups = total_data.groupby('trade_date')

        rets = []
        turn_overs = []
        leverags = []
        previous_pos = pd.DataFrame()
        executor = copy.deepcopy(self.running_setting.executor)
        positions = pd.DataFrame()

        if self.dask_client is None:
            models = {}
            for ref_date, _ in total_data_groups:
                models[ref_date] = train_model(ref_date.strftime('%Y-%m-%d'),
                                               self.alpha_model,
                                               self.data_meta)
        else:

            def worker(parameters):
                new_model = train_model(parameters[0].strftime('%Y-%m-%d'),
                                        parameters[1], parameters[2])
                return parameters[0], new_model

            l = self.dask_client.map(worker,
                                     [(d[0], self.alpha_model, self.data_meta)
                                      for d in total_data_groups])
            results = self.dask_client.gather(l)
            models = dict(results)

        for ref_date, this_data in total_data_groups:
            new_model = models[ref_date]

            this_data = this_data.fillna(
                this_data[new_model.features].median())
            codes = this_data.code.values.tolist()

            if self.running_setting.rebalance_method == 'tv':
                risk_cov = total_risk_cov[total_risk_cov.trade_date ==
                                          ref_date]
                sec_cov = self._generate_sec_cov(this_data, risk_cov)
            else:
                sec_cov = None

            benchmark_w = this_data.weight.values
            constraints = LinearConstraints(self.running_setting.bounds,
                                            this_data, benchmark_w)

            lbound = np.maximum(
                0., benchmark_w - self.running_setting.weights_bandwidth)
            ubound = self.running_setting.weights_bandwidth + benchmark_w

            if previous_pos.empty:
                current_position = None
            else:
                previous_pos.set_index('code', inplace=True)
                remained_pos = previous_pos.reindex(codes)

                remained_pos.fillna(0., inplace=True)
                current_position = remained_pos.weight.values

            features = new_model.features
            raw_factors = this_data[features].values
            new_factors = factor_processing(
                raw_factors,
                pre_process=self.data_meta.pre_process,
                risk_factors=this_data[self.data_meta.neutralized_risk].values.
                astype(float) if self.data_meta.neutralized_risk else None,
                post_process=self.data_meta.post_process)

            er = new_model.predict(pd.DataFrame(
                new_factors, columns=features)).astype(float)

            alpha_logger.info('{0} re-balance: {1} codes'.format(
                ref_date, len(er)))
            target_pos = self._calculate_pos(er,
                                             this_data,
                                             constraints,
                                             benchmark_w,
                                             lbound,
                                             ubound,
                                             sec_cov=sec_cov,
                                             current_position=current_position,
                                             **self.running_setting.more_opts)

            target_pos['code'] = codes
            target_pos['trade_date'] = ref_date

            turn_over, executed_pos = executor.execute(target_pos=target_pos)
            leverage = executed_pos.weight.abs().sum()

            ret = executed_pos.weight.values @ (np.exp(this_data.dx.values) -
                                                1.)
            rets.append(np.log(1. + ret))
            executor.set_current(executed_pos)
            turn_overs.append(turn_over)
            leverags.append(leverage)
            positions = pd.concat([positions, executed_pos])
            previous_pos = executed_pos

        positions['benchmark_weight'] = total_data['weight'].values
        positions['dx'] = total_data.dx.values

        trade_dates = positions.trade_date.unique()
        ret_df = pd.DataFrame(
            {
                'returns': rets,
                'turn_over': turn_overs,
                'leverage': leverags
            },
            index=trade_dates)

        index_return = self.engine.fetch_dx_return_index_range(
            self.running_setting.benchmark,
            dates=self.running_setting.dates,
            horizon=self.running_setting.horizon,
            offset=1).set_index('trade_date')
        ret_df['benchmark_returns'] = index_return['dx']
        ret_df.loc[advanceDateByCalendar('china.sse', ret_df.index[-1],
                                         self.running_setting.freq)] = 0.
        ret_df = ret_df.shift(1)
        ret_df.iloc[0] = 0.
        ret_df['excess_return'] = ret_df[
            'returns'] - ret_df['benchmark_returns'] * ret_df['leverage']

        return ret_df, positions