def test_neutralize_explain_output(self):
    y = self.y[:, 0].flatten()

    calc_res, other_stats = neutralize(self.x, y, detail=True)

    model = LinearRegression(fit_intercept=False)
    model.fit(self.x, y)

    exp_res = y - self.x @ model.coef_.T
    exp_explained = self.x * model.coef_.T

    np.testing.assert_array_almost_equal(calc_res, exp_res.reshape(-1, 1))
    np.testing.assert_array_almost_equal(other_stats['explained'][:, :, 0], exp_explained)

    calc_res, other_stats = neutralize(self.x, self.y, detail=True)

    model = LinearRegression(fit_intercept=False)
    model.fit(self.x, self.y)

    exp_res = self.y - self.x @ model.coef_.T
    np.testing.assert_array_almost_equal(calc_res, exp_res)

    for i in range(self.y.shape[1]):
        exp_explained = self.x * model.coef_.T[:, i]
        np.testing.assert_array_almost_equal(other_stats['explained'][:, :, i], exp_explained)
def test_neutralize_explain_output_with_group(self):
    y = self.y[:, 0].flatten()

    calc_res, other_stats = neutralize(self.x, y, self.groups, detail=True)

    model = LinearRegression(fit_intercept=False)
    for i in range(30):
        curr_x = self.x[self.groups == i]
        curr_y = y[self.groups == i]
        model.fit(curr_x, curr_y)
        exp_res = curr_y - curr_x @ model.coef_.T
        exp_explained = curr_x * model.coef_.T
        np.testing.assert_array_almost_equal(calc_res[self.groups == i], exp_res.reshape(-1, 1))
        np.testing.assert_array_almost_equal(other_stats['explained'][self.groups == i, :, 0], exp_explained)

    calc_res, other_stats = neutralize(self.x, self.y, self.groups, detail=True)

    model = LinearRegression(fit_intercept=False)
    for i in range(30):
        curr_x = self.x[self.groups == i]
        curr_y = self.y[self.groups == i]
        model.fit(curr_x, curr_y)
        exp_res = curr_y - curr_x @ model.coef_.T
        np.testing.assert_array_almost_equal(calc_res[self.groups == i], exp_res)

        for j in range(self.y.shape[1]):
            exp_explained = curr_x * model.coef_.T[:, j]
            np.testing.assert_array_almost_equal(other_stats['explained'][self.groups == i, :, j], exp_explained)
def process_data(total_data, factor_cols, risk_cols):
    risk_values = total_data[risk_cols].values
    factor_values = total_data[factor_cols].values
    processed_values = np.zeros(factor_values.shape)

    for i in range(processed_values.shape[1]):
        try:
            processed_values[:, i] = neutralize(
                risk_values,
                standardize(winsorize_normal(factor_values[:, [i]]))).flatten()
        except np.linalg.LinAlgError:
            # fall back to the un-standardized factor if the regression fails
            processed_values[:, i] = neutralize(
                risk_values,
                winsorize_normal(factor_values[:, [i]])).flatten()
    return processed_values
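# A minimal sketch of the failure mode the fallback above guards against
# (assumption: neutralize fits ordinary least squares internally, so a
# rank-deficient cross section can surface as a LinAlgError):
import numpy as np

x = np.ones((100, 2))                          # two identical columns -> singular X'X
y = np.random.randn(100, 1)
try:
    beta = np.linalg.solve(x.T @ x, x.T @ y)   # normal equations blow up
except np.linalg.LinAlgError as err:
    print('singular design matrix:', err)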
def calc_decay(self, factors, decay_interval=5):
    interval = decay_interval + 1
    decay_dict = {}
    for column in self._columns:
        factors_names = []
        factors_list = []
        values = {}
        grouped = factors.groupby(by='code')
        for k, group in grouped:
            group = group.sort_values(by='trade_date', ascending=True)
            # lag the factor by 1..decay_interval bars for each code
            for i in range(1, interval):
                group[str(i) + '_' + column] = group[column].shift(i)
            factors_list += group[-interval:].to_dict(orient='records')

        new_factors_sets = pd.DataFrame(factors_list)
        for i in range(1, interval):
            factors_names.append(str(i) + '_' + column)

        industry_dummy = pd.get_dummies(new_factors_sets.indexSymbol)
        neutralized_factors = neutralize(industry_dummy.values.astype(float),
                                         new_factors_sets[factors_names].values,
                                         groups=new_factors_sets['trade_date'])
        new_factors_sets[factors_names] = neutralized_factors

        # mean IC of each lagged factor against the bar's return
        for f in factors_names:
            ic_series = new_factors_sets.groupby('trade_date').apply(
                lambda x: np.corrcoef(x[f].fillna(0), x['chgPct'])[0, 1])
            values[f] = ic_series.mean()

        values = pd.DataFrame([values])
        values.columns = ['q' + str(i) for i in range(1, decay_interval + 1)]
        decay_dict[column] = values
    return decay_dict
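# A tiny illustration (hypothetical frame) of the shift trick used in calc_decay:
# the k-bar lag column pairs an older factor value with the current bar's return,
# so its IC measures how quickly the signal decays.
import pandas as pd

df = pd.DataFrame({'f': [1.0, 2.0, 3.0, 4.0]})
for k in (1, 2):
    df[str(k) + '_f'] = df['f'].shift(k)       # NaN for the first k rows
print(df)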
def benchmark_neutralize_with_groups(n_samples: int, n_features: int, n_loops: int, n_groups: int) -> None:
    print("-" * 60)
    print("Starting least square fitting with group benchmarking")
    print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2}, n_groups: {3})"
          .format(n_samples, n_features, n_loops, n_groups))

    y = np.random.randn(n_samples, 5)
    x = np.random.randn(n_samples, n_features)
    groups = np.random.randint(n_groups, size=n_samples)

    start = dt.datetime.now()
    for _ in range(n_loops):
        _ = neutralize(x, y, groups)
    impl_model_time = dt.datetime.now() - start

    print('{0:20s}: {1}'.format('Implemented model', impl_model_time))

    start = dt.datetime.now()
    model = LinearRegression(fit_intercept=False)
    for _ in range(n_loops):
        for i in range(n_groups):
            curr_x = x[groups == i]
            curr_y = y[groups == i]
            model.fit(curr_x, curr_y)
            _ = curr_y - curr_x @ model.coef_.T
    benchmark_model_time = dt.datetime.now() - start

    print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))
def benchmark_neutralize(n_samples: int, n_features: int, n_loops: int) -> None:
    print("-" * 60)
    print("Starting least square fitting benchmarking")
    print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(
        n_samples, n_features, n_loops))

    y = np.random.randn(n_samples, 5)
    x = np.random.randn(n_samples, n_features)

    start = dt.datetime.now()
    for _ in range(n_loops):
        calc_res = neutralize(x, y)
    impl_model_time = dt.datetime.now() - start

    print('{0:20s}: {1}'.format('Implemented model', impl_model_time))

    start = dt.datetime.now()
    for _ in range(n_loops):
        benchmark_model = LinearRegression(fit_intercept=False)
        benchmark_model.fit(x, y)
        exp_res = y - x @ benchmark_model.coef_.T
    benchmark_model_time = dt.datetime.now() - start

    print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))

    np.testing.assert_array_almost_equal(calc_res, exp_res)
def risk_analysis(net_weight_series: pd.Series,
                  next_bar_return_series: pd.Series,
                  risk_table: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    group_idx = net_weight_series.index.values.astype(int)
    net_pos = net_weight_series.values.reshape((-1, 1))
    risk_factor_cols = risk_table.columns

    idiosyncratic, other_stats = neutralize(risk_table.values,
                                            next_bar_return_series.values,
                                            group_idx,
                                            detail=True)

    systematic = other_stats['explained']
    exposure = other_stats['exposure']

    explained_table = np.hstack((idiosyncratic, systematic[:, :, 0]))
    cols = ['idiosyncratic']
    cols.extend(risk_factor_cols)

    explained_table = pd.DataFrame(explained_table * net_pos,
                                   columns=cols,
                                   index=net_weight_series.index)
    exposure_table = pd.DataFrame(exposure[:, :, 0] * net_pos,
                                  columns=risk_factor_cols,
                                  index=net_weight_series.index)
    return explained_table, exposure_table.groupby(level=0).first()
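# A hedged sketch (random data; the import path is assumed from alpha-mind's
# layout) checking the identity risk_analysis relies on: the idiosyncratic
# residual plus the per-factor explained pieces reconstructs the raw return.
import numpy as np
from alphamind.data.neutralize import neutralize

x = np.random.randn(200, 5)                    # risk exposures
y = np.random.randn(200)                       # next-bar returns
res, stats = neutralize(x, y, detail=True)
total = res.flatten() + stats['explained'][:, :, 0].sum(axis=1)
np.testing.assert_array_almost_equal(total, y)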
def test_neutralize(self):
    calc_res = neutralize(self.x, self.y)

    model = LinearRegression(fit_intercept=False)
    model.fit(self.x, self.y)

    exp_res = self.y - self.x @ model.coef_.T

    np.testing.assert_array_almost_equal(calc_res, exp_res)
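# A numpy-only sketch of what this test asserts: neutralization is the residual
# of an ordinary least squares fit of y on x with no intercept.
import numpy as np

x = np.random.randn(1000, 3)
y = np.random.randn(1000, 2)
beta, *_ = np.linalg.lstsq(x, y, rcond=None)   # (3, 2) coefficient matrix
residual = y - x @ beta                        # the neutralized factors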
def test_neutralize_with_group(self):
    calc_res = neutralize(self.x, self.y, self.groups)

    model = LinearRegression(fit_intercept=False)
    for i in range(30):
        curr_x = self.x[self.groups == i]
        curr_y = self.y[self.groups == i]
        model.fit(curr_x, curr_y)
        exp_res = curr_y - curr_x @ model.coef_.T
        np.testing.assert_array_almost_equal(calc_res[self.groups == i], exp_res)
def factor_processing(raw_factor: np.ndarray,
                      pre_process: Optional[List] = None,
                      risk_factors: Optional[np.ndarray] = None) -> np.ndarray:
    new_factor = raw_factor

    if pre_process:
        for p in pre_process:
            new_factor = p(new_factor)

    if risk_factors is not None:
        new_factor = neutralize(risk_factors, new_factor)

    return new_factor
def test_factor_processing(self):
    new_factor = factor_processing(self.raw_factor)
    np.testing.assert_array_almost_equal(new_factor, self.raw_factor)

    new_factor = factor_processing(self.raw_factor,
                                   pre_process=[standardize, winsorize_normal])
    np.testing.assert_array_almost_equal(new_factor,
                                         winsorize_normal(standardize(self.raw_factor)))

    new_factor = factor_processing(self.raw_factor,
                                   pre_process=[standardize, winsorize_normal],
                                   risk_factors=self.risk_factor)
    np.testing.assert_array_almost_equal(new_factor,
                                         neutralize(self.risk_factor,
                                                    winsorize_normal(standardize(self.raw_factor))))
def factor_processing(raw_factors: np.ndarray,
                      pre_process: Optional[List] = None,
                      risk_factors: Optional[np.ndarray] = None,
                      post_process: Optional[List] = None,
                      groups=None) -> np.ndarray:
    new_factors = raw_factors

    if pre_process:
        for p in pre_process:
            new_factors = p(new_factors, groups=groups)

    if risk_factors is not None:
        # drop risk columns with zero column sum (all-zero industry dummies, for instance)
        risk_factors = risk_factors[:, risk_factors.sum(axis=0) != 0]
        new_factors = neutralize(risk_factors, new_factors, groups=groups)

    if post_process:
        for p in post_process:
            new_factors = p(new_factors, groups=groups)

    return new_factors
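# A hedged usage sketch (random arrays; module paths assumed from alpha-mind's
# layout) chaining the same pre-process, neutralize and post-process steps:
import numpy as np
from alphamind.data.processing import factor_processing
from alphamind.data.winsorize import winsorize_normal
from alphamind.data.standardize import standardize

raw = np.random.randn(1000, 1)                 # one raw factor column
risk = np.random.randn(1000, 10)               # risk exposures
groups = np.random.randint(5, size=1000)       # e.g. trade-date buckets

clean = factor_processing(raw,
                          pre_process=[winsorize_normal, standardize],
                          risk_factors=risk,
                          post_process=[standardize],
                          groups=groups)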
def on_factor_processing(self, new_factors_sets, columns=[]):
    calc_columns = columns if len(columns) > 0 else self._columns

    # NaN handling should depend on the factor type: fundamental factors
    # (growth, value, quality) should use the industry median, the rest zero.
    # For now everything is simply filled with zero.
    for column in calc_columns:
        new_factors_sets[column] = new_factors_sets[column].fillna(0)

    # winsorize (clip outliers)
    for column in calc_columns:
        new_factors_sets['winsorize_' + column] = winsorize_normal(
            new_factors_sets[column].values.reshape(-1, 1), num_stds=1).flatten()

    # neutralize against industry/risk factors
    for column in calc_columns:
        new_factors_sets['neutralize_' + column] = neutralize(
            new_factors_sets[self._risk_columns].values.astype(float),
            new_factors_sets['winsorize_' + column].values).flatten()

    # standardize
    for column in calc_columns:
        new_factors_sets['standardize_' + column] = standardize(
            new_factors_sets['neutralize_' + column].values.reshape(-1, 1)).flatten()

    # fill remaining NaNs with zero for now
    for column in calc_columns:
        new_factors_sets[column] = new_factors_sets[column].fillna(0)

    return new_factors_sets
def factor_processing(raw_factors: np.ndarray,
                      pre_process: Optional[List] = None,
                      risk_factors: Optional[np.ndarray] = None,
                      post_process: Optional[List] = None,
                      groups=None) -> np.ndarray:
    new_factors = raw_factors

    if pre_process:
        for p in pre_process:
            new_factors = p(new_factors, groups=groups)

    if risk_factors is not None:
        risk_factors = risk_factors[:, risk_factors.sum(axis=0) != 0]
        new_factors = neutralize(risk_factors, new_factors, groups=groups)

    if post_process:
        for p in post_process:
            if p.__name__ == 'winsorize_normal':
                alpha_logger.warning("winsorize_normal normally should not be done after neutralize")
            new_factors = p(new_factors, groups=groups)

    return new_factors
def update_daily_portfolio(ds, **kwargs):
    execution_date = kwargs['next_execution_date']

    if not isBizDay('china.sse', execution_date):
        logger.info("{0} is not a business day".format(execution_date))
        return 0

    prev_date = advanceDateByCalendar('china.sse', execution_date, '-1b')

    logger.info("factor data is loading for {0}".format(prev_date))
    logger.info("Current running date is {0}".format(execution_date))

    common_factors = ['EPSAfterNonRecurring', 'DivP']
    prod_factors = ['CFinc1', 'BDTO', 'RVOL']
    uqer_factors = ['CoppockCurve', 'EPS']

    factor_weights = np.array([-1.0, 2.0])
    factor_weights = factor_weights / factor_weights.sum()

    engine = sqlalchemy.create_engine('mysql+mysqldb://sa:[email protected]/multifactor?charset=utf8')
    engine2 = sqlalchemy.create_engine('mysql+pymysql://sa:[email protected]:3306/multifactor?charset=utf8')

    common_factors_df = pd.read_sql("select Code, 申万一级行业, {0} from factor_data where Date = '{1}'"
                                    .format(','.join(common_factors), prev_date), engine)
    prod_factors_df = pd.read_sql("select Code, {0} from prod_500 where Date = '{1}'"
                                  .format(','.join(prod_factors), prev_date), engine)
    uqer_factor_df = pd.read_sql("select Code, {0} from factor_uqer where Date = '{1}'"
                                 .format(','.join(uqer_factors), prev_date), engine2)
    risk_factor_df = pd.read_sql("select Code, {0} from risk_factor_500 where Date = '{1}'"
                                 .format(','.join(risk_factors_500), prev_date), engine)

    index_components_df = get_etf_index_weight.get_nffund_idx_etf_component(prev_date.strftime('%Y%m%d'),
                                                                            index='zz500')
    index_industry_weights = get_etf_index_weight.get_sw_industry_weight(index_components_df)
    index_components_df.rename(columns={'weight': 'benchmark'}, inplace=True)

    total_data = pd.merge(common_factors_df, uqer_factor_df, on=['Code'])
    total_data = pd.merge(total_data, risk_factor_df, on=['Code'])
    total_data = pd.merge(total_data, index_components_df, on=['Code'])
    total_data = total_data[total_data['benchmark'] != 0]

    null_flags = np.any(np.isnan(total_data[uqer_factors]), axis=1)
    total_data.fillna(0, inplace=True)

    total_factors = uqer_factors
    risk_factors_names = risk_factors_500 + ['Market']
    total_data['Market'] = 1.

    all_factors = total_data[total_factors]
    risk_factors = total_data[risk_factors_names]

    factor_processed = neutralize(risk_factors.values,
                                  standardize(winsorize_normal(all_factors.values)))

    normed_factor = pd.DataFrame(factor_processed,
                                 columns=total_factors,
                                 index=[prev_date] * len(factor_processed))

    er = normed_factor @ factor_weights

    # portfolio construction

    bm = total_data['benchmark'].values
    lbound = np.zeros(len(total_data))
    ubound = 0.01 + bm
    risk_exposure = total_data[risk_factors_names].values
    ubound[null_flags] = 0.

    if len(bm) != 500:
        total_weight = index_industry_weights['weight'].sum()
        filtered = index_industry_weights[index_industry_weights.industry.isin(risk_factors_500)]
        ind_weights = filtered['weight'].values

        risk_lbound = np.concatenate([ind_weights / total_weight,
                                      [bm @ total_data['Size'].values / total_weight],
                                      [1.]], axis=0)
        risk_ubound = np.concatenate([ind_weights / total_weight,
                                      [bm @ total_data['Size'].values / total_weight],
                                      [1.]], axis=0)
    else:
        risk_lbound = bm @ risk_exposure
        risk_ubound = bm @ risk_exposure

    # get black list 1
    engine = sqlalchemy.create_engine('mssql+pymssql://sa:[email protected]/WindDB')
    black_list = pd.read_sql("select S_INFO_WINDCODE, S_INFO_LISTDATE, sum(S_SHARE_RATIO) as s_ratio from ASHARECOMPRESTRICTED "
                             "where S_INFO_LISTDATE BETWEEN '{0}' and '{1}' "
                             "GROUP BY S_INFO_WINDCODE, S_INFO_LISTDATE ORDER BY s_ratio DESC;"
                             .format((execution_date - dt.timedelta(days=7)).strftime('%Y%m%d'),
                                     (execution_date + dt.timedelta(days=14)).strftime('%Y%m%d')), engine)
    black_list = black_list[black_list['s_ratio'] >= 3.]
    black_list.S_INFO_WINDCODE = black_list.S_INFO_WINDCODE.str.split('.').apply(lambda x: int(x[0]))
    mask_array = total_data.Code.isin(black_list.S_INFO_WINDCODE)
    ubound[mask_array.values] = 0.

    # get black list 2
    black_list2 = pd.read_sql("select S_INFO_WINDCODE, AVG(S_WQ_AMOUNT) as avg_amount from ASHAREWEEKLYYIELD "
                              "where TRADE_DT < {1} and TRADE_DT >= {0} GROUP BY S_INFO_WINDCODE;"
                              .format((execution_date - dt.timedelta(days=30)).strftime('%Y%m%d'),
                                      execution_date.strftime('%Y%m%d')), engine)
    black_list2 = black_list2[black_list2['avg_amount'] <= 15000.]
    black_list2.S_INFO_WINDCODE = black_list2.S_INFO_WINDCODE.str.split('.').apply(lambda x: int(x[0]))
    mask_array2 = total_data.Code.isin(black_list2.S_INFO_WINDCODE)
    ubound[mask_array2.values] = 0.

    # get black list 3
    black_list3 = pd.read_sql("SELECT S_INFO_WINDCODE, S_DQ_SUSPENDDATE FROM ASHARETRADINGSUSPENSION AS a "
                              "WHERE a.S_DQ_SUSPENDDATE = (SELECT top 1 S_DQ_SUSPENDDATE FROM ASHARETRADINGSUSPENSION AS b "
                              "WHERE a.S_INFO_WINDCODE=b.S_INFO_WINDCODE and cast(floor(cast(b.OPDATE as float)) as datetime) <= '{0}' ORDER BY b.S_DQ_SUSPENDDATE DESC) "
                              "AND a.S_INFO_WINDCODE IN (SELECT S_INFO_WINDCODE FROM ASHAREDESCRIPTION AS c "
                              "WHERE c.S_INFO_DELISTDATE IS NULL) AND (a.S_DQ_SUSPENDDATE>='{1}' OR (a.S_DQ_RESUMPDATE IS NULL AND a.S_DQ_SUSPENDTYPE=444003000))"
                              .format(execution_date, execution_date.strftime('%Y%m%d')), engine)
    black_list3.S_INFO_WINDCODE = black_list3.S_INFO_WINDCODE.str.split('.').apply(lambda x: int(x[0]))
    mask_array3 = total_data.Code.isin(black_list3.S_INFO_WINDCODE)
    ubound[mask_array3.values] = 0.

    # manual black list
    try:
        bk_list = pd.read_csv('~/mnt/sharespace/personal/licheng/portfolio/zz500_black_list/{0}.csv'.format(
            prev_date.strftime('%Y-%m-%d')), encoding='gbk', names=['code'])
        logger.info('Manual black list exists for the date: {0}'.format(prev_date.strftime('%Y-%m-%d')))
        for code in bk_list['code']:
            ubound[total_data.Code == int(code)] = 0.
    except FileNotFoundError:
        logger.info('No manual black list exists for the date: {0}'.format(prev_date.strftime('%Y-%m-%d')))

    weights = build_portfolio(er,
                              builder='linear',
                              risk_exposure=risk_exposure,
                              lbound=lbound,
                              ubound=ubound,
                              risk_target=(risk_lbound, risk_ubound),
                              solver='GLPK')

    portfolio = pd.DataFrame({'weight': weights,
                              'industry': total_data['申万一级行业'].values,
                              'zz500': total_data['benchmark'].values,
                              'er': er}, index=total_data.Code)

    client = pymongo.MongoClient('mongodb://10.63.6.176:27017')
    db = client.multifactor
    portfolio_collection = db.portfolio

    detail_info = {}
    for code, w, bm_w, ind, r in zip(total_data.Code.values,
                                     weights,
                                     total_data['benchmark'].values,
                                     total_data['申万一级行业'].values,
                                     er):
        detail_info[str(code)] = {
            'weight': w,
            'industry': ind,
            'zz500': bm_w,
            'er': r
        }

    portfolio_dict = {'Date': prev_date, 'portfolio': detail_info}
    portfolio_collection.delete_many({'Date': prev_date})
    portfolio_collection.insert_one(portfolio_dict)

    portfolio.to_csv('~/mnt/sharespace/personal/licheng/portfolio/zz500/{0}.csv'.format(prev_date.strftime('%Y-%m-%d')),
                     encoding='gbk')
    return 0
index_components, ref_date), engine)

total_data = pd.merge(common_factors_df, prod_factors_df, on=['Date', 'Code'])
total_data = pd.merge(total_data, risk_factor_df, on=['Date', 'Code'])
total_data = pd.merge(total_data, index_components_df, on=['Date', 'Code'])
total_data = total_data[total_data[index_components] != 0]
total_data[index_components] = total_data[index_components] / 100.0

total_factors = common_factors + prod_factors
risk_factors_names = risk_factors_500 + ['Market']
total_data['Market'] = 1.

all_factors = total_data[total_factors]
risk_factors = total_data[risk_factors_names]

factor_processed = neutralize(risk_factors.values,
                              standardize(winsorize_normal(all_factors.values)))

normed_factor = pd.DataFrame(factor_processed,
                             columns=total_factors,
                             index=total_data.Date)

er = normed_factor @ factor_weights

# portfolio construction

bm = total_data[index_components].values
lbound = 0.
ubound = 0.01 + bm
lbound_exposure = -0.01
ubound_exposure = 0.01
risk_exposure = total_data[risk_factors_names].values