def process_data(total_data, factor_cols, risk_cols):
    """Winsorize, standardize and risk-neutralize each factor column.

    Every column named in ``factor_cols`` is processed on its own with
    ``winsorize_normal`` -> ``standardize`` -> ``neutralize`` (against the
    ``risk_cols`` columns). When that pipeline raises ``LinAlgError`` the
    column is processed again without the ``standardize`` step.

    Parameters
    ----------
    total_data :
        Object supporting ``total_data[list_of_columns].values``
        (presumably a pandas DataFrame -- confirm against callers).
    factor_cols :
        Column labels of the factors to process.
    risk_cols :
        Column labels of the risk exposures to neutralize against.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``total_data[factor_cols].values``.
    """
    risk_values = total_data[risk_cols].values
    factor_values = total_data[factor_cols].values
    processed_values = np.zeros(factor_values.shape)

    for i in range(processed_values.shape[1]):
        # ``[:, [i]]`` keeps the column two-dimensional, shape (n, 1).
        try:
            processed_values[:, i] = neutralize(
                risk_values,
                standardize(winsorize_normal(factor_values[:, [i]]))).flatten()
        # Public alias of the exception; ``np.linalg.linalg`` is a private
        # module path that newer NumPy releases deprecate.
        except np.linalg.LinAlgError:
            processed_values[:, i] = neutralize(
                risk_values,
                winsorize_normal(factor_values[:, [i]])).flatten()
    return processed_values
def test_factor_processing(self):
    """Check factor_processing with no options, with pre-processing, and with risk factors."""
    raw = self.raw_factor

    # No options: the result is expected to equal the raw factor.
    untouched = factor_processing(raw)
    np.testing.assert_array_almost_equal(untouched, raw)

    # Pre-processors are expected to be applied in list order.
    pre_processed = factor_processing(
        raw,
        pre_process=[standardize, winsorize_normal])
    expected = winsorize_normal(standardize(raw))
    np.testing.assert_array_almost_equal(pre_processed, expected)

    # With risk factors the pre-processed result is additionally neutralized.
    neutralized = factor_processing(
        raw,
        pre_process=[standardize, winsorize_normal],
        risk_factors=self.risk_factor)
    expected = neutralize(self.risk_factor,
                          winsorize_normal(standardize(raw)))
    np.testing.assert_array_almost_equal(neutralized, expected)
def benchmark_winsorize_normal(n_samples: int, n_features: int, n_loops: int) -> None:
    """Time ``winsorize_normal`` against a plain NumPy reference and print both.

    A random ``(n_samples, n_features)`` normal matrix is generated and each
    implementation is called ``n_loops`` times on it; wall-clock durations
    are printed.
    """
    print("-" * 60)
    print("Starting winsorize normal benchmarking")
    print("Parameters(n_samples: {0}, n_features: {1}, n_loops: {2})".format(
        n_samples, n_features, n_loops))

    num_stds = 2
    x = np.random.randn(n_samples, n_features)

    def reference(data):
        # Column-wise clip to mean +/- num_stds * std using np.where.
        spread = data.std(axis=0)
        center = data.mean(axis=0)
        floor = center - num_stds * spread
        ceiling = center + num_stds * spread
        capped = np.where(data > ceiling, ceiling, data)
        return np.where(capped < floor, floor, capped)

    def run_timed(call):
        # Wall-clock duration of n_loops invocations of ``call`` on x.
        began = dt.datetime.now()
        for _ in range(n_loops):
            call(x)
        return dt.datetime.now() - began

    impl_model_time = run_timed(lambda data: winsorize_normal(data, num_stds))
    print('{0:20s}: {1}'.format('Implemented model', impl_model_time))

    benchmark_model_time = run_timed(reference)
    print('{0:20s}: {1}'.format('Benchmark model', benchmark_model_time))
def test_winsorize_normal_with_group_and_interp(self):
    """Grouped winsorization with method='interp' must match a hand-written reference."""
    cal_winsorized = winsorize_normal(self.x,
                                      self.num_stds,
                                      groups=self.groups,
                                      method='interp')

    def reference(block):
        data = block.values
        sigma = data.std(axis=0, ddof=1)
        mu = data.mean(axis=0)
        floor = mu - self.num_stds * sigma
        ceiling = mu + self.num_stds * sigma

        out = data.copy()

        # Values above the ceiling are placed at ceiling + rank fraction * 0.5 * sigma.
        above = out > ceiling
        ranks = out[above].argsort().argsort()
        if len(ranks) > 0:
            out[above] = ceiling + ranks / len(ranks) * 0.5 * sigma

        # Same below the floor; ranks are taken on the negated values.
        below = out < floor
        ranks = (-out[below]).argsort().argsort()
        if len(ranks) > 0:
            out[below] = floor - ranks / len(ranks) * 0.5 * sigma

        return out

    grouped = pd.DataFrame(self.x).groupby(self.groups)
    exp_winsorized = grouped.transform(reference).values
    np.testing.assert_array_almost_equal(cal_winsorized, exp_winsorized)
def test_winsorize_normal_with_interp(self):
    """winsorize_normal with method='interp' must match a per-column reference."""
    calc_winsorized = winsorize_normal(self.x, self.num_stds, method='interp')

    sigma = self.x.std(axis=0, ddof=1)
    mu = self.x.mean(axis=0)
    floors = mu - self.num_stds * sigma
    ceilings = mu + self.num_stds * sigma

    for col in range(calc_winsorized.shape[1]):
        expected = self.x[:, col].copy()

        # Values above the ceiling are placed at ceiling + rank fraction * 0.5 * sigma.
        above = expected > ceilings[col]
        ranks = expected[above].argsort().argsort()
        if len(ranks) > 0:
            expected[above] = ceilings[col] + ranks / len(ranks) * 0.5 * sigma[col]

        # Same below the floor; ranks are taken on the negated values.
        below = expected < floors[col]
        ranks = (-expected[below]).argsort().argsort()
        if len(ranks) > 0:
            expected[below] = floors[col] - ranks / len(ranks) * 0.5 * sigma[col]

        np.testing.assert_array_almost_equal(expected, calc_winsorized[:, col])
def test_winsorize_normal_with_group(self):
    """Grouped winsorization must match a per-group clip to mean +/- num_stds * std."""
    cal_winsorized = winsorize_normal(self.x, self.num_stds, groups=self.groups)

    def clip_to_band(block):
        spread = block.std(axis=0, ddof=1)
        center = block.mean(axis=0)
        floor = center - self.num_stds * spread
        ceiling = center + self.num_stds * spread
        capped = np.where(block > ceiling, ceiling, block)
        return np.where(capped < floor, floor, capped)

    grouped = pd.DataFrame(self.x).groupby(self.groups)
    exp_winsorized = grouped.transform(clip_to_band).values
    np.testing.assert_array_almost_equal(cal_winsorized, exp_winsorized)
def test_winsorize_normal(self):
    """winsorize_normal must clip every column to mean +/- num_stds * std (ddof=1)."""
    calc_winsorized = winsorize_normal(self.x, self.num_stds)

    std_values = self.x.std(axis=0, ddof=1)
    mean_value = self.x.mean(axis=0)

    lower_bound = mean_value - self.num_stds * std_values
    upper_bound = mean_value + self.num_stds * std_values

    for i in range(np.size(calc_winsorized, 1)):
        # Copy the column: a plain slice is a view, and clipping it in place
        # would overwrite the shared ``self.x`` fixture.
        col_data = self.x[:, i].copy()
        col_data[col_data > upper_bound[i]] = upper_bound[i]
        col_data[col_data < lower_bound[i]] = lower_bound[i]

        calculated_col = calc_winsorized[:, i]
        np.testing.assert_array_almost_equal(col_data, calculated_col)
def on_factor_processing(self, new_factors_sets, columns=None):
    """Add winsorized, neutralized and standardized versions of factor columns.

    For every processed column ``c`` the NaNs in ``c`` are replaced by 0 and
    three derived columns are written to ``new_factors_sets``:
    ``'winsorize_' + c``, ``'neutralize_' + c`` (neutralized against
    ``self._risk_columns``) and ``'standardize_' + c``.

    Parameters
    ----------
    new_factors_sets :
        Table holding the factor and risk columns; modified in place
        (presumably a pandas DataFrame -- confirm against callers).
    columns :
        Columns to process. ``None`` or an empty sequence falls back to
        ``self._columns``.

    Returns
    -------
    The same ``new_factors_sets`` object that was passed in.
    """
    has_columns = columns is not None and len(columns) > 0
    calc_columns = columns if has_columns else self._columns

    # NaN handling should depend on the factor type: fundamental factors
    # (growth, value, quality) are meant to use the industry median and the
    # rest 0. For now everything is filled with 0.
    for column in calc_columns:
        new_factors_sets[column] = new_factors_sets[column].fillna(0)

    # Winsorize (remove extreme values).
    for column in calc_columns:
        new_factors_sets['winsorize_' + column] = winsorize_normal(
            new_factors_sets[column].values.reshape(-1, 1),
            num_stds=1).flatten()

    # Neutralize against the industry / risk columns.
    for column in calc_columns:
        new_factors_sets['neutralize_' + column] = neutralize(
            new_factors_sets[self._risk_columns].values.astype(float),
            new_factors_sets['winsorize_' + column].values).flatten()

    # Standardize; flattened like the two steps above so that a 1-D array is
    # assigned to the column.
    for column in calc_columns:
        new_factors_sets['standardize_' + column] = standardize(
            new_factors_sets['neutralize_' + column].values.reshape(-1, 1)).flatten()

    # For now fill with 0.
    # NOTE(review): the raw columns were already filled above and have not
    # been written since, so this pass looks like a no-op -- confirm whether
    # the derived columns were meant here.
    for column in calc_columns:
        new_factors_sets[column] = new_factors_sets[column].fillna(0)

    return new_factors_sets
def _wind_codes_to_int(wind_codes):
    """Return, for each string in *wind_codes*, the integer before its first '.'."""
    return wind_codes.str.split('.').apply(lambda parts: int(parts[0]))


def update_daily_portfolio(ds, **kwargs):
    """Build the zz500 portfolio for the previous business day and store it.

    Steps, in order:

    1. Return early when ``kwargs['next_execution_date']`` is not a
       'china.sse' business day.
    2. Load factor, risk and index-component data for the previous business
       day and inner-join them on ``Code``.
    3. Winsorize, standardize and neutralize the factors and combine them
       into an expected-return vector.
    4. Set the upper weight bound to 0 for codes with missing factor data and
       for codes found by three database queries or a manual CSV black list.
    5. Solve for the weights with ``build_portfolio`` and write the result to
       MongoDB and to a CSV file.

    Parameters
    ----------
    ds :
        Unused.
    **kwargs :
        Must contain ``'next_execution_date'``, a datetime-like object
        (``strftime`` and ``timedelta`` arithmetic are applied to it).

    Returns
    -------
    int
        Always 0.
    """
    execution_date = kwargs['next_execution_date']

    if not isBizDay('china.sse', execution_date):
        logger.info("{0} is not a business day".format(execution_date))
        return 0

    prev_date = advanceDateByCalendar('china.sse', execution_date, '-1b')

    logger.info("factor data is loading for {0}".format(prev_date))
    logger.info("Current running date is {0}".format(execution_date))

    common_factors = ['EPSAfterNonRecurring', 'DivP']
    uqer_factors = ['CoppockCurve', 'EPS']

    factor_weights = np.array([-1.0, 2.0])
    factor_weights = factor_weights / factor_weights.sum()

    # NOTE(review): connection strings (with credentials) are hard-coded here
    # and below; they should come from configuration.
    engine = sqlalchemy.create_engine('mysql+mysqldb://sa:[email protected]/multifactor?charset=utf8')
    engine2 = sqlalchemy.create_engine(
        'mysql+pymysql://sa:[email protected]:3306/multifactor?charset=utf8')

    common_factors_df = pd.read_sql("select Code, 申万一级行业, {0} from factor_data where Date = '{1}'"
                                    .format(','.join(common_factors), prev_date), engine)

    uqer_factor_df = pd.read_sql(
        "select Code, {0} from factor_uqer where Date = '{1}'".format(','.join(uqer_factors), prev_date), engine2)

    risk_factor_df = pd.read_sql("select Code, {0} from risk_factor_500 where Date = '{1}'"
                                 .format(','.join(risk_factors_500), prev_date), engine)

    index_components_df = get_etf_index_weight.get_nffund_idx_etf_component(prev_date.strftime('%Y%m%d'),
                                                                            index='zz500')
    index_industry_weights = get_etf_index_weight.get_sw_industry_weight(index_components_df)
    index_components_df.rename(columns={'weight': 'benchmark'}, inplace=True)

    total_data = pd.merge(common_factors_df, uqer_factor_df, on=['Code'])
    total_data = pd.merge(total_data, risk_factor_df, on=['Code'])
    total_data = pd.merge(total_data, index_components_df, on=['Code'])

    # Copy after filtering so the in-place edits below act on an independent
    # frame rather than on a slice of the merged one.
    total_data = total_data[total_data['benchmark'] != 0].copy()

    # Rows with a missing factor value; recorded before NaNs are replaced.
    null_flags = np.any(np.isnan(total_data[uqer_factors].values), axis=1)
    total_data.fillna(0, inplace=True)

    total_factors = uqer_factors
    risk_factors_names = risk_factors_500 + ['Market']
    total_data['Market'] = 1.

    all_factors = total_data[total_factors]
    risk_factors = total_data[risk_factors_names]

    factor_processed = neutralize(risk_factors.values,
                                  standardize(winsorize_normal(all_factors.values)))

    # Plain array on purpose: a Series indexed by date would be re-aligned
    # against the Code index when the portfolio frame is built below.
    er = factor_processed @ factor_weights

    # portfolio construction
    bm = total_data['benchmark'].values
    lbound = np.zeros(len(total_data))
    ubound = 0.01 + bm
    risk_exposure = total_data[risk_factors_names].values

    ubound[null_flags] = 0.

    if len(bm) != 500:
        # Fewer (or more) than 500 rows survived the joins: take the risk
        # targets from the index industry weights instead of from ``bm``.
        total_weight = index_industry_weights['weight'].sum()
        filtered = index_industry_weights[index_industry_weights.industry.isin(risk_factors_500)]
        ind_weights = filtered['weight'].values

        risk_lbound = np.concatenate([ind_weights / total_weight,
                                      [bm @ total_data['Size'].values / total_weight],
                                      [1.]], axis=0)
    else:
        risk_lbound = bm @ risk_exposure
    # Lower and upper risk targets are equal: exposures are pinned.
    risk_ubound = risk_lbound.copy()

    # get black list 1
    wind_engine = sqlalchemy.create_engine('mssql+pymssql://sa:[email protected]/WindDB')
    black_list = pd.read_sql(
        "select S_INFO_WINDCODE, S_INFO_LISTDATE, sum(S_SHARE_RATIO) as s_ratio from ASHARECOMPRESTRICTED "
        "where S_INFO_LISTDATE BETWEEN '{0}' and '{1}' "
        "GROUP BY S_INFO_WINDCODE, S_INFO_LISTDATE ORDER BY s_ratio DESC;"
        .format((execution_date - dt.timedelta(days=7)).strftime('%Y%m%d'),
                (execution_date + dt.timedelta(days=14)).strftime('%Y%m%d')), wind_engine)

    black_list = black_list[black_list['s_ratio'] >= 3.]
    mask_array = total_data.Code.isin(_wind_codes_to_int(black_list.S_INFO_WINDCODE))
    ubound[mask_array.values] = 0.

    # get black list 2
    black_list2 = pd.read_sql(
        "select S_INFO_WINDCODE, AVG(S_WQ_AMOUNT) as avg_amount from ASHAREWEEKLYYIELD "
        "where TRADE_DT < {1} and TRADE_DT >= {0} GROUP BY S_INFO_WINDCODE;"
        .format((execution_date - dt.timedelta(days=30)).strftime('%Y%m%d'),
                execution_date.strftime('%Y%m%d')), wind_engine)
    black_list2 = black_list2[black_list2['avg_amount'] <= 15000.]
    mask_array2 = total_data.Code.isin(_wind_codes_to_int(black_list2.S_INFO_WINDCODE))
    ubound[mask_array2.values] = 0.

    # get black list 3
    black_list3 = pd.read_sql(
        "SELECT S_INFO_WINDCODE, S_DQ_SUSPENDDATE FROM ASHARETRADINGSUSPENSION AS a "
        "WHERE a.S_DQ_SUSPENDDATE = (SELECT top 1 S_DQ_SUSPENDDATE FROM ASHARETRADINGSUSPENSION AS b "
        "WHERE a.S_INFO_WINDCODE=b.S_INFO_WINDCODE and cast(floor(cast(b.OPDATE as float)) as datetime) <= '{0}' ORDER BY b.S_DQ_SUSPENDDATE DESC) "
        "AND a.S_INFO_WINDCODE IN (SELECT S_INFO_WINDCODE FROM ASHAREDESCRIPTION AS c "
        "WHERE c.S_INFO_DELISTDATE IS NULL) AND (a.S_DQ_SUSPENDDATE>='{1}' OR (a.S_DQ_RESUMPDATE IS NULL AND a.S_DQ_SUSPENDTYPE=444003000))"
        .format(execution_date, execution_date.strftime('%Y%m%d')), wind_engine)
    mask_array3 = total_data.Code.isin(_wind_codes_to_int(black_list3.S_INFO_WINDCODE))
    ubound[mask_array3.values] = 0.

    # manual black list
    try:
        bk_list = pd.read_csv('~/mnt/sharespace/personal/licheng/portfolio/zz500_black_list/{0}.csv'.format(
            prev_date.strftime('%Y-%m-%d')),
            encoding='gbk',
            names=['code'])
    except FileNotFoundError:
        logger.info('No manual black list exists for the date: {0}'.format(prev_date.strftime('%Y-%m-%d')))
    else:
        logger.info('Manual black list exists for the date: {0}'.format(prev_date.strftime('%Y-%m-%d')))
        for code in bk_list['code']:
            ubound[(total_data.Code == int(code)).values] = 0.

    weights = build_portfolio(er,
                              builder='linear',
                              risk_exposure=risk_exposure,
                              lbound=lbound,
                              ubound=ubound,
                              risk_target=(risk_lbound, risk_ubound),
                              solver='GLPK')

    portfolio = pd.DataFrame({'weight': weights,
                              'industry': total_data['申万一级行业'].values,
                              'zz500': total_data['benchmark'].values,
                              'er': er}, index=total_data.Code)

    detail_info = {
        str(code): {'weight': w, 'industry': ind, 'zz500': bm_w, 'er': r}
        for code, w, bm_w, ind, r in zip(total_data.Code.values,
                                         weights,
                                         total_data['benchmark'].values,
                                         total_data['申万一级行业'].values,
                                         er)
    }
    portfolio_dict = {'Date': prev_date, 'portfolio': detail_info}

    client = pymongo.MongoClient('mongodb://10.63.6.176:27017')
    try:
        portfolio_collection = client.multifactor.portfolio
        # Replace any document already stored for this date.
        portfolio_collection.delete_many({'Date': prev_date})
        portfolio_collection.insert_one(portfolio_dict)
    finally:
        # Release the connection even when a write fails.
        client.close()

    portfolio.to_csv('~/mnt/sharespace/personal/licheng/portfolio/zz500/{0}.csv'.format(prev_date.strftime('%Y-%m-%d')),
                     encoding='gbk')

    return 0
# Fragment of a portfolio-construction routine; every input name used here
# is defined outside the visible source.

# Join the factor, risk and index-component frames on (Date, Code).
total_data = pd.merge(common_factors_df, prod_factors_df, on=['Date', 'Code'])
total_data = pd.merge(total_data, risk_factor_df, on=['Date', 'Code'])
total_data = pd.merge(total_data, index_components_df, on=['Date', 'Code'])

# Keep rows whose `index_components` column is non-zero, then divide that
# column by 100 (presumably percent -> fraction; confirm with the data source).
total_data = total_data[total_data[index_components] != 0]
total_data[index_components] = total_data[index_components] / 100.0

total_factors = common_factors + prod_factors
risk_factors_names = risk_factors_500 + ['Market']
# Constant column named 'Market', included among the risk factors.
total_data['Market'] = 1.

all_factors = total_data[total_factors]
risk_factors = total_data[risk_factors_names]

# Winsorize -> standardize -> neutralize against the risk factors.
factor_processed = neutralize(
    risk_factors.values,
    standardize(winsorize_normal(all_factors.values)))

normed_factor = pd.DataFrame(factor_processed,
                             columns=total_factors,
                             index=total_data.Date)

# Weighted combination of the processed factors.
er = normed_factor @ factor_weights

# portfolio construction
bm = total_data[index_components].values
lbound = 0.
# Upper bound is the `bm` value plus 0.01.
ubound = 0.01 + bm
lbound_exposure = -0.01
ubound_exposure = 0.01
risk_exposure = total_data[risk_factors_names].values