def update_financial_data(self):
    """Refresh the financial-statement data when the stored copy looks stale.

    The newest column label of ``Data().roattm`` is taken as the latest report
    date on disk. Its month selects the last calendar month during which the
    stored data is still treated as current; otherwise ``update_all_basic()``
    is called.

    Raises:
        ValueError: if the latest stored report date falls in December.
    """
    latest_month = Data().roattm.columns[-1].month
    if latest_month == 12:
        raise ValueError

    # Latest report month -> last calendar month for which no refresh is needed.
    # NOTE(review): the September entry (4) is reproduced as-is; confirm it is intended.
    fresh_until = {3: 7, 6: 10, 9: 4}
    cutoff = fresh_until.get(latest_month)
    if cutoff is not None and datetime.today().month <= cutoff:
        print('无需更新财务数据')
        return

    try:
        update_all_basic()
    except Exception:
        # Best effort: a failed download is only reported, never raised.
        print('暂时链接不上聚源,再试')
def month_return_compare_to_market_index(stock_list, his_month):
    """Compare the stocks' return in one month with the market indexes.

    Args:
        stock_list: stock codes (row labels of ``Data().changepct_monthly``).
        his_month: object with ``year`` and ``month`` attributes selecting the month.

    Returns:
        tuple ``(res1, res2)``: ``res1`` holds the de-duplicated index returns of
        that month plus a row labelled '组合' with the mean stock return;
        ``res2`` holds the individual stock returns for that month.
    """
    def _in_target_month(stamp):
        return stamp.year == his_month.year and stamp.month == his_month.month

    monthly_pct = Data().changepct_monthly
    # First column of the monthly table that falls in the requested month.
    stock_col = next((c for c in monthly_pct.columns if _in_target_month(c)), None)
    res2 = monthly_pct.loc[stock_list, stock_col]

    index_dir = r'D:\pythoncode\IndexEnhancement\指数相关'
    index_price = pd.read_csv(os.path.join(index_dir, 'index_price_monthly.csv'), engine='python')
    index_price = index_price.set_index(index_price.columns[0])
    index_price.index = pd.to_datetime(index_price.index)

    # Simple period-over-period return of every index.
    previous = index_price.shift(1)
    index_r = (index_price - previous) / previous
    index_row = next((d for d in index_r.index if _in_target_month(d)), None)

    res1 = pd.DataFrame({index_row: index_r.loc[index_row, :].drop_duplicates()})
    portfolio_row = pd.DataFrame(data=res2.mean(), index=['组合'], columns=[stock_col])
    res1 = pd.concat([res1, portfolio_row], axis=0)
    return res1, res2
def name_to_code(names_list):
    """Translate security names into stock codes.

    Args:
        names_list: security names to look up in the ``SEC_NAME`` column of
            ``Data().stock_basic_inform``.

    Returns:
        list of the matching index labels (stock codes), in lookup order.
    """
    name_col = 'sec_name'.upper()
    code_col = 'code'.upper()

    lookup = Data().stock_basic_inform[[name_col]]
    lookup[code_col] = lookup.index  # keep the code before the index is replaced
    by_name = lookup.set_index(name_col)
    return list(by_name.loc[names_list, code_col])
def load_benchmark(self, benchmark_str):
    """Load the benchmark price and return series for a monthly back-test.

    Only acts when ``self.freq == 'M'``. Broad-market names are read from
    ``Data().index_price_monthly``; any other name is treated as an industry
    and read from ``Data().industry_price_monthly`` with the '(申万)' suffix.
    Sets ``self.benchmark_p`` and ``self.benchmark_r``.
    """
    data = Data()
    if self.freq != 'M':
        return

    if benchmark_str in ['WindA', 'HS300', 'SH50', 'ZZ500']:
        prices = data.index_price_monthly.loc[benchmark_str, :]
    else:
        prices = data.industry_price_monthly.loc[benchmark_str + '(申万)', :]

    self.benchmark_p = prices
    # Period-over-period simple return.
    self.benchmark_r = self.benchmark_p / self.benchmark_p.shift(1) - 1
def del_industry(stock_pool, to_del_indus):
    """Exclude every stock of the given SW level-1 industries from the pool.

    Args:
        stock_pool: boolean DataFrame indexed by stock code (one column per
            period); modified in place.
        to_del_indus: iterable of industry names to exclude.

    Returns:
        The same ``stock_pool`` object, with the rows of the excluded
        industries set to False in every column.
    """
    industry = Data().stock_basic_inform['申万一级行业']
    excluded_codes = industry.index[industry.isin(list(to_del_indus))]

    # Write through .loc on the frame itself: mutating the per-column Series
    # yielded by iteration is not guaranteed to reach the frame, and
    # DataFrame.iteritems no longer exists in pandas >= 2.0.
    hit = stock_pool.index.isin(excluded_codes)
    if hit.any():
        stock_pool.loc[hit, :] = False
    return stock_pool
def deal_marco_data():
    """Build the smoothed macro data set from the raw daily and monthly tables.

    Both raw tables are transposed, cleaned by their frequency-specific
    helper, mapped from calendar dates to trading dates, joined column-wise
    (daily first) and passed through ``macro_data_smooth_process``.
    """
    data = Data()
    monthly_raw = data.month_macro_data_raw
    daily_raw = data.daily_macro_data_raw

    daily_part = naturedate_to_tradeddate(deal_daily_macro_data(daily_raw.T), tar='index')
    monthly_part = naturedate_to_tradeddate(deal_month_macro_data(monthly_raw.T), tar='index')

    combined = pd.concat([daily_part, monthly_part], axis=1)
    return macro_data_smooth_process(combined)
def form_stock2_second_indus(panel_path, save_path):
    '''
    Convert per-stock monthly index weights into second-level industry weights.

    :param panel_path: directory holding the monthly data files
    :param save_path: directory where the result file is written
    :return: None; the table is saved as '二级行业权重.csv' under save_path
    '''
    data = Data()
    industry_by_period = data.reindex(data.secondindustryname)

    industry_weights = pd.DataFrame()
    for file_name in os.listdir(panel_path):
        period = file_name.split('.')[0]
        weights = pd.read_csv(os.path.join(panel_path, file_name), encoding='gbk', engine='python')
        weights = weights.set_index('wind_code')

        # Use the industry map of the same period; fall back to the last column.
        if period in industry_by_period.columns:
            source_col = period
        else:
            source_col = industry_by_period.columns[-1]
        weights['second_indus'] = industry_by_period[source_col]
        weights = weights.dropna(axis=0, how='any')

        # Rescale the remaining weights so that they sum to 100.
        weights['i_weight'] = 100 * weights['i_weight'] / np.sum(weights['i_weight'])

        per_industry = weights[['i_weight', 'second_indus']].groupby('second_indus').sum().T
        per_industry.index = [period]
        industry_weights = pd.concat([industry_weights, per_industry], axis=0)

    industry_weights = industry_weights.fillna(0)
    industry_weights.to_csv(os.path.join(save_path, '二级行业权重.csv'), encoding='gbk')
def update_f_data_from_wind():
    """Download quarterly financial items from Wind and append them to the store.

    For every item the stored table (attribute of ``Data()`` with the item's
    name) is extended when its newest column is more than 110 days old; a
    missing table is rebuilt starting from 2009-01-01. Results are saved
    through ``Data.save`` into the download directory.
    """
    path = r'D:\pythoncode\IndexEnhancement\barra_cne6\download_from_juyuan'
    w.start()
    data = Data()

    iterms = [
        'rd_exp',  # R&D expense
    ]
    codes_str = ','.join(data.stock_basic_inform.index)
    eds = datetime.today().strftime("%Y-%m-%d")

    for it in iterms:
        try:
            # Plain attribute lookup; no eval needed for a fixed item name.
            tmp_df = getattr(data, it)
            tds = tmp_df.columns[-1]
        except Exception:
            # No stored table yet: start from scratch.
            tmp_df = pd.DataFrame()
            tds = datetime(2009, 1, 1)

        if (datetime.today() - tds).days <= 110:
            continue  # stored data is recent enough

        res_tmp = w.wsd(codes_str, it, tds.strftime("%Y-%m-%d"), eds,
                        "unit=1;rptType=1;Period=Q;Days=Alldays", usedf=True)
        # usedf=True returns a pair; element 1 is the DataFrame.
        tmp_df = pd.concat([tmp_df, res_tmp[1].T], axis=1)
        data.save(tmp_df, it, save_path=path)
def rps_factor(rps_min=50, rps_max=100):
    """Return the boolean RPS condition table, lagged by one period.

    Args:
        rps_min: lower bound passed to ``float_2_bool_df`` as ``min_para``.
        rps_max: upper bound passed to ``float_2_bool_df`` as ``max_para``.

    Returns:
        DataFrame produced by ``float_2_bool_df``, shifted one column to the
        right (so a period only sees the previous period's value, for
        back-testing) with all-NaN columns removed.
    """
    rps = Data().RPS
    # In-place on purpose: the original filled the table returned by Data().
    rps.fillna(0, inplace=True)

    rps_cond = float_2_bool_df(rps, min_para=rps_min, max_para=rps_max)
    rps_cond = rps_cond.shift(1, axis=1)
    return rps_cond.dropna(axis=1, how='all')
def get_indus_wt_in_index(index, indus_level='second'):
    """Aggregate the constituent weights of an index by industry.

    Args:
        index: index identifier forwarded to ``get_stock_wt_in_index``.
        indus_level: 'first' or 'second', selecting the industry table.

    Returns:
        DataFrame indexed by industry name with one column per period,
        holding the summed constituent weight (0 where an industry is absent).

    Raises:
        ValueError: if ``indus_level`` is neither 'first' nor 'second'.
    """
    if indus_level not in ('first', 'second'):
        raise ValueError(
            "indus_level must be 'first' or 'second', got {!r}".format(indus_level))

    index_wt = get_stock_wt_in_index(index)
    data = Data()
    if indus_level == 'first':
        industry = data.firstindustryname
    else:
        industry = data.secondindustryname
    industry = data.reindex(industry)
    industry = industry.loc[index_wt.index, index_wt.columns]

    indus_wt = pd.DataFrame()
    for d in index_wt.columns:
        pair = pd.concat([index_wt.loc[:, d], industry.loc[:, d]], axis=1).dropna()
        pair.columns = ['wt', 'industry']
        summed = pair['wt'].groupby(pair['industry']).sum()
        column = pd.DataFrame(summed.values, index=summed.index, columns=[d])
        indus_wt = pd.concat([indus_wt, column], axis=1)

    return indus_wt.fillna(0)
def from_stock_wei_2_industry_wei(wei_df):
    """Sum stock weights into SW level-1 industry weights, period by period.

    Args:
        wei_df: DataFrame of stock weights (rows: stock codes, columns: periods).

    Returns:
        DataFrame indexed by industry name with one column per period.
    """
    industry = Data().stock_basic_inform[['申万一级行业']]
    wei_df = wei_df.fillna(0)

    res = pd.DataFrame()
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for col, se in wei_df.items():
        to_group = pd.concat([pd.DataFrame({col: se}), industry], axis=1)
        to_group = to_group.fillna(0)
        grouped = to_group.groupby('申万一级行业').sum()
        # Rows without an industry were filled with 0 above; drop that bucket.
        if 0 in grouped.index:
            grouped = grouped.drop(0, axis=0)
        res = pd.concat([res, grouped], axis=1)
    return res
def add_industry_infor():
    """Append the SW level-2 industry of every stock to the monthly factor panels."""
    # Scratch directory for temporary data; created on first use.
    scratch_dir = r'D:\pythoncode\IndexEnhancement\barra_cne6\tmp'
    if not os.path.exists(scratch_dir):
        os.makedirs(scratch_dir)

    industry_table = Data().industry_sw_2
    # add_to_panels takes a Series: use the first column of the table.
    industry_series = industry_table[industry_table.columns[0]]

    # Merge into the existing monthly panel files.
    panel_dir = r'D:\pythoncode\IndexEnhancement\因子预处理模块\因子'
    add_to_panels(industry_series, panel_dir, 'second_industry')
def _cum_excess_ret(daily_return, benchmark):
    """Return the cumulative excess return of ``daily_return`` over ``benchmark``.

    ``benchmark`` must carry a 'daily_return' column, or a 'net_value' column
    from which the daily return is derived (and stored on ``benchmark``).
    """
    if 'daily_return' not in benchmark.columns:
        if 'net_value' not in benchmark.columns:
            raise KeyError("benchmark data needs a 'daily_return' or 'net_value' column")
        # No daily returns given: derive them from the net-value series.
        benchmark['daily_return'] = benchmark['net_value'] / benchmark['net_value'].shift(1) - 1

    excess = (daily_return['daily_return'] - benchmark['daily_return']).dropna()
    return pd.DataFrame({'cum_excess_ret': (excess + 1).cumprod()})


def easy_bt(wei_stocks, basic_return_infor):
    """Run a simple daily back-test and optionally compare it to a benchmark.

    Args:
        wei_stocks: stock weight table passed to ``align`` / ``back_test``.
        basic_return_infor: benchmark information. A ``str`` is read as the
            path of a CSV with a 'date' column; a DataFrame is used directly
            (a derived 'daily_return' column is added to it when only
            'net_value' is present); anything else disables the comparison.

    Returns:
        tuple ``(daily_return, net_value, cum_excess_df)``; ``cum_excess_df``
        is None when no benchmark was supplied.

    Raises:
        KeyError: if the benchmark has neither a 'daily_return' nor a
            'net_value' column.
    """
    changepct_daily = Data().CHANGEPECT_OPEN_DAILY
    # Shift left one day so a date's weights earn the following day's change.
    changepct_daily = changepct_daily.shift(-1, axis=1)
    changepct_daily = changepct_daily.dropna(how='all', axis=1)
    changepct_daily = changepct_daily / 100  # percent -> fraction

    wei_stocks, changepct_daily = align(wei_stocks, changepct_daily)
    # fee_type: 'No_fee' ignores commission and stamp duty; 'fee_1' charges
    # both but no market-impact cost.
    daily_return, net_value = back_test(changepct_daily, wei_stocks, fee_type='fee_1')

    if isinstance(basic_return_infor, str):
        benchmark = pd.read_csv(basic_return_infor, engine='python')
        benchmark = benchmark.set_index('date')
        cum_excess_df = _cum_excess_ret(daily_return, benchmark)
    elif isinstance(basic_return_infor, pd.DataFrame):
        cum_excess_df = _cum_excess_ret(daily_return, basic_return_infor)
    else:
        cum_excess_df = None

    return daily_return, net_value, cum_excess_df
def new_record_stock(stock_wei, save_name='每期', save_path=None):
    """Write the names of the selected stocks of every period to a CSV file.

    Args:
        stock_wei: weight table (rows: stock codes, columns: periods); a stock
            counts as selected in a period when its weight is not equal to 0.
        save_name: prefix of the output file name.
        save_path: output directory; defaults to the project root.

    Returns:
        DataFrame with one column per period listing the selected names.
    """
    names = Data().all_stocks_code[['wind_code', 'sec_name']].set_index('wind_code')

    # Longest selection over all periods fixes the number of rows.
    maxl = np.sum(stock_wei != 0.0, axis=0).max()
    res_to_csv = pd.DataFrame(index=range(0, maxl))

    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for col, items in stock_wei.items():
        held = items[items != 0]
        picked = names.loc[held.index, :]
        column = pd.DataFrame(picked.values, columns=[col])
        res_to_csv = pd.concat([res_to_csv, column], axis=1)

    if not save_path:
        save_path = r'D:\pythoncode\IndexEnhancement'
    res_to_csv.to_csv(os.path.join(save_path, save_name + '选股结果.csv'), encoding='gbk')
    return res_to_csv
def select_import_wei(wei_se, n_max, abs_max_ratio=0.9, max_in_indus=2):
    '''
    Pick the weights that must survive before the stock-count optimisation.

    Kept are the ``max_in_indus`` largest weights of every SW level-1
    industry, plus the largest remaining weights until
    ``int(n_max * abs_max_ratio)`` stocks are kept in total.

    :param wei_se: Series of first-round weights indexed by stock code
    :param n_max: maximum number of stocks allowed in the portfolio
    :param abs_max_ratio: share of n_max filled by this pre-selection
    :param max_in_indus: number of top stocks kept per industry
    :return: (res_wei, n_left, tobe_opt) - kept weights, the number of kept
             positive weights, and the codes with weight > 0.001 that were
             not kept and still need optimising
    '''
    indus_map = Data().stock_basic_inform.loc[wei_se.index, '申万一级行业']

    # Explicit float dtype: an empty Series without dtype is object in pandas 2.
    res_wei = pd.Series(np.nan, index=wei_se.index, dtype=float)

    wei_se = wei_se.sort_index()
    dat_df = pd.DataFrame({'wei': wei_se, 'industry': indus_map})
    for _, members in dat_df.groupby('industry'):
        top = members['wei'].sort_values(ascending=False).iloc[:max_in_indus]
        res_wei.loc[top.index] = top

    num_0 = int(np.sum(res_wei > 0))
    # Never negative: a negative slice end would select almost every stock
    # instead of none when the per-industry picks already exceed the quota.
    num_1 = max(int(n_max * abs_max_ratio) - num_0, 0)

    already_kept = set(res_wei.dropna().index)
    # List (not set) keeps the order deterministic when weights tie.
    remaining = [code for code in wei_se.index if code not in already_kept]
    extra = wei_se.loc[remaining].sort_values(ascending=False).iloc[:num_1]
    res_wei.loc[extra.index] = extra

    res_wei = res_wei.dropna()
    n_left = np.sum(res_wei > 0)
    if n_left > n_max:
        print('留下的股票过多,重新选择')
        input("暂时挂起.... ")

    candidates = wei_se[wei_se > 0.001]
    tobe_opt = [i for i in candidates.index if i not in res_wei.index]
    return res_wei, n_left, tobe_opt
def keep_industry(stock_pool, to_keep_indus_list, industry_type='sw'):
    """Restrict the pool to stocks of the given level-1 industries.

    Args:
        stock_pool: boolean DataFrame indexed by stock code (one column per
            period); modified in place.
        to_keep_indus_list: industry names to keep. An empty list leaves the
            pool unchanged.
        industry_type: 'sw' for the '申万一级行业' column, 'zx' for the
            '中信一级行业' column.

    Returns:
        The same ``stock_pool`` object; rows of stocks outside all listed
        industries are set to False in every column.

    Raises:
        ValueError: if ``industry_type`` is neither 'sw' nor 'zx'.
    """
    industry_columns = {'sw': '申万一级行业', 'zx': '中信一级行业'}
    if industry_type not in industry_columns:
        raise ValueError(
            "industry_type must be 'sw' or 'zx', got {!r}".format(industry_type))

    wanted = list(to_keep_indus_list)
    if not wanted:
        return stock_pool

    industry = Data().stock_basic_inform[industry_columns[industry_type]]
    # Keep the union of the listed industries. Clearing "everything outside
    # industry X" once per industry wipes the whole pool when more than one
    # industry is listed.
    keep_codes = industry.index[industry.isin(wanted)]
    outside = ~stock_pool.index.isin(keep_codes)
    if outside.any():
        stock_pool.loc[outside, :] = False
    return stock_pool
def section_stock_num(industry_name):
    """Count, per factor file, how many stocks of one industry are present.

    Args:
        industry_name: value matched against the first column of
            ``Data().industry_sw_1``.

    Returns:
        Series of row counts indexed by the date parsed from each file name
        found under ``origin_factor_path``.
    """
    sw_1 = Data().industry_sw_1
    in_industry = sw_1[sw_1.columns[0]] == industry_name
    stock_codes = list(sw_1.index[in_industry])

    dates = []
    counts = []
    for file_name in os.listdir(origin_factor_path):
        panel = pd.read_csv(os.path.join(origin_factor_path, file_name), encoding='gbk')
        panel = panel.set_index('code')
        present = [code for code in stock_codes if code in panel.index]
        subset = panel.loc[present, :]
        dates.append(pd.to_datetime(file_name.split('.')[0]))
        counts.append(len(subset))

    return pd.Series(counts, index=dates)
def update_industry_data():
    """Refresh the monthly industry price file from Wind.

    When a stored table can be loaded, the download starts 90 days before its
    last index label and the downloaded rows replace stored rows with the same
    index label; otherwise everything since 2006-01-01 is downloaded. The
    result is written to 'industry_price_monthly.csv'.
    """
    w.start()
    data = Data()
    index_path = r'D:\pythoncode\IndexEnhancement\指数相关'

    try:
        indus_p = data.industry_price_monthly
        # assumes the stored table is indexed by date - TODO confirm
        st = indus_p.index[-1] - timedelta(90)
    except Exception:
        # Nothing usable on disk: rebuild from scratch.
        indus_p = pd.DataFrame()
        st = datetime(2006, 1, 1)
    ed = datetime.today() - timedelta(1)

    targets_str = ','.join(code_name_map_sw.keys())
    res = w.wsd(targets_str, "close", st.strftime("%Y-%m-%d"), ed.strftime("%Y-%m-%d"),
                "Period=M", usedf=True)
    res = res[1]  # usedf=True returns a pair; element 1 is the DataFrame
    res.index = pd.to_datetime(res.index)
    res = res.rename(code_name_map_sw, axis=1)

    if not indus_p.empty:
        # Compare index labels (iterating the frame itself yields column
        # names), so re-downloaded dates replace the stored rows instead of
        # being duplicated.
        overlap = indus_p.index.isin(res.index)
        res = pd.concat([indus_p.loc[~overlap], res], axis=0)

    res.to_csv(os.path.join(index_path, 'industry_price_monthly.csv'), encoding='gbk')
def select_stocks_by_scores(stock_pool, factors, factor_weight, reversed_factors, icir_e,
                            max_num=100, each_industry_num=5, select_type='by_industry',
                            wei_type='equal'):
    """Narrow the stock pool of every period by factor score.

    For each column of ``stock_pool`` the codes flagged True are passed to
    ``select_stocks_by_scores_singal_section`` together with the scoring
    settings; the codes it returns form that period's column of the result.

    Returns:
        Boolean DataFrame (rows: selected codes, columns: periods); periods
        for which nothing was selected are omitted.
    """
    industry_sw = Data().industry_sw_1

    new_stock_pool = pd.DataFrame()
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for col, value in stock_pool.items():
        codes = list(value[value == True].index)
        codes_selected = select_stocks_by_scores_singal_section(
            codes, col, factors, factor_weight, reversed_factors, industry_sw,
            max_num, icir_e, each_industry_num,
            select_type=select_type, wei_type=wei_type)
        if codes_selected:
            flags = pd.DataFrame(np.full(len(codes_selected), True),
                                 index=codes_selected, columns=[col])
            new_stock_pool = pd.concat([new_stock_pool, flags], axis=1)

    # Codes missing in a period were not selected there.
    new_stock_pool.fillna(False, inplace=True)
    return new_stock_pool
def linear_programming(data_dict, industry_neutralized=False, mv_neutralized=False, equal_weighted=False):
    """
    Solve the optimal portfolio weights by linear programming, period by period.

    ``data_dict`` must provide 'est_stock_rets', 'limit_fac_data' and
    'index_wt'. The three boolean flags are accepted but not read by this
    function. Returns a DataFrame of weights (one column per period) in which
    zero weights are replaced by NaN.
    """
    est_stock_rets = data_dict['est_stock_rets']
    limit_fac_data = data_dict['limit_fac_data']
    index_wt = data_dict['index_wt']

    stock_wt = pd.DataFrame()
    industry_sw = Data().stock_basic_inform[['申万一级行业']]

    for date in est_stock_rets.columns:
        est_rets = est_stock_rets[[date]].dropna()
        est_rets.columns = ['rets']

        benchmark_wt = index_wt[[date]].dropna()
        benchmark_wt.columns = ['benchmark_wt']

        limit_fac_panel = limit_fac_data[date].dropna()

        cur_wt = lp_solve(date, est_rets, limit_fac_panel, benchmark_wt, industry_sw)
        cur_wt.name = date
        stock_wt = pd.concat([stock_wt, cur_wt], axis=1)

    # Blank out zero weights.
    return stock_wt.mask(stock_wt == 0)
def update_index_wei():
    """Extend the stored HS300 and ZZ500 constituent weights to recent month ends.

    Columns that are not month ends are dropped first. Every month end later
    than the last ZZ500 column is then downloaded from Wind for both indexes,
    and both tables are saved.
    """
    w.start()
    data = Data()
    zz500_wt = data.zz500_wt
    hs300_wt = data.hs300_wt
    mes = generate_months_ends()

    def _month_ends_only(frame):
        # Drop columns whose label is not a month end.
        stray = [c for c in frame.columns if c not in mes]
        if len(stray) > 0:
            return frame.drop(stray, axis=1)
        return frame

    def _download(month, options):
        # One column of constituent weights for the given month end.
        res = w.wset("indexconstituent", options, usedf=True)
        res = res[1]
        res.set_index('wind_code', inplace=True)
        return pd.DataFrame({month: res['i_weight']})

    zz500_wt = _month_ends_only(zz500_wt)
    hs300_wt = _month_ends_only(hs300_wt)

    last_stored = zz500_wt.columns[-1]
    for m in (month for month in mes if month > last_stored):
        m_str = m.strftime("%Y-%m-%d")
        # CSI 300
        hs300_wt = pd.concat(
            [hs300_wt, _download(m, "date=" + m_str + ";windcode=000300.SH")], axis=1)
        # CSI 500
        zz500_wt = pd.concat(
            [zz500_wt, _download(m, "date=" + m_str + ";windcode=000905.SH")], axis=1)

    data.save(hs300_wt, 'hs300_wt', save_path=r'D:\pythoncode\IndexEnhancement\指数相关')
    data.save(zz500_wt, 'zz500_wt', save_path=r'D:\pythoncode\IndexEnhancement\指数相关')
def signal_factor_pool(factor_range, indus_dict, factor_name, top_or_bottom, per):
    """Build a stock pool from the top or bottom share of one factor.

    Args:
        factor_range: 'one_industry' restricts every cross-section to the
            industry named by ``indus_dict['to_handle_indus'][0]``; any other
            value uses all stocks.
        indus_dict: only read when ``factor_range == 'one_industry'``.
        factor_name: factor column to rank by (descending).
        top_or_bottom: 'top' or 'bottom'.
        per: fraction of each cross-section to select.

    Returns:
        Boolean DataFrame (rows: stock codes, columns: dates parsed from the
        file names) that is True for selected stocks.

    Raises:
        ValueError: if ``top_or_bottom`` is neither 'top' nor 'bottom'.
    """
    if top_or_bottom not in ('top', 'bottom'):
        raise ValueError(
            "top_or_bottom must be 'top' or 'bottom', got {!r}".format(top_or_bottom))

    one_industry = factor_range == 'one_industry'
    if one_industry:
        sw_1 = Data().industry_sw_1
        target = indus_dict['to_handle_indus'][0]
        industry_codes = list(sw_1.index[sw_1[sw_1.columns[0]] == target])

    factor_path = r'D:\pythoncode\IndexEnhancement\因子预处理模块\因子'
    res_df = pd.DataFrame()
    for f in os.listdir(factor_path):
        panel = pd.read_csv(os.path.join(factor_path, f), engine='python', encoding='gbk')
        if factor_name not in panel.columns:
            print('在{}数据中未找到{}因子'.format(f, factor_name))
            continue  # nothing to rank in this file

        panel = panel[['code', 'name', factor_name]].set_index('code')
        panel = panel.dropna(axis=0, how='any')
        if one_industry:
            panel = panel.loc[[c for c in industry_codes if c in panel.index], :]

        # Select exactly `le` stocks on either side. The previous slices took
        # le - 1 from the top and le + 1 from the bottom, and nearly the whole
        # cross-section when le was 0.
        le = int(len(panel) * per)
        ranked = panel.sort_values(by=factor_name, ascending=False)
        if top_or_bottom == 'top':
            r = ranked.index[:le]
        else:
            r = ranked.index[-le:] if le > 0 else ranked.index[:0]

        column = pd.DataFrame([True] * len(r), index=r.values,
                              columns=[datetime.strptime(f.split('.')[0], "%Y-%m-%d")])
        res_df = pd.concat([res_df, column], axis=1)

    res_df.fillna(False, inplace=True)
    return res_df
def financial_condition_pool(selection_dict, start_date, end_date):
    """Build a boolean stock pool from per-plate financial conditions.

    Args:
        selection_dict: ``{plate_name: {condition_type: spec}}``. ``plate_name``
            is 'all' or a value of the '中信一级行业' column. A
            ``condition_type`` starting with 'scope_' expects
            ``spec = (factor_name, min, max)``; one starting with 'rise_'
            expects ``spec = (factor_name, value)``. ``factor_name`` must be
            the name of one of the factor tables loaded below.
        start_date, end_date: forwarded to ``select_stocks``.

    Returns:
        The combined condition table after ST names are removed and after
        ``adjust_months`` and ``append_df`` were applied.
    """
    data = Data()
    stock_basic = data.stock_basic_inform
    first_industry = stock_basic['中信一级行业']
    sec_names = stock_basic['sec_name'.upper()]

    # These tables look unused but are resolved BY NAME through eval() below;
    # do not rename or remove them.
    roettm = data.roettm                        # ROE (TTM)
    netprofitgrowrate = data.netprofitgrowrate  # net profit growth, year over year
    basicepsyoy = data.basicepsyoy              # basic EPS growth, year over year
    grossincome = data.grossincomeratiottm      # gross margin (TTM)
    debtassetsratio = data.debtassetsratio      # debt-to-assets ratio
    pe = data.pe                                # valuation

    cond_total = pd.DataFrame()
    for plate_name, conditions_dict in selection_dict.items():
        con_plate = None
        if plate_name == 'all':
            codes_in_industry = list(first_industry.index)
        else:
            # Compare industry values; indexing the one-column frame with a
            # stock code looked up a column of that name and raised KeyError.
            codes_in_industry = list(first_industry.index[first_industry == plate_name])

        for conditions_type, tuples in conditions_dict.items():
            kind = conditions_type.split('_')[0]
            if kind not in ('scope', 'rise'):
                continue

            # NOTE(security): eval() builds the call from selection_dict
            # contents; only pass trusted configuration to this function.
            res = eval('select_stocks(' + tuples[0] + ', codes_in_industry, start_date, end_date)')
            if kind == 'scope':
                res_con = eval('scopy_condition' + '(res, minV=' + str(tuples[1])
                               + ', maxV= ' + str(tuples[2]) + ')')
            else:
                res_con = eval('rise_condition' + '(res,' + str(tuples[1]) + ')')

            # All conditions of one plate must hold together.
            if isinstance(con_plate, pd.DataFrame):
                con_plate = con_plate & res_con
            else:
                con_plate = res_con

        # Plates are placed side by side.
        cond_total = pd.concat([cond_total, con_plate], axis=1)

    # Remove ST stocks (a filter on listing age used to sit here and is disabled).
    st_codes = sec_names.index[sec_names.astype(str).str.contains('ST', regex=False)]
    is_st = cond_total.index.isin(st_codes)
    if is_st.any():
        cond_total.loc[is_st, :] = False

    # Move to announcement dates, then expand the monthly data.
    cond_total = adjust_months(cond_total)
    cond_total = append_df(cond_total)
    return cond_total
def code_to_name(code_list):
    """Translate stock codes into security names.

    Args:
        code_list: index labels of ``Data().stock_basic_inform``.

    Returns:
        list of the corresponding ``SEC_NAME`` values, in lookup order.
    """
    name_col = 'sec_name'.upper()
    names = Data().stock_basic_inform[[name_col]]
    return list(names.loc[code_list, name_col])
def optimization_fun(ret, e, bench_wei, pre_w=None, is_enhance=True, lamda=10, c=0.015, turnover=None, te=None,
                     industry_max_expose=0, risk_factor_dict=None, limit_factor_df=None, in_benchmark=True,
                     in_benchmark_wei=0.8, max_num=None):
    """Solve the portfolio weights for one period.

    Args:
        ret: Series of expected returns indexed by stock code.
        e: risk matrix (DataFrame, stock code x stock code).
        bench_wei: Series of benchmark weights indexed by stock code.
        pre_w: previous-period weights (Series) or None.
        is_enhance: accepted but not read by this function.
        lamda: multiplier applied to the risk matrix.
        c: forwarded to ``generates_problem``.
        turnover: turnover setting forwarded to the constraints, or None.
        te: forwarded to the constraints and to ``generates_problem``.
        industry_max_expose: industry exposure setting for the constraints.
        risk_factor_dict: risk-factor settings for the constraints
            (``None`` means no risk-factor settings).
        limit_factor_df: forwarded to the constraints.
        in_benchmark: if True, only stocks with a benchmark weight are used.
        in_benchmark_wei: forwarded to the constraints.
        max_num: maximum number of stocks; None skips the second round.

    Returns:
        Series of first-round weights indexed by stock code.

    Raises:
        ValueError: if no solution is available after the relaxation attempts.
    """
    if risk_factor_dict is None:
        risk_factor_dict = {}

    if in_benchmark:
        # Only constituents may be chosen: cut every input down to them.
        wei_tmp = bench_wei.dropna()
        bug_maybe = [i for i in wei_tmp.index if i not in e.index]
        if len(bug_maybe) > 0:
            print('存在下列股票不在组合里,请检查')
            print(bug_maybe)
        e_tmp = e.loc[wei_tmp.index, wei_tmp.index].fillna(0)
        ret_tmp = ret[wei_tmp.index].fillna(0)
        if pre_w is not None:
            # reindex: stocks that were not held before get weight 0.
            pre_w = pre_w.reindex(wei_tmp.index).fillna(0)
    else:
        # Give the main inputs one common index.
        n_index = [i for i in e.index if i in ret.index]
        e_tmp = e.loc[n_index, n_index]
        ret_tmp = ret[n_index]
        wei_tmp = bench_wei.reindex(n_index).fillna(0)
        if isinstance(pre_w, pd.Series):
            known = set(n_index)
            dropped = [i for i in pre_w.index if i not in known]
            if np.any(pre_w[dropped] > 0.001):
                input('input:存在部分有权重的股票在上期,而不再当期的数据中,请检查')
            pre_w = pre_w.reindex(n_index).fillna(0)

    # 1 where the stock has a positive benchmark weight.
    is_in_bench = deepcopy(wei_tmp)
    is_in_bench[is_in_bench > 0] = 1

    data = Data()
    industry_sw = data.stock_basic_inform[['申万一级行业']]

    # Industry dummy variables of the candidate stocks.
    industry_map = industry_sw.loc[ret_tmp.index, :].fillna('综合')
    industry_col = industry_map.columns[0]
    dummies = pd.get_dummies(industry_map[industry_col])

    # Benchmark weight of every industry.
    ind_wei = np.dot(dummies.T, wei_tmp)
    ind_wei_su = pd.Series(ind_wei, index=dummies.columns)

    # A stock may take at most 3/4 of its industry's benchmark weight;
    # 0.02 where no industry weight is available.
    per_stock_cap = 0.75 * industry_map[industry_col].map(ind_wei_su)
    industry_map['max_wei'] = per_stock_cap.fillna(0.02).astype(float)
    max_wei = industry_map['max_wei'].values

    x = cp.Variable(len(ret_tmp), nonneg=True)
    q = ret_tmp.values
    P = lamda * e_tmp.values
    dum = dummies.T.values

    para_dict = {
        'x': x,
        'max_wei': max_wei,
        'in_benchmark_wei': in_benchmark_wei,
        'is_in_bench': is_in_bench,
        'ret_e': ret_tmp,
        'dum': dum,
        'wei_tmp': wei_tmp,
        'ind_wei': ind_wei,
        'risk_factor_dict': risk_factor_dict,
        'limit_factor_df': limit_factor_df,
        'pre_w': pre_w,
        'P': P,
        'total_wei': 1,
    }
    con_dict = {
        'in_benchmark': in_benchmark,
        'industry_max_expose': industry_max_expose,
        'turnover': turnover,
        'te': te,
    }

    constraints = generates_constraints(para_dict, con_dict)
    prob = generates_problem(q, x, P, c, pre_w, constraints, te)

    print('开始优化...')
    time_start = time.time()
    prob.solve()
    status = prob.status

    # No optimum: relax one family of constraints per attempt and retry.
    iters = 0
    while status != 'optimal' and iters < 3:
        if len(risk_factor_dict) > 0 and iters == 0:
            relaxed = deepcopy(risk_factor_dict)
            for k, v in relaxed.items():
                relaxed[k] = v + 0.5
            para_dict['risk_factor_dict'] = relaxed
        elif turnover is not None and iters == 1:
            # Only relax a turnover setting that exists (None + 0.2 would raise).
            turnover = turnover + 0.2
            con_dict['turnover'] = turnover
        elif iters == 2:
            industry_max_expose = industry_max_expose + 0.05
            con_dict['industry_max_expose'] = industry_max_expose

        iters = iters + 1
        constraints = generates_constraints(para_dict, con_dict)
        prob = generates_problem(q, x, P, c, pre_w, constraints, te)
        print('第{}次优化'.format(iters))
        prob.solve()
        status = prob.status

    time_end = time.time()
    print('优化结束,用时', time_end - time_start)
    print('优化结果为{}'.format(status))

    if x.value is None:
        raise ValueError('optimization returned no solution (status: {})'.format(status))

    wei_ar = np.array(x.value).flatten()
    wei_se = pd.Series(wei_ar, index=ret_tmp.index)

    # A second round is only needed when too many stocks carry weight.
    if max_num is not None and np.sum(wei_ar > 0.001) > max_num:
        print('进行第二轮股票数量的优化')
        tobe_opt = list(wei_se[wei_se > 0.001].index)
        print('第二次优化为从{}支股票中优化选择出{}支'.format(len(tobe_opt), max_num))

        # Working on the weighted stocks only keeps the mixed-integer problem small.
        e_tmp2 = e_tmp.loc[tobe_opt, tobe_opt]
        ret_tmp2 = ret_tmp[tobe_opt]
        is_in_bench2 = is_in_bench[tobe_opt]
        dummies2 = pd.get_dummies(industry_map.loc[tobe_opt, industry_col])
        dum2 = dummies2.T.values

        # Industry targets are rescaled over the industries that remain.
        new_ind = ind_wei_su[dummies2.columns]
        new_ind = new_ind / new_ind.sum()
        ind_wei2 = new_ind.values

        # Per-stock caps are tripled: with fewer stocks the first-round caps
        # may be too tight to fill an industry.
        max_wei2 = 3 * industry_map.loc[tobe_opt, 'max_wei'].values
        total_wei = 1
        if pre_w is not None:
            pre_w = pre_w[tobe_opt]
        P2 = lamda * e_tmp2.values

        x = cp.Variable(len(ret_tmp2), nonneg=True)
        y = cp.Variable(len(ret_tmp2), boolean=True)

        # NOTE(review): 'wei_tmp' and 'P' still carry the first-round,
        # full-size inputs although x now covers only `tobe_opt`; confirm
        # against generates_constraints whether subsets are required here.
        para_dict2 = {
            'x': x,
            'y': y,
            'y_sum': max_num,
            'max_wei': max_wei2,
            'in_benchmark_wei': in_benchmark_wei,
            'is_in_bench': is_in_bench2,
            'ret_e': ret_tmp2,
            'dum': dum2,
            'wei_tmp': wei_tmp,
            'ind_wei': ind_wei2,
            'risk_factor_dict': risk_factor_dict,
            'limit_factor_df': limit_factor_df,
            'pre_w': pre_w,
            'P': P,
            'total_wei': total_wei
        }
        con_dict2 = {
            'in_benchmark': in_benchmark,
            'industry_max_expose': industry_max_expose,
            'turnover': turnover,
            'te': te,
        }

        q2 = ret_tmp2.values
        cons = generates_constraints(para_dict2, con_dict2)
        # '@' is the matrix product; '*' between arrays and cvxpy expressions
        # is deprecated for that purpose.
        prob = cp.Problem(cp.Maximize(q2.T @ x - cp.quad_form(x, P2)), cons)
        prob.solve(solver=cp.ECOS_BB, feastol=1e-10)
        print(prob.status)
        if prob.status != 'optimal':
            input('input:二次股票数量优化时,未得出最优解,请检查')
        # NOTE(review): the second-round solution is not copied into the
        # return value; the first-round weights are returned as before.

    return wei_se
def load_pct(self):
    """Load the price-change table that matches ``self.freq``.

    For 'M' the monthly percentage changes are shifted one column to the left
    and stored in ``self.changePCT_np``. 'D' is not implemented and only
    prints a notice. Any other frequency does nothing.
    """
    freq = self.freq
    if freq == 'M':
        # Monthly percentage change; each column then holds the next period's value.
        monthly_pct = Data().changepct_monthly
        self.changePCT_np = monthly_pct.shift(-1, axis=1)
    elif freq == 'D':
        print('未实现')
def get_firt_industry_list():
    """Return the distinct SW level-1 industry names.

    Returns:
        list of the distinct values of the '申万一级行业' column, in order of
        first appearance. A ``set`` would give the same members but in an
        order that changes between interpreter runs.
    """
    industries = Data().stock_basic_inform['申万一级行业'].values
    # dict.fromkeys de-duplicates like set() but keeps first-seen order.
    return list(dict.fromkeys(industries))
def main(p_dict, fp, is_ind_neu, is_size_neu, is_plate_neu, special_plate=None, selection=None):
    """
    Pre-process one cross-section file of raw factor data.

    Steps, in order: missing-value filling, winsorising, neutralising,
    standardising. Financial factors in the input are taken to be aligned to
    report dates already, so no date alignment is done here.

    Args:
        p_dict: dict with 'file_path' (input directory) and 'save_path'
            (output directory).
        fp: file name of the cross-section (e.g. 2009-01-23.csv); the result
            is written under the same name.
        is_ind_neu: neutralise by industry (needed for stock-level factors,
            not for industry-level factors).
        is_size_neu: neutralise by size.
        is_plate_neu: accepted but not read by this function.
        special_plate: if given, only stocks of this SW level-1 industry are kept.
        selection: accepted but not read by this function.

    Raises:
        Exception: whatever ``pandas.read_csv`` raises when the input cannot
            be read (re-raised after a 'debug' notice).
    """
    file_path = p_dict['file_path']
    save_path = p_dict['save_path']

    # Read the raw cross-section.
    try:
        data = pd.read_csv(os.path.join(file_path, fp), engine='python', encoding='gbk')
    except Exception:
        print('debug')
        # Re-raise: continuing would only fail later on the unbound `data`.
        raise

    if 'No' in data.columns:
        data = data.set_index('No')

    # When a plate is requested, drop the stocks of every other plate.
    if special_plate:
        sw_1 = Data().stock_basic_inform['申万一级行业']
        plate_codes = set(sw_1.index[sw_1 == special_plate])
        data = data.loc[data['Code'].isin(plate_codes), :]
        data.index = range(0, len(data))

    # Split into the columns to process and the columns passed through.
    data_to_process, data_unchanged = get_factor_data(data)

    data_to_process = fill_na(data_to_process)      # fill missing values
    if len(data_to_process) == 0:
        print('debug')
    data_to_process = winsorize(data_to_process)    # clip outliers
    if is_ind_neu or is_size_neu:
        data_to_process = neutralize(data_to_process,
                                     ind_neu=is_ind_neu,
                                     size_neu=is_size_neu)  # neutralise
    data_to_process = standardize(data_to_process)  # standardise

    # Put processed and untouched columns back together.
    if len(data_unchanged) > 0:
        data_final = pd.concat([data_to_process, data_unchanged.loc[data_to_process.index]], axis=1)
    else:
        data_final = data_to_process

    if data_final.index.name != 'No':
        data_final.index = range(1, len(data_final) + 1)
        data_final.index.name = 'No'

    data_final.to_csv(os.path.join(save_path, fp), encoding='gbk')