def sqlize_db_industry(subdb):
    """
    Convert one industry sub-database from per-day JSON files into SQLite.

    For each year that appears in the sub-database's trading days, connects
    to `<year>.db`, creates the table named after `subdb` if missing, and
    inserts one (sec_id, industry, date) row set per trading day.

    @subdb (str): name of the industry sub-database (also used as table name)
    """
    db_path = DB_PATH_LIB['industry']
    subdb_path = os.path.join(db_path, subdb)
    # Trading days are derived from the JSON file names (suffix stripped).
    trading_days = listdir_advanced(subdb_path, 'json', strip_suffix=True)

    with SqliteProxy(log=False) as proxy:
        for year, dates in classify_dates_by_year(trading_days).items():
            path = os.path.join(db_path, '{}.db'.format(year))
            proxy.connect(path)
            if subdb not in proxy.list_tables:
                create_db_03(proxy, subdb)

            for date in dates:
                js = json2dict(os.path.join(subdb_path, '{}.json'.format(date)))
                df = pd.DataFrame(list(js.items()),
                                  columns=['sec_id', 'industry'])
                df['date'] = date
                try:
                    # BUG FIX: the original wrote to the hard-coded table
                    # "A_SWL1" even though the guard above creates `subdb`;
                    # write to the table this function actually manages.
                    proxy.write_from_dataframe(df, subdb)
                except Exception:
                    # Best-effort: log and continue with the next day.
                    Logger.error(
                        "Error occurred when sqlizing {} on {}.".format(
                            subdb, date))
                    traceback.print_exc()
def update_schema(db_name, sub_name):
    """
    Refresh a sub-database's schema entry (begin date / end date /
    last update) and regenerate the human-readable schema.csv.

    Applies to databases other than factor_return (those use
    update_factor_return_schema instead).

    @db_name (str): database name, e.g. FACTOR (excluding factor_return)
    @sub_name (str): sub-database name within db, e.g. VALUE, GROWTH
    @raise ValueError: if sub_name is empty
    """
    # Explicit validation instead of `assert` (asserts vanish under `python -O`).
    if not sub_name:
        raise ValueError("sub_name must be a non-empty string")

    schema = json2dict(os.path.join(DB_PATH_LIB[db_name], 'schema'))
    date_list = get_date_lists_in_table(DB_PATH_LIB[db_name], sub_name)
    schema[sub_name]['begin date'] = date_list[0]
    schema[sub_name]['end date'] = date_list[-1]
    schema[sub_name]['last update'] = datetime.now().strftime(
        '%Y-%m-%d %H:%M:%S')
    Logger.info("schema updated: {}".format(sub_name))
    dict2json(schema, os.path.join(DB_PATH_LIB[db_name], 'schema'), log=False)

    # Dump a CSV snapshot of the whole schema for manual inspection.
    a = pd.DataFrame(schema).T
    col_names = [
        'aspect', 'type', 'begin date', 'end date', 'last update',
        'col_names', 'field', 'kwargs', 'explanation'
    ]
    b = a.reindex(columns=col_names).reset_index().rename(columns={
        'index': 'indicator'
    }).sort_values(['type', 'aspect', 'field'])
    b.to_csv(os.path.join(DB_PATH_LIB[db_name], 'schema.csv'), index=False)
def get_schema(db_name):
    """
    Load the schema information of a database.

    @db_name (str): database name
    @return dict of schema
    """
    schema_file = os.path.join(SCHEMA_PATH, "schema_{}.json".format(db_name))
    return json2dict(schema_file)
def get_schema(db_name):
    """
    Load the schema information of a database.

    @db_name (str): database name (key into SCHEMA_PATHS)
    @return dict of schema
    """
    return json2dict(SCHEMA_PATHS[db_name])
def mfs_by_score(date, windows_step1, num1):
    """
    Multi-factor stock selection by scoring: for the given date, rank stocks
    on their standardized factor values weighted by pre-computed factor
    weights, and return the top `num1` securities.

    @date <"%Y-%m-%d">: portfolio formation date
    @windows_step1 <int>: rolling window (in trading days) that was used to
        compute the factor weights; selects which weight file to load
    @num1 <int>: number of stocks to hold
    @return list of selected sec_ids (best-ranked first)
    """
    # Per-factor weight/direction info computed offline for this date.
    index_info = dk.json2dict(
        os.path.join(PROJECT_FILES_PATH, "factor_weight",
                     "cycle{}".format(str(windows_step1)),
                     "{}.json".format(date)))
    index_std_all = pd.DataFrame()
    # Effective-factor weights.
    weight = {key: index_info[key]['weight'] for key in index_info}
    # Monotonic direction of each factor (the 'ascending' flag in the file).
    monotony = {key: index_info[key]['ascending'] for key in index_info}
    # Inner-join the standardized values of every weighted factor on sec_id,
    # so only stocks covered by ALL factors survive.
    for index_std in weight:
        if len(index_std_all) == 0:
            index_std_all = get_secs_index_std(
                index_std, trading_days=[date])[['sec_id', index_std]]
        else:
            temp = get_secs_index_std(index_std, trading_days=[date
                                                               ])[['sec_id', index_std]]
            index_std_all = index_std_all.merge(temp, how='inner', on=['sec_id'])
    index_std_all = index_std_all.set_index(['sec_id'])
    # Split factors by direction so the best stock always gets the lowest
    # rank number. NOTE(review): per the original comments, 1.0 marks
    # "smaller is better" and 0.0 "larger is better" — derived from the
    # JSON 'ascending' field; confirm against the weight-file producer.
    ascending = [key for key in monotony if monotony[key] == 1.0]
    descending = [key for key in monotony if monotony[key] == 0.0]
    # "Larger is better" factors: biggest value gets rank 1.
    part1 = index_std_all[descending].rank(ascending=False)
    # "Smaller is better" factors: smallest value gets rank 1.
    part2 = index_std_all[ascending].rank(ascending=True)
    index_std_all1 = part1.merge(part2,
                                 how='inner',
                                 left_index=True,
                                 right_index=True)
    index_std_all2 = index_std_all1.copy()
    # Weight each factor's rank before summing into a composite score.
    for col in index_std_all1.columns:
        index_std_all2[col] = index_std_all2[col] * weight[col]
    # Lowest weighted-rank sum wins; keep the top `num1` names
    # (the original comment said "top 20", but the count is `num1`).
    selected_stocks = index_std_all2.sum(
        axis=1).sort_values().index[:num1].tolist()
    return selected_stocks
def derive_next_month_rr(sec_ids, start_date, end_date):
    """
    Get the monthly rate of return of the given securities over
    [start_date, end_date].

    @sec_ids <list>: securities to query
    @start_date <"%Y-%m-%d">: start date
    @end_date <"%Y-%m-%d">: end date
    @return DataFrame with columns [sec_id, yearmonth, month_ret, CLOSE],
        where CLOSE is the close on the month's first trading day
    """
    # Trading-day calendar. FIX: build the path with os.path.join components
    # instead of a raw Windows-backslash string, so it also works on POSIX.
    tds = dk.json2dict(
        os.path.join(DB_PATH, "datasets", "pre", "trading_days.json"))
    tds = pd.DataFrame(tds, columns=['date'])
    tds = tds[tds.date.between(start_date, end_date)]
    tds['group'] = tds['date'].apply(lambda x: x[:7])  # yearmonth key
    tds = tds.sort_values(['date'])
    # First and last trading day of each month.
    first = tds.groupby('group').head(1)
    last = tds.groupby('group').tail(1)
    # FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported equivalent.
    tds = pd.concat([first, last])
    tds = tds.sort_values('date')

    # Close prices on the month-start / month-end trading days.
    close = DataAPI.api.get_secs_indicator_on_multidays(
        indicator="CLOSE", trading_days=tds.date.tolist(), sec_ids=sec_ids)
    close = close.sort_values(['sec_id', 'date'])
    close['yearmonth'] = close['date'].apply(lambda x: x[:7])

    def cal_ret(x):
        # Monthly return = last close / first close - 1 within the group.
        close_id = x.columns.tolist().index('CLOSE')
        return x.iloc[-1, close_id] / x.iloc[0, close_id] - 1

    df_target = close.groupby(['sec_id', 'yearmonth'],
                              as_index=False).apply(lambda x: cal_ret(x))
    df_target = df_target.reset_index().rename(columns={0: 'month_ret'})
    # Keep only the month-start close for each (sec_id, yearmonth).
    close = close.groupby(['sec_id', 'yearmonth'],
                          as_index=False).apply(lambda x: x.head(1))
    df_target = df_target.merge(close, how='inner', on=['sec_id', 'yearmonth'])
    del df_target['date']
    return df_target
def update_factor_return_schema(factor):
    """
    Refresh the factor_return schema entry for one factor:
    begin date, end date and last update.

    @factor (str): name of the factor
    """
    schema_path = os.path.join(DB_PATH_LIB['factor_return'], 'schema')
    schema = json2dict(schema_path)

    csv_path = os.path.join(DB_PATH_LIB['factor_return'],
                            "{}.csv".format(factor))
    dates = pd.read_csv(csv_path, encoding="utf-8")["date"]

    entry = schema[factor]
    entry['begin date'] = dates.min()
    entry['end date'] = dates.max()
    entry['last update'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    Logger.info("schema updated: {}".format(factor))
    dict2json(schema, schema_path, log=False)
def get_secs_industry_from_json(industry_code, sec_ids=None):
    """
    Look up the industry classification of the given securities from the
    per-industry-code JSON file.

    @industry_code: industry classification code; selects the JSON file
        `<DB_INDUSTRY>/<industry_code>.json`
    @sec_ids (list): securities to look up; defaults to none (empty result)
    @return dict mapping sec_id -> industry for the ids present in the file
    """
    # FIX: the original used a mutable default argument (`sec_ids=[]`);
    # a None sentinel is the safe, behavior-compatible idiom.
    if sec_ids is None:
        sec_ids = []
    path = os.path.join(DB_INDUSTRY, '{}.json'.format(industry_code))
    info = json2dict(path, validate=True)
    return {sec: info[sec] for sec in sec_ids if sec in info}
def plot_LLJZ(yearmonth_st, yearmonth_end, model_name="XGBoost",
              label_kind="label_A", benchmark="hs300"):
    """
    Plot cumulative net value of the model's stock selection against the
    benchmark over [yearmonth_st, yearmonth_end], and save both the merged
    records CSV and the figure (LJJZ.png) under the backtest output folder.

    @yearmonth_st <"%Y-%m">: first year-month (inclusive)
    @yearmonth_end <"%Y-%m">: last year-month (inclusive)
    @model_name (str): backtest model folder name
    @label_kind (str): label sub-folder name
    @benchmark (str): benchmark sub-folder name

    NOTE(review): reads per-month record files
    `records_<yearmonth>.json` produced elsewhere; each entry is assumed to
    hold monthly 'benchmark' and 'model' returns — confirm with the writer.
    """
    file_root = os.path.join(DB_PATH, "backtest", model_name, label_kind,
                             benchmark)
    # Output folder named "LLJZ"; the variable is spelled file_LJJZ — kept
    # as-is to stay byte-identical, but the inconsistency is worth fixing.
    file_LJJZ = os.path.join(file_root, "LLJZ",
                             "{}--{}".format(yearmonth_st, yearmonth_end))
    if not os.path.exists(file_LJJZ):
        os.makedirs(file_LJJZ)
    # Enumerate every year-month between the two endpoints (string compare
    # works because of the zero-padded "%Y-%m" format).
    year_st_num = int(yearmonth_st[:4])
    year_end_num = int(yearmonth_end[:4])
    diff = year_end_num - year_st_num
    yearmonth_lst = [
        str(year_st_num + i) + '-{:0>2}'.format(str(1 + j))
        for i in range(0, diff + 1) for j in range(0, 12)
    ]
    yearmonth_lst = [
        yearmonth for yearmonth in yearmonth_lst
        if (yearmonth >= yearmonth_st) & (yearmonth <= yearmonth_end)
    ]
    # Merge all monthly record files into one dict keyed by date.
    records = {}
    for yearmonth in yearmonth_lst:
        records.update(
            dk.json2dict(
                os.path.join(file_root, "records_{}.json".format(yearmonth))))
    # Seed a synthetic start row one month before the first period so the
    # cumulative curve starts at 1.
    date_st = dk.date2char(
        dk.char2datetime(yearmonth_st + '-31') - dk.timedelta({'months': 1}))
    records[date_st] = {'benchmark': 1, 'model': 1}
    df_records = pd.DataFrame(records).T.reset_index().rename(
        columns={'index': 'yearmonth'})
    # index=0 is equivalent to index=False (no index column in the CSV).
    df_records.to_csv(os.path.join(file_LJJZ, "df_records.csv"), index=0)
    # NOTE(review): slice (not .copy()) of df_records — the .loc writes
    # below risk pandas' chained-assignment warning; confirm intent.
    df_records_daily = df_records[['yearmonth', 'benchmark', 'model']]
    # Build the cumulative net value for model and benchmark.
    # Assumes the integer index is 0..n-1 with row 0 being the seeded
    # start month — TODO confirm the dict ordering guarantees this.
    for ix in df_records_daily.index:
        if ix == 0:
            df_records_daily.loc[ix, 'model_rr_cum'] = 1
            df_records_daily.loc[ix, 'benchmark_rr_cum'] = 1
        else:
            df_records_daily.loc[ix, 'model_rr_cum'] = df_records_daily.loc[
                ix - 1,
                'model_rr_cum'] * (1 + df_records_daily.loc[ix, 'model'])
            df_records_daily.loc[ix, 'benchmark_rr_cum'] = df_records_daily.loc[
                ix - 1, 'benchmark_rr_cum'] * (
                    1 + df_records_daily.loc[ix, 'benchmark'])
    df_records_daily = df_records_daily[[
        'yearmonth', 'benchmark_rr_cum', 'model_rr_cum'
    ]].set_index('yearmonth')
    # df_records_daily .to_csv(os.path.join(back_test_path, "df_records_daily_{}.csv".format(dataID)))
    fig, ax = plt.subplots()
    fig.set_size_inches(20, 10)
    ax.plot(df_records_daily)
    ax.legend(df_records_daily.columns, fontsize=22)
    plt.title("基于测试集{}选股累计收益与{}累计净值比较图 {}--{}".format(model_name, benchmark,
                                                      yearmonth_st,
                                                      yearmonth_end),
              fontsize=22)
    plt.xlabel("日期", fontsize=22)
    plt.ylabel("累计净值", fontsize=22)
    plt.savefig(os.path.join(file_LJJZ, "LJJZ.png"))