def summary_release_2_docx(title, img_meta_dic_list, stg_run_id=None, enable_clean_cache=True):
    """
    Generate the "prediction success-rate trend" report as a .docx file.

    :param title: report title; also used as the output file-name stem
    :param img_meta_dic_list: list of dicts, one per report section; each dict
        is read for keys 'trade_date_last_train', 'trade_date_end',
        'split_point_list', 'module_file_path', 'predict_test_random_state',
        'in_range_count', 'img_file_path'
    :param stg_run_id: forwarded to get_report_folder_path() to choose the output folder
    :param enable_clean_cache: when True, clean_cache() is called after saving
    :return: full path of the saved .docx file
    """
    logger.debug('生成报告开始')
    # Build the docx document
    document = docx.Document()
    # Set the default font; the east-Asian font must additionally be set on the
    # underlying XML element, python-docx does not expose it directly.
    document.styles['Normal'].font.name = '微软雅黑'
    document.styles['Normal']._element.rPr.rFonts.set(docx.oxml.ns.qn('w:eastAsia'), '微软雅黑')
    # Create a custom paragraph style (1st arg: style name; 2nd arg: style type
    # — 1 paragraph style, 2 character style, 3 table style)
    UserStyle1 = document.styles.add_style('UserStyle1', 1)
    # font size
    UserStyle1.font.size = docx.shared.Pt(40)
    # font color
    UserStyle1.font.color.rgb = docx.shared.RGBColor(0xff, 0xde, 0x00)
    # center the text
    UserStyle1.paragraph_format.alignment = docx.enum.text.WD_ALIGN_PARAGRAPH.CENTER
    # Chinese (east-Asian) font for the custom style as well
    UserStyle1.font.name = '微软雅黑'
    UserStyle1._element.rPr.rFonts.set(docx.oxml.ns.qn('w:eastAsia'), '微软雅黑')
    # Document content: centered title, two blank paragraphs, then one numbered
    # section per entry of img_meta_dic_list.
    document.add_heading(title, 0).alignment = docx.enum.text.WD_ALIGN_PARAGRAPH.CENTER
    document.add_paragraph('')
    document.add_paragraph('')
    heading_size = 1
    for num, info_dic in enumerate(img_meta_dic_list, start=1):
        trade_date_last_train = info_dic['trade_date_last_train']
        trade_date_end = info_dic['trade_date_end']
        document.add_heading(
            f"{num}、{date_2_str(trade_date_last_train)} - {date_2_str(trade_date_end)}", heading_size)
        split_point_list = info_dic['split_point_list']
        if split_point_list is None:
            # No split points: the whole [last_train, end] range is one segment
            p = document.add_paragraph(f"{num}.1) 日期区间段1个:\n")
            p.add_run(f'\t1) {date_2_str(trade_date_last_train)} ~ {date_2_str(trade_date_end)}\n')
        else:
            # N split points delimit N-1 consecutive date segments
            p = document.add_paragraph(f"{num}.1) 日期区间段{len(split_point_list) - 1}个:\n")
            for num2, (point1, point2) in enumerate(
                    iter_2_range(split_point_list, has_left_outer=False, has_right_outer=False), start=1):
                p.add_run(f'\t{num2}) {date_2_str(point1)} ~ {date_2_str(point2)}\n')
        document.add_paragraph(f"{num}.2) 模型路径:\n\t{info_dic['module_file_path']}")
        document.add_paragraph(f"{num}.3) 取样状态(random_state):\n\t{info_dic['predict_test_random_state']}")
        document.add_paragraph(f"{num}.4) 展示数据长度:\n\t{info_dic['in_range_count']}")
        document.add_paragraph(f"{num}.5) 预测准确率趋势图:")
        document.add_picture(info_dic['img_file_path'])
    file_name = f"{title}.docx"
    file_path = os.path.join(get_report_folder_path(stg_run_id), file_name)
    document.save(file_path)
    if enable_clean_cache:
        clean_cache()
    logger.debug('生成报告结束。%s', file_path)
    return file_path
def get_df_iter(self, date_start, date_end, step, df_len_limit=3000, deep=0):
    """
    Query rows over a date range in windows of `step` days; when a window's
    result reaches the row cap (results likely truncated by the data provider),
    bisect the window (halve `step`) and recurse until step can no longer be
    split.

    :param date_start: overall range start (exclusive on pub_date)
    :param date_end: overall range end (inclusive on pub_date)
    :param step: size of each query window, in days
    :param df_len_limit: per-query row cap; df_len >= this means possible truncation
    :param deep: recursion depth, used only to indent log messages
    :return: generator yielding (df, date_from, date_to) tuples
    """
    for num, (date_from, date_to) in enumerate(iter_2_range(
            range_date(date_start, date_end, step),
            has_left_outer=False, has_right_outer=False), start=1):
        # (date_from, date_to] window on pub_date
        q = query(self.statement).filter(
            self.statement.pub_date > date_2_str(date_from),
            self.statement.pub_date <= date_2_str(date_to))
        df = finance.run_query(q)
        df_len = df.shape[0]
        if df_len >= df_len_limit:
            if step >= 2:
                self.logger.warning(
                    '%s%s%d) [%s ~ %s] 包含 %d 条数据,可能已经超越 %d 条提取上限,开始进一步分割日期',
                    self.table_name, ' ' * deep, num, date_from, date_to, df_len, df_len_limit)
                # Bug fix: propagate df_len_limit into the recursive call — it
                # previously reverted to the default 3000 on every recursion,
                # ignoring a caller-supplied limit.
                yield from self.get_df_iter(date_from, date_to, step // 2,
                                            df_len_limit=df_len_limit, deep=deep + 1)
            else:
                # Cannot split a 1-day window any further; yield what we got and warn.
                self.logger.warning(
                    '%s%s%d) [%s ~ %s] 包含 %d 条数据,可能已经超越 %d 条提取上限且无法再次分割日期范围,手动需要补充提取剩余数据',
                    self.table_name, ' ' * deep, num, date_from, date_to, df_len, df_len_limit)
                yield df, date_from, date_to
        else:
            self.logger.debug('%s%s%d) [%s ~ %s] 包含 %d 条数据',
                              self.table_name, ' ' * deep, num, date_from, date_to, df_len)
            yield df, date_from, date_to
def merge_ifind_stock_daily(ths_code_set: set = None, date_from=None):
    """
    Merge the ds, his and financial tables into the 'ifind_stock_daily' table.

    :param ths_code_set: when given, only these THS codes are processed
    :param date_from: start date for incremental merge; when None and the
        target table exists, it is derived from max(`time`) + 1 day in the DB
    """
    table_name = 'ifind_stock_daily'
    logging.info("合成 %s 开始", table_name)
    has_table = engine_md.has_table(table_name)
    if date_from is None and has_table:
        # Incremental mode: resume from the day after the latest stored `time`.
        sql_str = "select adddate(max(`time`),1) from {table_name}".format(
            table_name=table_name)
        with with_db_session(engine_md) as session:
            date_from = date_2_str(session.execute(sql_str).scalar())
    # Fetch each source table
    ifind_his_df = get_ifind_daily_df('ifind_stock_daily_his', date_from)
    ifind_ds_df = get_ifind_daily_df('ifind_stock_daily_ds', date_from)
    ifind_report_date_df = get_ifind_report_date_df('ifind_stock_report_date', None)
    ifind_fin_df = get_ifind_daily_df('ifind_stock_fin', None)
    ifind_fin_df_g = ifind_fin_df.groupby('ths_code')
    ths_code_set_4_daily = set(ifind_fin_df_g.size().index)
    # Merge ds + his data; the outer join leaves NaN where one side has no row.
    ifind_his_ds_df = pd.merge(ifind_his_df, ifind_ds_df, how='outer',
                               on=['ths_code', 'time'])
    ifind_his_ds_df_g = ifind_his_ds_df.groupby('ths_code')
    logger.debug("提取数据完成")
    # Compute financial-report disclosure dates:
    # report_date_dic_dic maps ths_code -> {report_date -> fin row (Series)}.
    report_date_dic_dic = {}
    # NOTE(review): the single-element list wrapper makes this outer loop run
    # exactly once — presumably a leftover from iterating several groupbys.
    for report_date_g in [
        ifind_report_date_df.groupby(
            ['ths_code', 'ths_regular_report_actual_dd_stock'])
    ]:
        for num, ((ths_code, report_date), data_df) in enumerate(report_date_g, start=1):
            if ths_code_set is not None and ths_code not in ths_code_set:
                continue
            if is_nan_or_none(report_date):
                continue
            report_date_dic = report_date_dic_dic.setdefault(ths_code, {})
            if ths_code not in ths_code_set_4_daily:
                logger.error('fin 表中不存在 %s 的財務數據', ths_code)
                continue
            ifind_fin_df_temp = ifind_fin_df_g.get_group(ths_code)
            # NOTE(review): this tests report_date against report_date_dic_dic,
            # whose keys are ths_codes, so the condition is effectively always
            # True — it looks like `report_date not in report_date_dic` was
            # intended. TODO confirm before changing.
            if report_date not in report_date_dic_dic:
                ifind_fin_df_temp = ifind_fin_df_temp[
                    ifind_fin_df_temp['time'] <= report_date]
                if ifind_fin_df_temp.shape[0] > 0:
                    # NOTE(review): sort ascending + iloc[0] picks the EARLIEST
                    # fin row with time <= report_date; the latest row
                    # (iloc[-1]) may have been intended — verify.
                    report_date_dic[
                        report_date] = ifind_fin_df_temp.sort_values(
                            'time').iloc[0]
    # Build the dtype mapping used for the DB insert.
    dtype = {'report_date': Date}
    for dic in [
        DTYPE_STOCK_DAILY_DS, DTYPE_STOCK_REPORT_DATE,
        DTYPE_STOCK_DAILY_FIN, DTYPE_STOCK_DAILY_HIS
    ]:
        for key, val in dic.items():
            dtype[key] = val
    logger.debug("计算财报日期完成")
    # Assemble daily rows segment by segment; flush to DB in batches.
    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(
        report_date_dic_dic)
    try:
        for num, (ths_code, report_date_dic) in enumerate(report_date_dic_dic.items(), start=1):
            # key: ths_code
            # TODO: membership check against ifind_his_ds_df_g via size() is a
            # stop-gap; improve later (translated from original TODO).
            if ths_code not in ifind_his_ds_df_g.size():
                logger.error('fin 表中不存在 %s 的財務數據', ths_code)
                continue
            # open/low etc. may be NaN (original note: 2438 such rows)
            ifind_his_ds_df_cur_ths_code = ifind_his_ds_df_g.get_group(
                ths_code)  # shape[1] 30
            logger.debug('%d/%d) 处理 %s %d 条数据', num, for_count, ths_code,
                         ifind_his_ds_df_cur_ths_code.shape[0])
            report_date_list = list(report_date_dic.keys())
            report_date_list.sort()
            # iter_2_range yields (None, d0), (d0, d1), ..., (dN, None) windows.
            for report_date_from, report_date_to in iter_2_range(
                    report_date_list):
                logger.debug('%d/%d) 处理 %s [%s - %s]', num, for_count, ths_code,
                             date_2_str(report_date_from),
                             date_2_str(report_date_to))
                # Valid date range mask: [report_date_from, report_date_to)
                # with open ends when either bound is None.
                if report_date_from is None:
                    is_fit = ifind_his_ds_df_cur_ths_code[
                        'time'] < report_date_to
                elif report_date_to is None:
                    is_fit = ifind_his_ds_df_cur_ths_code[
                        'time'] >= report_date_from
                else:
                    is_fit = (ifind_his_ds_df_cur_ths_code['time'] < report_date_to) & (
                        ifind_his_ds_df_cur_ths_code['time'] >= report_date_from)
                # Slice the rows falling inside the window
                ifind_his_ds_df_segment = ifind_his_ds_df_cur_ths_code[
                    is_fit].copy()
                segment_count = ifind_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                # The fin row disclosed at report_date_from applies to the
                # whole segment; None before the first disclosure.
                fin_s = report_date_dic[
                    report_date_from] if report_date_from is not None else None
                for key in DTYPE_STOCK_DAILY_FIN.keys():
                    if key in ('ths_code', 'time'):
                        continue
                    ifind_his_ds_df_segment[key] = fin_s[
                        key] if fin_s is not None and key in fin_s else None
                ifind_his_ds_df_segment['report_date'] = report_date_from
                # Collect the segment
                data_df_list.append(ifind_his_ds_df_segment)
                data_count += segment_count
                if DEBUG and len(data_df_list) > 1:
                    break
            # Flush to the database once enough rows have accumulated
            if data_count > 10000:
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(
                    data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []
    finally:
        # Flush any remainder even if the loop above raised
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(
                data_df, table_name, engine_md, dtype)
            tot_data_count += data_count
        logger.info('%s 新增或更新记录 %d 条', table_name, tot_data_count)
        # First-time creation: convert to MyISAM and build the primary key
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
# NOTE(review): this `return` sat at module level ('return' outside function,
# a SyntaxError) and `data_list` is not defined anywhere in view — it looks
# like the tail of a function whose body was lost. Commented out so the module
# can be imported; TODO: restore the missing function or delete this line.
# return pd.DataFrame(data_list)


def _get_daily_df_by_time(table_name, date_from) -> pd.DataFrame:
    """Read a whole table, or only rows with time >= date_from when given.

    The table name cannot be a bound parameter, so it is interpolated
    (internal, trusted table names only); date_from IS passed as a bound
    parameter.
    """
    if date_from is None:
        sql_str = "select * from {table_name}".format(table_name=table_name)
        data_df = pd.read_sql(sql_str, engine_md)  # , index_col='ths_code'
    else:
        sql_str = "select * from {table_name} where time >= %s".format(
            table_name=table_name)
        data_df = pd.read_sql(sql_str, engine_md, params=[date_from])  # , index_col='ths_code'
    return data_df


def get_ifind_daily_df(table_name, date_from) -> pd.DataFrame:
    """Fetch an iFinD daily table as a DataFrame (all rows, or time >= date_from)."""
    # Deduplicated: the body was a byte-for-byte copy of get_wind_daily_df.
    return _get_daily_df_by_time(table_name, date_from)


def get_wind_daily_df(table_name, date_from) -> pd.DataFrame:
    """Fetch a Wind daily table as a DataFrame (all rows, or time >= date_from)."""
    return _get_daily_df_by_time(table_name, date_from)


if __name__ == "__main__":
    for x in iter_2_range([1, 2, 3]):
        print(x)
def plot_industry_classified_mid(col_name='ev2_to_ebitda'):
    """
    For every CS industry sector, compute the time series of the cross-
    sectional median of `col_name` over the sector's constituent stocks,
    export the merged result to 'median.xls' and plot it.

    :param col_name: column of wind_stock_daily to aggregate (used verbatim in SQL)
    """
    # sql_str = """select sector_code, sector_name,base.trade_date, sum(ev2_to_ebitda) tot_val
    #     from (
    #     SELECT * FROM fof_ams_dev.wind_sectorconstituent where sector_name like 'cs%%'
    #     ) base
    #     LEFT JOIN
    #     (
    #     select trade_date, wind_code, ev2_to_ebitda from wind_stock_daily where ev2_to_ebitda is not null
    #     ) val
    #     on base.trade_date = val.trade_date
    #     and base.wind_code = val.wind_code
    #     group by sector_code, base.trade_date
    #     having tot_val is not null"""
    # TODO: 待行业数据下载齐全后可生成相应的分布图
    sector_sql_str = """SELECT sector_name, trade_date, wind_code FROM fof_ams_dev.wind_sectorconstituent where sector_name like 'cs%'"""
    with with_db_session(engine_md) as session:
        table = session.execute(sector_sql_str)
        # sector_name -> {'trade_date_set': set of dates,
        #                 'trade_date_wind_code_list_dic': date -> [wind_code, ...]}
        sector_trade_date_wind_code_list_dic = defaultdict(dict)
        num = 0  # keeps the log below valid when the result set is empty
        for num, (sector_name, trade_date, wind_code) in enumerate(table.fetchall(), start=1):
            if sector_name not in sector_trade_date_wind_code_list_dic:
                sector_trade_date_wind_code_list_dic[sector_name] = {
                    'trade_date_set': set(),
                    'trade_date_wind_code_list_dic': defaultdict(list)
                }
            sector_trade_date_wind_code_list_dic[sector_name][
                'trade_date_set'].add(trade_date)
            sector_trade_date_wind_code_list_dic[sector_name][
                'trade_date_wind_code_list_dic'][trade_date].append(wind_code)
    sector_count = len(sector_trade_date_wind_code_list_dic)
    logger.debug('获取行业数据 %d 条 %d 个行业', num, sector_count)
    stock_sql_str = f"""select wind_code, trade_date, `{col_name}` from wind_stock_daily where `{col_name}` is not null"""
    data_df = pd.read_sql(stock_sql_str, engine_md)
    logger.debug('获取行情数据 %d 条', data_df.shape[0])
    # Wide matrix: rows = trade_date, columns = wind_code, values = col_name
    pivot_df = data_df.pivot(index='trade_date', columns='wind_code',
                             values=col_name).sort_index()
    logger.debug('转换数据 %s', pivot_df.shape)
    sector_trade_date_val_list_dic, sector_trade_date_val_dic = {}, {}
    logger.debug('计算 %d 个行业中位数', sector_count)
    for num, (sector_name, data_dic) in enumerate(
            sector_trade_date_wind_code_list_dic.items(), start=1):
        trade_date_list = list(data_dic['trade_date_set'])
        trade_date_list.sort()
        trade_date_list_len = len(trade_date_list)
        logger.debug('%d/%d) %s %d 个交易日', num, sector_count, sector_name,
                     trade_date_list_len)
        trade_date_wind_code_list_dic = data_dic['trade_date_wind_code_list_dic']
        # Constituents as of trade_date_from apply until the next rebalance date.
        for num2, (trade_date_from, trade_date_to) in enumerate(iter_2_range(
                trade_date_list, has_left_outer=False), start=1):
            wind_code_list = trade_date_wind_code_list_dic[trade_date_from]
            # Cross-sectional median over the window's constituents
            try:
                tmp_df = pivot_df.loc[trade_date_from:trade_date_to, wind_code_list]
                if tmp_df.shape[0] == 0:
                    continue
            except KeyError:
                # Some constituents missing from the pivot — skip this window
                continue
            val_s = tmp_df.median(axis=1)
            if trade_date_to is not None:
                # Drop the last day: it belongs to the next window
                val_s = val_s.iloc[:-1]
            if sector_name not in sector_trade_date_val_list_dic:
                sector_trade_date_val_list_dic[sector_name] = [val_s]
            else:
                sector_trade_date_val_list_dic[sector_name].append(val_s)
        # Concatenate the per-window Series into one per-sector Series
        if sector_name in sector_trade_date_val_list_dic and len(
                sector_trade_date_val_list_dic[sector_name]) > 0:
            logger.debug('%s %d 个交易日合并数据', sector_name, len(trade_date_list))
            sector_trade_date_val_dic[sector_name] = pd.concat(
                sector_trade_date_val_list_dic[sector_name])
    # Merge all sectors into one DataFrame (columns = sector names)
    logger.debug('合并 %d 个行业数据', sector_count)
    data_df = pd.DataFrame(sector_trade_date_val_dic)
    # Bug fix: DataFrame.to_excel() has no `legend` parameter — passing
    # legend=False raised TypeError. `legend` belongs to plotting, not export.
    data_df.to_excel('median.xls')
    data_df.plot()
    plt.show()