def merge_ifind_stock_daily(ths_code_set: set = None, date_from=None):
    """Merge the ds/his market data and the financial data into one daily table."""
    table_name = 'ifind_stock_daily'
    logging.info("start merging %s", table_name)
    has_table = engine_md.has_table(table_name)
    if date_from is None and has_table:
        sql_str = "select adddate(max(`time`),1) from {table_name}".format(table_name=table_name)
        with with_db_session(engine_md) as session:
            date_from = date_2_str(session.execute(sql_str).scalar())
    # Load the source tables
    ifind_his_df = get_ifind_daily_df('ifind_stock_daily_his', date_from)
    ifind_ds_df = get_ifind_daily_df('ifind_stock_daily_ds', date_from)
    ifind_report_date_df = get_ifind_report_date_df('ifind_stock_report_date', None)
    ifind_fin_df = get_ifind_daily_df('ifind_stock_fin', None)
    ifind_fin_df_g = ifind_fin_df.groupby('ths_code')
    ths_code_set_4_daily = set(ifind_fin_df_g.size().index)
    # Merge the his and ds data; the outer join leaves NaN where one side has no data
    ifind_his_ds_df = pd.merge(ifind_his_df, ifind_ds_df, how='outer', on=['ths_code', 'time'])
    ifind_his_ds_df_g = ifind_his_ds_df.groupby('ths_code')
    logger.debug("data extraction finished")

    # Work out the report disclosure dates
    report_date_dic_dic = {}
    for report_date_g in [ifind_report_date_df.groupby(['ths_code', 'ths_regular_report_actual_dd_stock'])]:
        for num, ((ths_code, report_date), data_df) in enumerate(report_date_g, start=1):
            if ths_code_set is not None and ths_code not in ths_code_set:
                continue
            if is_nan_or_none(report_date):
                continue
            report_date_dic = report_date_dic_dic.setdefault(ths_code, {})
            if ths_code not in ths_code_set_4_daily:
                logger.error('no financial data for %s in the fin table', ths_code)
                continue
            ifind_fin_df_temp = ifind_fin_df_g.get_group(ths_code)
            if report_date not in report_date_dic:
                ifind_fin_df_temp = ifind_fin_df_temp[ifind_fin_df_temp['time'] <= report_date]
                if ifind_fin_df_temp.shape[0] > 0:
                    report_date_dic[report_date] = ifind_fin_df_temp.sort_values('time').iloc[0]

    # Build the dtype mapping
    dtype = {'report_date': Date}
    for dic in [DTYPE_STOCK_DAILY_DS, DTYPE_STOCK_REPORT_DATE, DTYPE_STOCK_DAILY_FIN, DTYPE_STOCK_DAILY_HIS]:
        for key, val in dic.items():
            dtype[key] = val
    logger.debug("report disclosure dates computed")

    # Assemble data_df
    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(report_date_dic_dic)
    try:
        for num, (ths_code, report_date_dic) in enumerate(report_date_dic_dic.items(), start=1):  # key: ths_code
            # TODO: membership is checked via groupby().size() for now; improve later
            if ths_code not in ifind_his_ds_df_g.size():
                logger.error('no daily his/ds data for %s', ths_code)
                continue
            # open, low, etc. are NaN (2438 rows)
            ifind_his_ds_df_cur_ths_code = ifind_his_ds_df_g.get_group(ths_code)  # shape[1] == 30
            logger.debug('%d/%d) processing %s, %d rows', num, for_count, ths_code,
                         ifind_his_ds_df_cur_ths_code.shape[0])
            report_date_list = list(report_date_dic.keys())
            report_date_list.sort()
            for report_date_from, report_date_to in iter_2_range(report_date_list):
                logger.debug('%d/%d) processing %s [%s - %s]', num, for_count, ths_code,
                             date_2_str(report_date_from), date_2_str(report_date_to))
                # Determine the valid date range
                if report_date_from is None:
                    is_fit = ifind_his_ds_df_cur_ths_code['time'] < report_date_to
                elif report_date_to is None:
                    is_fit = ifind_his_ds_df_cur_ths_code['time'] >= report_date_from
                else:
                    is_fit = (ifind_his_ds_df_cur_ths_code['time'] < report_date_to) & (
                            ifind_his_ds_df_cur_ths_code['time'] >= report_date_from)
                # Slice the rows within the date range
                ifind_his_ds_df_segment = ifind_his_ds_df_cur_ths_code[is_fit].copy()
                segment_count = ifind_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                fin_s = report_date_dic[report_date_from] if report_date_from is not None else None
                for key in DTYPE_STOCK_DAILY_FIN.keys():
                    if key in ('ths_code', 'time'):
                        continue
                    ifind_his_ds_df_segment[key] = fin_s[key] if fin_s is not None and key in fin_s else None
                ifind_his_ds_df_segment['report_date'] = report_date_from
                # Append the segment to the buffer
                data_df_list.append(ifind_his_ds_df_segment)
                data_count += segment_count

            if DEBUG and len(data_df_list) > 1:
                break

            # Flush to the database once the buffer is large enough
            if data_count > 10000:
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []
    finally:
        # Save the remaining data to the database
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(data_df, table_name, engine_md, dtype)
            tot_data_count += data_count
        logger.info('%s: %d records inserted or updated', table_name, tot_data_count)
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
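

# The segmentation above relies on iter_2_range (and generate_range in the
# tushare functions below) yielding boundary pairs padded with None at both
# ends: (None, d1), (d1, d2), ..., (dn, None). The helper itself is defined
# elsewhere; the sketch below only illustrates the assumed semantics and is
# not the project's implementation.
def _iter_2_range_sketch(sorted_dates):
    """Yield (date_from, date_to) pairs with a leading and trailing None."""
    if len(sorted_dates) == 0:
        return
    previous = None
    for current in sorted_dates:
        yield previous, current
        previous = current
    yield previous, None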
def merge_daily_balancesheet(ts_code_set: set = None):
    """Merge the daily data table with the quarterly balancesheet table.

    :param ts_code_set:
    :return:
    """
    table_name = 'tushare_merge_daily_bala'
    data_balancesheet_df = get_tushare_daily_df('tushare_stock_balancesheet', None)
    tushare_daily_df = get_tushare_daily_df('tushare_stock_daily', None)
    # data_merge_two_df = pd.merge(data_balancesheet_df, tushare_daily_df)
    tushare_daily_df_g = tushare_daily_df.groupby('ts_code')
    data_balancesheet_df_g = data_balancesheet_df.groupby('ts_code')

    # Work out the report disclosure dates
    report_date_dic_dic = {}
    for report_date_g in [data_balancesheet_df.groupby(['ts_code', 'f_ann_date'])]:
        for num, ((ts_code, report_date), data_df) in enumerate(report_date_g, start=1):
            if is_nan_or_none(report_date):
                continue
            if ts_code_set is not None and ts_code not in ts_code_set:
                continue
            report_date_dic = report_date_dic_dic.setdefault(ts_code, {})
            temp = data_balancesheet_df_g.get_group(ts_code)
            report_date_dic[report_date] = temp.sort_values('f_ann_date').iloc[0]
    # report_date = data_balancesheet_df.groupby("f_ann_date").count().index
    # ts_code = [ts_code for ts_code in data_balancesheet_df.groupby("f_ann_date").count()['ts_code']]

    # Build the dtype mapping
    dtype = {}
    for dic in [DTYPE_TUSHARE_SUSPEND, DTYPE_TUSHARE_DAILY_BASIC, DTYPE_TUSHARE_DAILY]:
        for key, val in dic.items():
            dtype[key] = val
    logging.debug('assembling data')

    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(report_date_dic_dic)
    try:
        for num, (ts_code, report_date_dic) in enumerate(report_date_dic_dic.items(), start=1):  # key: ts_code
            # open, low, etc. are NaN (2438 rows)
            if ts_code not in tushare_daily_df_g.size():
                continue
            tushare_his_ds_df_cur_ts_code = tushare_daily_df_g.get_group(ts_code)  # shape[1] == 30
            logger.debug('%d/%d) processing %s, %d rows', num, for_count, ts_code,
                         tushare_his_ds_df_cur_ts_code.shape[0])
            report_date_list = list(report_date_dic.keys())
            report_date_list.sort()
            for report_date_from, report_date_to in generate_range(report_date_list):
                logger.debug('%d/%d) processing %s [%s - %s]', num, for_count, ts_code,
                             date_2_str(report_date_from), date_2_str(report_date_to))
                # Determine the valid date range
                if report_date_from is None:
                    is_fit = tushare_his_ds_df_cur_ts_code['trade_date'] < report_date_to
                elif report_date_to is None:
                    is_fit = tushare_his_ds_df_cur_ts_code['trade_date'] >= report_date_from
                else:
                    is_fit = (tushare_his_ds_df_cur_ts_code['trade_date'] < report_date_to) & (
                            tushare_his_ds_df_cur_ts_code['trade_date'] >= report_date_from)
                # Slice the rows within the date range
                ifind_his_ds_df_segment = tushare_his_ds_df_cur_ts_code[is_fit].copy()
                segment_count = ifind_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                fin_s = report_date_dic[report_date_from] if report_date_from is not None else None
                for key in DTYPE_TUSHARE_STOCK_BALABCESHEET.keys():
                    if key in ('ts_code', 'trade_date'):
                        continue
                    ifind_his_ds_df_segment[key] = fin_s[key] if fin_s is not None and key in fin_s else None
                ifind_his_ds_df_segment['report_date'] = report_date_from
                # Append the segment to the buffer
                data_df_list.append(ifind_his_ds_df_segment)
                data_count += segment_count
                if report_date_from == '2018-08-16':
                    break

            if DEBUG and len(data_df_list) > 1:
                break

            # Flush to the database once the buffer is large enough
            if data_count > 10000:
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []
    finally:
        # Save the remaining data to the database
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(data_df, table_name, engine_md, dtype)
            tot_data_count += data_count
        # logger.info('%s: %d records inserted or updated', table_name, tot_data_count)
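

# merge_daily_balancesheet and the functions below load whole tables through
# get_tushare_daily_df. Its implementation is not shown in this module; the
# sketch below is a minimal assumption of what it does (read one table into a
# DataFrame, optionally filtered by trade_date) and may differ from the real
# helper, which could select specific columns or convert dtypes. pd and
# engine_md are the module-level pandas alias and SQLAlchemy engine used
# throughout this file.
def _get_tushare_daily_df_sketch(table_name, date_from=None):
    if date_from is None:
        sql_str = "SELECT * FROM {table_name}".format(table_name=table_name)
        params = None
    else:
        sql_str = "SELECT * FROM {table_name} WHERE trade_date >= %(date_from)s".format(table_name=table_name)
        params = {'date_from': date_from}
    return pd.read_sql(sql_str, engine_md, params=params)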
def merge_tushare_daily(ths_code_set: set = None, date_from=None):
    table_name = 'tushare_daily'
    logging.info("start merging %s", table_name)
    has_table = engine_md.has_table(table_name)
    if date_from is None and has_table:
        sql_str = "select adddate(max(`trade_date`),1) from {table_name}".format(table_name=table_name)
        with with_db_session(engine_md) as session:
            date_from = date_2_str(session.execute(sql_str).scalar())
    # Load the source tables
    # daily
    tushare_his_df = get_tushare_daily_df('tushare_stock_daily', date_from)
    tushare_ds_df = get_tushare_daily_df('tushare_daily_basic', date_from)
    tushare_suspend_df = get_tushare_daily_df('tushare_suspend', None)
    # quarterly
    tushare_merge_df = get_tushare_daily_df('tushare_stock_cashflow', None)
    # tushare_balancesheet_df = get_tushare_daily_df('tushare_stock_balancesheet', None)
    # tushare_icome_df = get_tushare_daily_df('tushare_stock_income', None)
    # tushare_indicator_df = get_tushare_daily_df('tushare_stock_fina_indicator', None)
    # tushare_merge_quarterly_df = merge_tushare_quarterly(None)
    #
    # tushare_merge_df_one = pd.merge(tushare_cashflow_df, tushare_balancesheet_df,
    #                                 how='outer', on=['ts_code', 'end_date'])
    # tushare_merge_df_two = pd.merge(tushare_merge_df_one, tushare_icome_df,
    #                                 how='outer', on=['ts_code', 'end_date'])
    # tushare_merge_df = pd.merge(tushare_merge_df_two, tushare_indicator_df,
    #                             how='outer', on=['ts_code', 'end_date'])
    tushare_his_dis_df = pd.merge(tushare_his_df, tushare_ds_df, how='outer', on=['ts_code', 'trade_date'])
    tushare_his_ds_df = pd.merge(tushare_his_dis_df, tushare_suspend_df, how='outer', on='ts_code')
    tushare_merge_df_g = tushare_merge_df.groupby('ts_code')
    tushare_his_ds_df_g = tushare_his_ds_df.groupby('ts_code')
    logging.debug('data extraction finished')

    # Work out the financial disclosure dates
    merge_date_dic_dic = {}
    for suspend_date_g in [tushare_merge_df.groupby(['ts_code', 'ann_date'])]:
        for num, ((ts_code, ann_date), date_df) in enumerate(suspend_date_g):
            if ths_code_set is not None and ts_code not in ths_code_set:
                continue
            if is_nan_or_none(ann_date):
                continue
            suspend_date_dic = merge_date_dic_dic.setdefault(ts_code, {})
            if ts_code not in tushare_merge_df_g.size():
                logger.error('no financial data for %s in the cashflow table', ts_code)
                continue
            tushare_merge_df_temp = tushare_merge_df_g.get_group(ts_code)
            if ann_date not in suspend_date_dic:
                tushare_merge_df_temp = tushare_merge_df_temp[tushare_merge_df_temp['f_ann_date'] <= ann_date]
                if tushare_merge_df_temp.shape[0] > 0:
                    suspend_date_dic[ann_date] = tushare_merge_df_temp.sort_values('f_ann_date').iloc[0]

    # Build the dtype mapping
    dtype = {'ann_date': Date}
    for dic in [
        DTYPE_TUSHARE_STOCK_DAILY_MD,
        DTYPE_TUSHARE_STOCK_DAILY_BASIC,
        DTYPE_TUSHARE_SUSPEND,
        # DTYPE_TUSHARE_STOCK_INCOME,
        # DTYPE_TUSHARE_STOCK_BALABCESHEET,
        DTYPE_TUSHARE_CASHFLOW,
    ]:
        for key, val in dic.items():  # len(dic) == 12
            dtype[key] = val
    logging.debug("report disclosure dates computed")

    # Assemble data_df
    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(merge_date_dic_dic)
    try:
        for num, (ts_code, report_date_dic) in enumerate(merge_date_dic_dic.items(), start=1):
            # key: ts_code; many columns contain NaN
            # TODO: membership is checked via groupby().size() for now; improve later
            if ts_code not in tushare_his_ds_df_g.size():
                logger.error('no daily data for %s in the merged daily/suspend table', ts_code)
                continue
            tushare_his_ds_df_cur_ths_code = tushare_his_ds_df_g.get_group(ts_code)  # shape[1] == 30
            logger.debug('%d/%d) processing %s, %d rows', num, for_count, ts_code,
                         tushare_his_ds_df_cur_ths_code.shape[0])
            report_date_list = list(report_date_dic.keys())
            report_date_list.sort()
            for report_date_from, report_date_to in generate_range(report_date_list):
                logger.debug('%d/%d) processing %s [%s - %s]', num, for_count, ts_code,
                             date_2_str(report_date_from), date_2_str(report_date_to))
                # Determine the valid date range
                if report_date_from is None:
                    is_fit = tushare_his_ds_df_cur_ths_code['trade_date'] < report_date_to
                elif report_date_to is None:
                    is_fit = tushare_his_ds_df_cur_ths_code['trade_date'] >= report_date_from
                else:
                    is_fit = (tushare_his_ds_df_cur_ths_code['trade_date'] < report_date_to) & (
                            tushare_his_ds_df_cur_ths_code['trade_date'] >= report_date_from)
                # Slice the rows within the date range
                tushare_his_ds_df_segment = tushare_his_ds_df_cur_ths_code[is_fit].copy()
                segment_count = tushare_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                fin_s = report_date_dic[report_date_from] if report_date_from is not None else None
                # fin_s comes from the cashflow table, so only its columns are filled in;
                # the balancesheet/income dtypes are commented out above
                for key in DTYPE_TUSHARE_CASHFLOW.keys():
                    if key in ('ts_code', 'trade_date'):
                        continue
                    tushare_his_ds_df_segment[key] = fin_s[key] if fin_s is not None and key in fin_s else None
                tushare_his_ds_df_segment['ann_date'] = report_date_from
                # Append the segment to the buffer
                data_df_list.append(tushare_his_ds_df_segment)
                data_count += segment_count

            if DEBUG and len(data_df_list) > 1:
                break

            # Flush to the database once the buffer is large enough
            if data_count > 10000:
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []
    finally:
        # Save the remaining data to the database
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(data_df, table_name, engine_md, dtype)
            tot_data_count += data_count
        logger.info('%s: %d records inserted or updated', table_name, tot_data_count)
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
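

# is_nan_or_none is used throughout these loops to skip missing disclosure
# dates. It comes from a utility module; the sketch below is an assumed
# minimal behaviour (None or float NaN counts as missing) shown for reference
# only, and the real helper may also handle pandas NaT or empty strings.
def _is_nan_or_none_sketch(value):
    # NaN is the only float that is not equal to itself
    return value is None or (isinstance(value, float) and value != value)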
def merge_tushare_stock_daily(ths_code_set: set = None, date_from=None):
    """Merge A-share market data and financial information into daily-level data."""
    table_name = 'tushare_stock_daily'
    logging.info("start merging %s", table_name)
    has_table = engine_md.has_table(table_name)
    if date_from is None and has_table:
        sql_str = "select adddate(max(`time`),1) from {table_name}".format(table_name=table_name)
        with with_db_session(engine_md) as session:
            date_from = date_2_str(session.execute(sql_str).scalar())
    # Load the daily-level data
    # TODO: add support for the ths_code_set parameter
    daily_df, dtype_daily = get_tushare_daily_merged_df(ths_code_set, date_from)
    daily_df_g = daily_df.groupby('ts_code')
    ths_code_set_4_daily = set(daily_df_g.size().index)
    # Load the merged financial data
    ifind_fin_df, dtype_fin = get_tushre_merge_stock_fin_df()
    # Build the dtype mapping
    dtype = dtype_daily.copy()
    dtype.update(dtype_fin)
    logging.debug("financial data extraction finished")

    # Work out the report disclosure dates
    report_date_dic_dic = {}
    for num, ((ths_code, report_date), data_df) in enumerate(
            ifind_fin_df.groupby(['ts_code', 'f_ann_date']), start=1):
        if ths_code_set is not None and ths_code not in ths_code_set:
            continue
        if is_nan_or_none(report_date):
            continue
        report_date_dic = report_date_dic_dic.setdefault(ths_code, {})
        if report_date not in report_date_dic:
            if data_df.shape[0] > 0:
                report_date_dic[report_date] = data_df.iloc[0]
    logger.debug("report disclosure dates computed")

    # Assemble data_df
    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(report_date_dic_dic)
    try:
        for num, (ths_code, report_date_dic) in enumerate(report_date_dic_dic.items(), start=1):  # key: ths_code
            # TODO: membership is checked via groupby().size() for now; improve later
            if ths_code not in ths_code_set_4_daily:
                logger.error('no daily data for %s', ths_code)
                continue
            daily_df_cur_ts_code = daily_df_g.get_group(ths_code)
            logger.debug('%d/%d) processing %s, %d rows', num, for_count, ths_code, daily_df_cur_ts_code.shape[0])
            report_date_list = list(report_date_dic.keys())
            report_date_list.sort()
            report_date_list_len = len(report_date_list)
            for num_sub, (report_date_from, report_date_to) in enumerate(generate_range(report_date_list)):
                logger.debug('%d/%d) %d/%d) processing %s [%s - %s]', num, for_count, num_sub, report_date_list_len,
                             ths_code, date_2_str(report_date_from), date_2_str(report_date_to))
                # Determine the valid date range
                if report_date_from is None:
                    is_fit = daily_df_cur_ts_code['trade_date'] < report_date_to
                elif report_date_to is None:
                    is_fit = daily_df_cur_ts_code['trade_date'] >= report_date_from
                else:
                    is_fit = (daily_df_cur_ts_code['trade_date'] < report_date_to) & (
                            daily_df_cur_ts_code['trade_date'] >= report_date_from)
                # Slice the rows within the date range
                ifind_his_ds_df_segment = daily_df_cur_ts_code[is_fit].copy()
                segment_count = ifind_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                fin_s = report_date_dic[report_date_from] if report_date_from is not None else None
                for key in dtype_fin.keys():
                    if key in ('ts_code', 'trade_date'):
                        continue
                    ifind_his_ds_df_segment[key] = fin_s[key] if fin_s is not None and key in fin_s else None
                ifind_his_ds_df_segment['report_date'] = report_date_from
                # Append the segment to the buffer
                data_df_list.append(ifind_his_ds_df_segment)
                data_count += segment_count

            if DEBUG and len(data_df_list) > 1:
                break

            # Flush to the database once the buffer is large enough
            if data_count > 10000:
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []
    finally:
        # Save the remaining data to the database
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(data_df, table_name, engine_md, dtype)
            tot_data_count += data_count
        logger.info('%s: %d records inserted or updated', table_name, tot_data_count)
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
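

# A hedged usage sketch: the merge functions above are typically run after the
# underlying his/ds/fin (or tushare daily/basic/suspend/cashflow) tables have
# been refreshed. This entry point is an assumption for illustration, not part
# of the original module; with DEBUG enabled each loop stops early, which is
# handy for a quick smoke test against a development database.
if __name__ == '__main__':
    # DEBUG = True  # uncomment to process only a couple of segments per table
    merge_ifind_stock_daily()
    merge_tushare_daily()
    merge_tushare_stock_daily()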