def get_factors_with_labels(self, md_df):
    """
    Build the factor matrix for ``md_df`` together with the matching label matrix.

    Labels come from ``self.calc_label_with_future_value`` applied to the first
    factor column with the thresholds -0.01 and 0.01. Both arrays are truncated
    right after the index that ``get_last_idx`` reports for the predicate
    ``row.sum() == 0``. As a side effect ``self.input_size`` is set to the
    number of factor columns.

    :param md_df: market data handed straight to ``self.get_factors``
    :return: tuple ``(factors, labels)`` cut to the same number of rows
    """
    factor_arr = self.get_factors(md_df)
    first_column = factor_arr[:, 0]
    self.input_size = factor_arr.shape[1]
    label_arr = self.calc_label_with_future_value(first_column, -0.01, 0.01)

    def _label_sum_is_zero(row):
        return row.sum() == 0

    # keep rows 0 .. idx (inclusive) for both arrays
    row_count = get_last_idx(label_arr, _label_sum_is_zero) + 1
    # NOTE: mean/std normalization driven by self.normalization_model is
    # currently disabled; factors are returned as produced by get_factors.
    return factor_arr[:row_count, :], label_arr[:row_count, :]
def transfer_report_2_daily(table_name_report: str, table_name_daily: str, table_name_trade_date: str,
                            dtype_daily: dict, accumulation_col_name_list):
    """
    Expand periodic (quarterly / half-year / annual) financial report rows into daily rows.

    Only report rows with ``report_type=0`` are used, and for every ``(code, pub_date)`` only the
    row with the greatest ``report_date``. Each report row is copied once per trade date from its
    ``pub_date`` up to, but excluding, the ``pub_date`` of the next report of the same code (up to
    today for the most recent report). Trade dates that are not later than the newest
    ``trade_date`` already stored for the code in ``table_name_daily`` are left out. The rows of
    one code are written with ``save_data_2_daily_table`` before the next code is processed.

    :param table_name_report: source table holding the periodic reports
    :param table_name_daily: target table receiving the daily rows
    :param table_name_trade_date: table providing the ``trade_date`` calendar
    :param dtype_daily: column name -> column type mapping of the daily table; its keys select the
        report columns that are loaded. NOTE: the dict is updated in place with the per-season
        column names returned by ``fill_season_data``
    :param accumulation_col_name_list: names of cumulative columns that ``fill_season_data``
        converts into per-season columns
    :return: None
    """
    if not engine_md.has_table(table_name_report):
        logger.info('%s 不存在,无需转化成日级别数据', table_name_report)
        # nothing to convert; querying the missing table below would only fail
        return

    today = datetime.date.today()
    has_table = engine_md.has_table(table_name_daily)
    # Load the trade calendar up to today and, when the daily table already exists,
    # the newest trade date stored for every code.
    with with_db_session(engine_md) as session:
        sql_str = f"""select trade_date from {table_name_trade_date}
            where trade_date<=:today order by trade_date"""
        table = session.execute(sql_str, params={"today": today})
        trade_date_list = [_[0] for _ in table.fetchall()]
        if has_table:
            sql_str = f"""select code, max(trade_date) from {table_name_daily} group by code"""
            table = session.execute(sql_str)
            code_date_latest_dic = dict(table.fetchall())
        else:
            code_date_latest_dic = {}

    # Load the quarterly, half-year and annual report data
    col_name_list = list(dtype_daily.keys())
    col_name_list_str = ','.join(
        [f'report.`{col_name}` {col_name}' for col_name in col_name_list])
    sql_str = f"""SELECT {col_name_list_str} FROM {table_name_report} report inner join
        (
            select code, pub_date, max(report_date) report_date
            from {table_name_report} where report_type=0 group by code, pub_date
        ) base_date
        where report.report_type=0
        and report.code = base_date.code
        and report.pub_date = base_date.pub_date
        and report.report_date = base_date.report_date
        order by code, pub_date"""
    dfg_by_code = pd.read_sql(sql_str, engine_md).set_index(
        'report_date', drop=False).sort_index().groupby('code')
    dfg_len = len(dfg_by_code)
    data_new_s_list = []
    # Process the reports code by code
    for num, (code, df_by_code) in enumerate(dfg_by_code, start=1):
        # rows are already sorted by report_date (sort_index above)
        df_by_code = df_by_code.copy()
        df_len = df_by_code.shape[0]
        df_by_code.loc[:, 'pub_date_next'] = df_by_code['pub_date'].shift(-1)
        df_by_code.loc[:, 'report_date_next'] = df_by_code['report_date'].shift(-1)
        # Convert the cumulative columns into per-season columns
        for col_name in accumulation_col_name_list:
            df_by_code, col_name_season = fill_season_data(df_by_code, col_name)
            # register the new column with the type of the column it was derived from
            if col_name_season not in dtype_daily:
                dtype_daily[col_name_season] = dtype_daily[col_name]

        trade_date_latest = code_date_latest_dic.get(code)
        for num_sub, (report_date, data_s) in enumerate(df_by_code.T.items(), start=1):
            pub_date = data_s['pub_date']
            pub_date_next = data_s['pub_date_next']
            # Skip reports that are entirely covered by data already in the daily table
            if not pd.isnull(trade_date_latest) and not pd.isnull(pub_date_next) \
                    and trade_date_latest > pub_date_next:
                continue

            # First trade date of the interval
            if pd.isnull(trade_date_latest):
                date_from_idx = get_first_idx(trade_date_list, lambda x: x >= pub_date)
            else:
                # Start after the stored data but never before this report was published;
                # otherwise a later report would also be written to days that belong to
                # the previous report.
                pub_date_is_null = pd.isnull(pub_date)
                date_from_idx = get_first_idx(
                    trade_date_list,
                    lambda x: x > trade_date_latest and (pub_date_is_null or x >= pub_date))

            # Last trade date of the interval
            if pd.isnull(pub_date_next):
                date_to_idx = get_last_idx(trade_date_list, lambda x: x <= today)
            else:
                date_to_idx = get_last_idx(trade_date_list, lambda x: x < pub_date_next)

            if date_from_idx is None:
                logger.warning(
                    '%s %d/%d) %d/%d) %s 没有找到有效的起始日期 pub_date: %s trade_date_latest: %s ',
                    table_name_report, num, dfg_len, num_sub, df_len, code, pub_date, trade_date_latest)
                continue
            if date_to_idx is None:
                logger.warning(
                    '%s %d/%d) %d/%d) %s 没有找到有效的截至日期 today: %s pub_date_next: %s ',
                    table_name_report, num, dfg_len, num_sub, df_len, code, today, pub_date_next)
                continue
            if date_from_idx > date_to_idx:
                logger.warning(
                    '%s %d/%d) %d/%d) %s %s > %s 不匹配 pub_date: %s trade_date_latest: %s today: %s pub_date_next: %s ',
                    table_name_report, num, dfg_len, num_sub, df_len, code,
                    trade_date_list[date_from_idx], trade_date_list[date_to_idx],
                    pub_date, trade_date_latest, today, pub_date_next)
                continue

            logger.debug('%s %d/%d) %d/%d) %s [%s, %s) 预计转化 %d 条日级别数据,报告日:%s,',
                         table_name_report, num, dfg_len, num_sub, df_len, code,
                         trade_date_list[date_from_idx], trade_date_list[date_to_idx],
                         date_to_idx - date_from_idx + 1, report_date)

            # One copy of the report row per trade date in the interval
            for trade_date in trade_date_list[date_from_idx:(date_to_idx + 1)]:
                data_new_s = data_s.copy()
                data_new_s['trade_date'] = trade_date
                data_new_s_list.append(data_new_s)

        # Flush the rows of this code before moving on to the next one
        if len(data_new_s_list) > 0:
            data_count = save_data_2_daily_table(data_new_s_list, table_name_daily, dtype_daily)
            logger.info("%s %d/%d) %s %d 条记录被保存", table_name_report, num, dfg_len, code, data_count)
            data_new_s_list = []
def import_data(table_name, dtype, invoke_api, primary_keys=None,
                ts_code_set=None, is_debug=False, is_monthly=False):
    """
    Import per-index, per-trade-date data returned by ``invoke_api`` into ``table_name``.

    The date range of every index is derived from ``jq_index_info`` and, when ``table_name``
    already exists, from the day after the newest ``trade_date`` stored for that index. The end
    of the range is yesterday before 16:00 (database time) and today afterwards, as encoded in
    the SQL below. Data frames are buffered and written with ``bunch_insert`` whenever 1000 or
    more rows have accumulated; the remainder is written in the ``finally`` clause, so data
    fetched before an error is still stored.

    :param table_name: target table
    :param dtype: column type mapping forwarded to ``bunch_insert``
    :param invoke_api: callable invoked as ``invoke_api(index_symbol=..., trade_date=...)``;
        expected to return a DataFrame or None
    :param primary_keys: primary key columns forwarded to ``bunch_insert``;
        defaults to ``["index_symbol", "trade_date", "jq_code"]``
    :param ts_code_set: optional collection restricting the ``jq_code`` values that are imported
    :param is_debug: stop early once more than one data frame has been buffered
    :param is_monthly: only request the last trade date of every month in the range
    :return: None
    """
    if primary_keys is None:
        # built per call so that no list object is shared between invocations
        primary_keys = ["index_symbol", "trade_date", "jq_code"]
    info_table = 'jq_index_info'
    logger.info("更新 %s 开始", table_name)

    has_table = engine_md.has_table(table_name)
    # Pick the date-range query depending on whether the target table already exists
    if has_table:
        sql_str = f"""
            SELECT jq_code, date_from, if(date_to<end_date, date_to, end_date) date_to
            FROM
            (
                SELECT info.jq_code, ifnull(trade_date, start_date) date_from, end_date date_to,
                if(hour(now())<16, subdate(curdate(),1), curdate()) end_date
                FROM
                    {info_table} info
                LEFT OUTER JOIN
                    (SELECT index_symbol, adddate(max(trade_date),1) trade_date
                     FROM {table_name} GROUP BY index_symbol) daily
                ON info.jq_code = daily.index_symbol
            ) tt
            WHERE date_from <= if(date_to<end_date, date_to, end_date)
            ORDER BY jq_code"""
    else:
        sql_str = f"""
            SELECT jq_code, date_from, if(date_to<end_date, date_to, end_date) date_to
            FROM
            (
                SELECT info.jq_code, start_date date_from, end_date date_to,
                if(hour(now())<16, subdate(curdate(),1), curdate()) end_date
                FROM {info_table} info
            ) tt
            WHERE date_from <= if(date_to<end_date, date_to, end_date)
            ORDER BY jq_code"""
        # name the table that is really queried instead of an unrelated hard-coded one
        logger.warning('%s 不存在,仅使用 %s 表进行计算日期范围', table_name, info_table)

    sql_trade_date_str = """
        SELECT trade_date FROM jq_trade_date trddate
        WHERE trade_date <= if(hour(now())<16, subdate(curdate(),1), curdate())
        ORDER BY trade_date"""

    with with_db_session(engine_md) as session:
        table = session.execute(sql_trade_date_str)
        trade_date_list = [row[0] for row in table.fetchall()]
        trade_date_list.sort()
        # Date range that has to be fetched for every index: code -> (date_from, date_to)
        table = session.execute(sql_str)
        code_date_range_dic = {
            ts_code: (date_from, date_to)
            for ts_code, date_from, date_to in table.fetchall()
            if ts_code_set is None or ts_code in ts_code_set}

    data_df_list, data_count, all_data_count, data_len = [], 0, 0, len(code_date_range_dic)
    logger.info('%d records will been import into %s', data_len, table_name)
    try:
        for num, (index_symbol, (date_from_tmp, date_to_tmp)) in enumerate(
                code_date_range_dic.items(), start=1):
            # Snap the requested range onto real trade dates
            date_from_idx = get_first_idx(trade_date_list, lambda x: x >= date_from_tmp)
            date_to_idx = get_last_idx(trade_date_list, lambda x: x <= date_to_tmp)
            if date_from_idx is None or date_to_idx is None or date_from_idx > date_to_idx:
                logger.debug('%d/%d) %s [%s - %s] 跳过', num, data_len, index_symbol,
                             trade_date_list[date_from_idx] if date_from_idx is not None else None,
                             trade_date_list[date_to_idx] if date_to_idx is not None else None)
                continue

            date_sample = trade_date_list[date_from_idx: (date_to_idx + 1)]
            if is_monthly:
                # keep only the last trade date of every month
                date_sample = list(
                    pd.Series(date_sample, index=pd.DatetimeIndex(date_sample)
                              ).resample(rule='M', convention='end').last()
                )

            date_from, date_to = date_sample[0], date_sample[-1]
            trade_date_count = len(date_sample)
            logger.debug('%d/%d) 开始导入 %s [%s - %s] %d 个交易日的数据 %s',
                         num, data_len, index_symbol, date_from, date_to, trade_date_count,
                         '月度更新' if is_monthly else '')
            for trade_date in date_sample:
                data_df = invoke_api(index_symbol=index_symbol, trade_date=trade_date)
                # Buffer the data
                if data_df is not None and data_df.shape[0] > 0:
                    data_count += data_df.shape[0]
                    data_df_list.append(data_df)

                # Write the buffer once the threshold is reached
                if data_count >= 1000:
                    data_count = bunch_insert(data_df_list, table_name=table_name, dtype=dtype,
                                              primary_keys=primary_keys)
                    all_data_count += data_count
                    data_df_list, data_count = [], 0

                # Debug runs stop as soon as more than one frame is buffered
                if is_debug and len(data_df_list) > 1:
                    break

            if is_debug and len(data_df_list) > 1:
                break
    except Exception:
        # Log and fall through to ``finally`` so that buffered data is still stored;
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        logger.exception("%s 获取数据异常", table_name)
    finally:
        # Write whatever is left in the buffer
        if len(data_df_list) > 0:
            data_count = bunch_insert(data_df_list, table_name=table_name, dtype=dtype,
                                      primary_keys=primary_keys)
            all_data_count += data_count

        logger.info("更新 %s 结束 %d 条信息被更新", table_name, all_data_count)
def import_sectorconstituent(sector_code, sector_name, date_start, chain_param=None, exch_code='SZSE'):
    """
    Import the constituents of the sector ``sector_code`` into ``wind_sectorconstituent``.

    The import starts from the newest date already stored for the sector (as returned by
    ``get_latest_constituent_df``) or from ``date_start`` when nothing newer is stored, and
    runs up to the last trade date that is not later than yesterday. The data of the start
    date itself is stored first; the remaining dates are collected by
    ``recursion_get_sectorconstituent`` and appended afterwards.

    :param sector_code: code of the sector
    :param sector_name: name of the sector; passed to the fetch helpers and used in log output
    :param date_start: first date to import; converted with ``str_2_date``
    :param chain_param: unused here; lets celery pass the result of a preceding task
    :param exch_code: exchange whose trade calendar is used, "SZSE" by default
    :return: None
    :raises ValueError: when no trade dates are available for ``exch_code``
    """
    # Trade calendar of the exchange
    trade_date_list_sorted = get_trade_date_list_sorted(exch_code)
    if trade_date_list_sorted is None or len(trade_date_list_sorted) == 0:
        raise ValueError("没有交易日数据")

    date_start = str_2_date(date_start)
    date_constituent_df_dict = {}
    idx_constituent_set_dic = {}
    # Newest constituent list stored in the database; an empty result means a fresh import
    date_latest, constituent_df = get_latest_constituent_df(sector_code)
    date_latest = str_2_date(date_latest)
    if date_latest is None or date_latest < date_start:
        idx_start = get_last_idx(trade_date_list_sorted, lambda x: x <= date_start)
        sec_df, _ = get_sectorconstituent_2_dic(sector_code, sector_name, date_start, idx_start,
                                                trade_date_list_sorted,
                                                date_constituent_df_dict, idx_constituent_set_dic)
        # Store the constituents of the start date
        sec_df.to_sql("wind_sectorconstituent", engine_md, if_exists='append', index=False)
    else:
        # Continue from the newest stored date and seed the caches with its constituents
        date_start = date_latest
        idx_start = get_last_idx(trade_date_list_sorted, lambda x: x <= date_start)
        date_constituent_df_dict[date_latest] = constituent_df
        idx_constituent_set_dic[idx_start] = set(constituent_df['wind_code'])

    # End of the range: last trade date up to yesterday
    yesterday = date.today() - timedelta(days=1)
    idx_end = get_last_idx(trade_date_list_sorted, lambda x: x <= yesterday)
    if idx_start is None or idx_end is None:
        # get_last_idx found no matching trade date; comparing None with an int would raise
        logger.warning("%s %s: no trade date found for date_start=%s or yesterday=%s, nothing imported",
                       sector_code, sector_name, date_start, yesterday)
        return
    if idx_start >= idx_end:
        return

    left_or_right = 1
    recursion_get_sectorconstituent(idx_start, idx_end, trade_date_list_sorted,
                                    date_constituent_df_dict, idx_constituent_set_dic,
                                    left_or_right, sector_code, sector_name)
    # Drop the entry of date_start: that date is already in the database
    del date_constituent_df_dict[date_start]
    # Store the remaining dates
    for num, (date_cur, sec_df) in enumerate(date_constituent_df_dict.items(), start=1):
        sec_df.to_sql("wind_sectorconstituent", engine_md, if_exists='append', index=False)
        logger.info("%d) %s %d 条 %s 成分股数据导入数据库", num, date_cur, sec_df.shape[0], sector_name)
        # debugging only
        if DEBUG and num >= 20:
            break
def import_index_constituent(index_code, index_name, date_start, exch_code='SZSE', date_end=None,
                             method='loop'):
    """
    Import the constituents of the index ``index_code`` into ``wind_index_constituent``.

    The import starts from the newest date already stored for the index (as returned by
    ``get_latest_constituent_df``) or from ``date_start`` when nothing newer is stored, and
    runs up to the last trade date that is not later than yesterday (and not beyond
    ``date_end`` when given). The data of the start date itself is stored first; the
    remaining dates are collected with the selected ``method`` and stored afterwards.

    :param index_code: code of the index
    :param index_name: name of the index; passed to the fetch helpers and used in log output
    :param date_start: first date to import; converted with ``str_2_date``
    :param exch_code: exchange whose trade calendar is used, "SZSE" by default
    :param date_end: optional last date; None means up to the most recent trade date
    :param method: 'loop' fetches the dates one after another via ``loop_get_data`` (currently
        limited to the window ending at ``idx_start + 10``), 'recursion' uses
        ``recursion_dichotomy_get_data``
    :return: None
    :raises ValueError: when no trade dates are available or ``method`` is unknown
    """
    table_name = 'wind_index_constituent'
    param_list = [
        ('trade_date', Date),
        ('weight', DOUBLE),
        ('stock_name', String(80)),
        ('index_code', String(20)),
        ('index_name', String(80)),
    ]
    dtype = dict(param_list)
    # the constituent column is named wind_code (see constituent_df['wind_code'] below)
    dtype['wind_code'] = String(20)

    # Trade calendar of the exchange
    trade_date_list_sorted = get_trade_date_list_sorted(exch_code)
    if trade_date_list_sorted is None or len(trade_date_list_sorted) == 0:
        raise ValueError("没有交易日数据")

    date_start = str_2_date(date_start)
    if date_end is not None:
        # Cut the calendar after the first trade date on or after date_end
        date_end = str_2_date(date_end)
        idx_end = get_first_idx(trade_date_list_sorted, lambda x: x >= date_end)
        if idx_end is not None:
            trade_date_list_sorted = trade_date_list_sorted[:(idx_end + 1)]

    date_constituent_df_dict = OrderedDict()
    idx_constituent_set_dic = {}
    # Newest constituent list stored in the database; an empty result means a fresh import
    date_latest, constituent_df = get_latest_constituent_df(index_code)
    date_latest = str_2_date(date_latest)
    if date_latest is None or date_latest < date_start:
        idx_start = get_last_idx(trade_date_list_sorted, lambda x: x <= date_start)
        sec_df, _ = get_index_constituent_2_dic(index_code, index_name, date_start, idx_start,
                                                date_constituent_df_dict, idx_constituent_set_dic)
        if sec_df is None or sec_df.shape[0] == 0:
            return
        # Store the constituents of the start date
        bunch_insert_on_duplicate_update(sec_df, table_name, engine_md, dtype=dtype)
    else:
        # Continue from the newest stored date and seed the caches with its constituents
        date_start = date_latest
        idx_start = get_last_idx(trade_date_list_sorted, lambda x: x <= date_start)
        date_constituent_df_dict[date_latest] = constituent_df
        idx_constituent_set_dic[idx_start] = set(constituent_df['wind_code'])

    # End of the range: last trade date up to yesterday
    yesterday = date.today() - timedelta(days=1)
    idx_end = get_last_idx(trade_date_list_sorted, lambda x: x <= yesterday)
    if idx_start is None or idx_end is None:
        # get_last_idx found no matching trade date; comparing None with an int would raise
        logger.warning("%s %s: no trade date found for date_start=%s or yesterday=%s, nothing imported",
                       index_code, index_name, date_start, yesterday)
        return
    if idx_start >= idx_end:
        return

    if method == 'loop':
        try:
            # Debugging limit of idx_start + 10, capped so that it never points past the
            # last trade date determined above
            idx_end = min(idx_start + 10, idx_end)
            loop_get_data(idx_start + 1, idx_end, trade_date_list_sorted,
                          date_constituent_df_dict, idx_constituent_set_dic, index_code, index_name)
        except APIError:
            # keep whatever was collected before the API failed; it is stored below
            logger.exception(
                'loop_get_data (idx_start=%d, idx_end=%d, index_code=%s, index_name=%s)',
                idx_start, idx_end, index_code, index_name)
    elif method == 'recursion':
        left_or_right = 1
        recursion_dichotomy_get_data(idx_start, idx_end, trade_date_list_sorted,
                                     date_constituent_df_dict, idx_constituent_set_dic,
                                     left_or_right, index_code, index_name)
    else:
        raise ValueError('method = %s error' % method)

    # Drop the entry of date_start: that date is already in the database
    del date_constituent_df_dict[date_start]
    # Store the remaining dates
    for num, (date_cur, sec_df) in enumerate(date_constituent_df_dict.items(), start=1):
        bunch_insert_on_duplicate_update(sec_df, table_name, engine_md, dtype=dtype)
        logger.info("%d) %s %d 条 %s 成分股数据导入数据库", num, date_cur, sec_df.shape[0], index_name)