def get_sectorconstituent_2_dic(sector_code, sector_name, date_start, idx_start, trade_date_list_sorted,
                                date_constituent_df_dict, idx_constituent_set_dic):
    """
    Fetch sector constituents from wind and cache the result.

    Results are memoized in *date_constituent_df_dict* (keyed by date) and
    *idx_constituent_set_dic* (keyed by index position); a cached entry is
    returned directly on subsequent calls.

    :param sector_code: wind sector id
    :param sector_name: human-readable sector name (logging only)
    :param date_start: date whose constituents are requested
    :param idx_start: position of *date_start* in *trade_date_list_sorted*
    :param trade_date_list_sorted: sorted trade-date list (kept for interface compatibility; not read here)
    :param date_constituent_df_dict: cache: date -> constituent DataFrame
    :param idx_constituent_set_dic: cache: idx -> set of wind codes
    :return: (constituent DataFrame, set of wind codes)
    :raises ValueError: when wind returns no data for the requested date
    """
    already_cached = date_start in date_constituent_df_dict and idx_start in idx_constituent_set_dic
    if already_cached:
        logger.debug('%s %s %s 成分股 已经存在 直接返回', date_2_str(date_start), sector_name, sector_code)
        return date_constituent_df_dict[date_start], idx_constituent_set_dic[idx_start]

    sec_df = get_sectorconstituent(sector_code, sector_name, date_start)
    if sec_df is None or sec_df.shape[0] == 0:
        # missing constituents are fatal for the sector variant
        date_start_str = date_2_str(date_start)
        logger.warning('%s 无法获取到 %s %s 成分股', date_start_str, sector_name, sector_code)
        raise ValueError('%s 无法获取到 %s %s 成分股' % (date_start_str, sector_name, sector_code))

    date_constituent_df_dict[date_start] = sec_df
    constituent_set_left = set(sec_df['wind_code'])
    idx_constituent_set_dic[idx_start] = constituent_set_left
    return sec_df, constituent_set_left
def import_jq_stock_income(chain_param=None, ts_code_set=None):
    """
    Import stock income-statement records published since the latest
    ``pub_date`` already stored in ``TABLE_NAME`` (or since ``BASE_DATE`` when
    the table does not exist yet) up to today.

    :param chain_param: used only to pass results between chained celery tasks
    :param ts_code_set: reserved for filtering by ts_code; currently unused
    :return: None
    """
    logger.info("更新 %s 开始", TABLE_NAME)
    has_table = engine_md.has_table(TABLE_NAME)  # does the target table already exist?
    if has_table:
        # resume from the latest publication date already imported
        sql_str = f"""select max(pub_date) from {TABLE_NAME}"""
        date_start = execute_scalar(engine_md, sql_str)
        logger.info('查询 %s 数据使用起始日期 %s', TABLE_NAME, date_2_str(date_start))
    else:
        date_start = BASE_DATE
        logger.warning('%s 不存在,使用基础日期 %s', TABLE_NAME, date_2_str(date_start))
    if date_start is None:
        # BUG FIX: the table exists but is empty, so max(pub_date) is NULL and
        # the comparison below would raise TypeError -> fall back to BASE_DATE
        date_start = BASE_DATE
        logger.warning('%s 为空表,使用基础日期 %s', TABLE_NAME, date_2_str(date_start))
    # latest pub_date to query
    date_end = datetime.date.today()
    if date_start >= date_end:
        logger.info('%s 已经是最新数据,无需进一步获取', date_start)
        return
    data_count_tot = 0
    try:
        for num, (df, date_from, date_to) in enumerate(get_df_iter(date_start, date_end, LOOP_STEP)):
            # logger.debug('%d) [%s ~ %s] 包含 %d 条数据', num, date_from, date_to, df.shape[0])
            data_count = bunch_insert_on_duplicate_update(
                df, TABLE_NAME, engine_md, dtype=DTYPE,
                myisam_if_create_table=True, primary_keys=['id'], schema=config.DB_SCHEMA_MD)
            data_count_tot += data_count
    finally:
        # always report how many rows made it into the database
        # (switched from the root `logging` module to the module logger for consistency)
        logger.info("更新 %s 结束 %d 条信息被更新", TABLE_NAME, data_count_tot)
def get_index_constituent_2_dic(index_code, index_name, date_start, idx_start,
                                date_constituent_df_dict, idx_constituent_set_dic):
    """
    Fetch index constituents and weights through wind, caching the result in
    date_constituent_df_dict / idx_constituent_set_dic.

    :param index_code: wind index code
    :param index_name: index name (used for logging only)
    :param date_start: date whose constituents are requested
    :param idx_start: cache key for idx_constituent_set_dic (position index of date_start)
    :param date_constituent_df_dict: cache: date -> constituent DataFrame
    :param idx_constituent_set_dic: cache: idx -> set of (wind_code, weight) tuples
    :return: (constituent DataFrame, constituent set), or (None, None) when wind has no data
    """
    if date_start in date_constituent_df_dict and idx_start in idx_constituent_set_dic:
        # both caches already hold this date: return the cached objects directly
        date_start_str = date_2_str(date_start)
        logger.debug('%s %s %s 成分股 已经存在 直接返回', date_start_str, index_name, index_code)
        sec_df = date_constituent_df_dict[date_start]
        constituent_set = idx_constituent_set_dic[idx_start]
    else:
        sec_df = get_sectorconstituent(index_code, index_name, date_start)
        if sec_df is None or sec_df.shape[0] == 0:
            # unlike the sector variant, missing data is not fatal here:
            # log it and signal "no data" to the caller instead of raising
            date_start_str = date_2_str(date_start)
            logger.warning('%s 无法获取到 %s %s 成分股', date_start_str, index_name, index_code)
            # raise ValueError('%s 无法获取到 %s %s 成分股' % (date_start_str, index_name, index_code))
            return None, None
        date_constituent_df_dict[date_start] = sec_df
        # constituent_set = set(sec_df['wind_code'])
        # build a set of (wind_code, weight) pairs: after transposing, each
        # original row becomes a column, so tuple(val) yields one
        # (code, weight) tuple per constituent
        constituent_set = {
            tuple(val)
            for key, val in sec_df[['wind_code', 'weight']].T.items()
        }
        idx_constituent_set_dic[idx_start] = constituent_set
    return sec_df, constituent_set
def get_df_iter(date_start, date_end, step, df_len_limit=3000, deep=0):
    """
    Yield (DataFrame, date_from, date_to) batches of income-statement rows
    between *date_start* and *date_end*.

    When a window returns *df_len_limit* rows or more, the API answer was
    probably truncated, so the window is bisected (halving *step*) and queried
    again recursively until the step can no longer be split.

    :param date_start: overall range start (exclusive boundary of the first window)
    :param date_end: overall range end
    :param step: window width in days at this recursion level
    :param df_len_limit: row count treated as "probably truncated"
    :param deep: recursion depth, used only to indent log messages
    :return: generator of (df, date_from, date_to)
    """
    date_windows = iter_2_range(range_date(date_start, date_end, step),
                                has_left_outer=False, has_right_outer=False)
    for seg_num, (date_from, date_to) in enumerate(date_windows, start=1):
        stmt = query(finance.STK_INCOME_STATEMENT).filter(
            finance.STK_INCOME_STATEMENT.pub_date > date_2_str(date_from),
            finance.STK_INCOME_STATEMENT.pub_date <= date_2_str(date_to))
        batch_df = finance.run_query(stmt)
        row_count = batch_df.shape[0]
        if row_count < df_len_limit:
            logger.debug('%s%d) [%s ~ %s] 包含 %d 条数据',
                         ' ' * deep, seg_num, date_from, date_to, row_count)
            yield batch_df, date_from, date_to
        elif step >= 2:
            # NOTE(review): the recursive call falls back to the default
            # df_len_limit instead of forwarding the caller's value — matches
            # the original behavior; confirm whether that is intended
            logger.warning('%s%d) [%s ~ %s] 包含 %d 条数据,可能已经超越 %d 条提取上限,开始进一步分割日期',
                           ' ' * deep, seg_num, date_from, date_to, row_count, df_len_limit)
            yield from get_df_iter(date_from, date_to, step // 2, deep=deep + 1)
        else:
            # cannot split a 1-day window further: emit the (possibly truncated) batch
            logger.warning('%s%d) [%s ~ %s] 包含 %d 条数据,可能已经超越 %d 条提取上限且无法再次分割日期范围,手动需要补充提取剩余数据',
                           ' ' * deep, seg_num, date_from, date_to, row_count, df_len_limit)
            yield batch_df, date_from, date_to
def save(self):
    """
    Import rows published since the latest ``pub_date`` in ``self.table_name``
    (or since ``self.BASE_DATE`` when the table does not exist yet) up to
    today, inserting them batch by batch via ``self.get_df_iter``.
    """
    self.logger.info("更新 %s 开始", self.table_name)
    has_table = engine_md.has_table(self.table_name)  # does the target table already exist?
    if has_table:
        # resume from the latest publication date already imported
        sql_str = f"""select max(pub_date) from {self.table_name}"""
        date_start = execute_scalar(engine_md, sql_str)
        self.logger.info('查询 %s 数据使用起始日期 %s', self.table_name, date_2_str(date_start))
    else:
        date_start = self.BASE_DATE
        self.logger.warning('%s 不存在,使用基础日期 %s', self.table_name, date_2_str(date_start))
    if date_start is None:
        # BUG FIX: the table exists but is empty, so max(pub_date) is NULL and
        # the comparison below would raise TypeError -> fall back to the base date
        date_start = self.BASE_DATE
        self.logger.warning('%s 为空表,使用基础日期 %s', self.table_name, date_2_str(date_start))
    # latest pub_date to query
    date_end = datetime.date.today()
    if date_start >= date_end:
        self.logger.info('%s %s 已经是最新数据,无需进一步获取', self.table_name, date_start)
        return
    data_count_tot = 0
    try:
        for num, (df, date_from, date_to) in enumerate(self.get_df_iter(date_start, date_end, self.loop_step)):
            # logger.debug('%d) [%s ~ %s] 包含 %d 条数据', num, date_from, date_to, df.shape[0])
            if df is not None and df.shape[0] > 0:
                data_count = bunch_insert_on_duplicate_update(
                    df, self.table_name, engine_md, dtype=self.dtype,
                    myisam_if_create_table=True, primary_keys=['id'], schema=config.DB_SCHEMA_MD)
                data_count_tot += data_count
    finally:
        # always report how many rows made it into the database
        # (switched from the root `logging` module to self.logger for consistency)
        self.logger.info("更新 %s 结束 %d 条信息被更新", self.table_name, data_count_tot)
def get_wind_kv_per_year(wind_code, wind_indictor_str, date_from, date_to, params):
    """
    Download a wind wsd indicator year by year and concatenate the results.

    The requested range is split at natural-year boundaries because *params*
    contains a %(year)d placeholder that must match each sub-range, e.g.
    "year=%(year)d;westPeriod=180" -> "year=2018;westPeriod=180".

    :param wind_code: security code passed to invoker.wsd
    :param wind_indictor_str: wsd indicator name(s)
    :param date_from: range start (str or date)
    :param date_to: range end (str or date)
    :param params: wsd options template containing a %(year)d placeholder
    :return: concatenated DataFrame, or None when no sub-range returned data
    """
    date_from, date_to = str_2_date(date_from), str_2_date(date_to)
    # split [date_from, date_to] into per-calendar-year (start, end) pairs
    date_pair = []
    if date_from <= date_to:
        date_curr = date_from
        while True:
            date_new_year = str_2_date("%d-01-01" % (date_curr.year + 1))
            date_year_end = date_new_year - timedelta(days=1)
            if date_to < date_year_end:
                # remaining range ends inside the current year -> last pair
                date_pair.append((date_curr, date_to))
                break
            else:
                date_pair.append((date_curr, date_year_end))
                date_curr = date_new_year
    data_df_list = []
    for date_from_sub, date_to_sub in date_pair:
        # NOTE(review): the placeholder is filled with year + 1, not the
        # sub-range's own year — presumably the forecast year for consensus
        # indicators; confirm against the wind parameter documentation
        params_sub = params % {'year': (date_from_sub.year + 1)}
        try:
            data_df = invoker.wsd(wind_code, wind_indictor_str, date_from_sub, date_to_sub, params_sub)
        except APIError as exp:
            logger.exception("%s %s [%s ~ %s] %s 执行异常", wind_code, wind_indictor_str,
                             date_2_str(date_from_sub), date_2_str(date_to_sub), params_sub)
            # these wind error codes affect only the current sub-range, so
            # skip the year instead of aborting the whole download
            if exp.ret_dic.setdefault('error_code', 0) in (
                    -40520007,  # no data available
                    -40521009,  # data decode failure; check inputs such as month-end dates / short February
            ):
                continue
            else:
                raise exp
        if data_df is None:
            logger.warning('%s %s [%s ~ %s] has no data',
                           wind_code, wind_indictor_str, date_2_str(date_from_sub), date_2_str(date_to_sub))
            continue
        data_df.dropna(inplace=True)
        if data_df.shape[0] == 0:
            # logger.warning('%s %s [%s ~ %s] has 0 data',
            #                wind_code, wind_indictor_str, date_2_str(date_from_sub), date_2_str(date_to_sub))
            continue
        data_df_list.append(data_df)
    # merge the yearly frames; None when nothing was collected
    data_df_tot = pd.concat(data_df_list) if len(data_df_list) > 0 else None
    return data_df_tot
def get_sectorconstituent(index_code, index_name, target_date) -> pd.DataFrame:
    """
    Fetch index constituents (with weights) from wind for one date.

    :param index_code: wind index code
    :param index_name: index name, stored into the result and used in logs
    :param target_date: date whose constituents are requested
    :return: DataFrame with index_code / index_name / trade_date / stock_name /
             weight columns, or None when wind has no matching data
    """
    target_date_str = date_2_str(target_date)
    logger.info('获取 %s %s %s 板块信息', index_code, index_name, target_date)
    raw_df = invoker.wset(
        "indexconstituent",
        "date=%s;windcode=%s" % (target_date_str, index_code))
    if raw_df is None or raw_df.shape[0] == 0:
        return None
    # wind sometimes answers with rows dated differently from target_date;
    # keep only the rows that actually match
    sec_df = raw_df[raw_df['date'].apply(lambda d: str_2_date(d) == target_date)]
    if sec_df.shape[0] == 0:
        return None
    sec_df["index_code"] = index_code
    sec_df["index_name"] = index_name
    sec_df.rename(columns={
        'date': 'trade_date',
        'sec_name': 'stock_name',
        'i_weight': 'weight',
    }, inplace=True)
    return sec_df
def default(self, obj):
    """
    JSON-serialize SQLAlchemy ORM instances and date/time values.

    For SQLAlchemy declarative instances every public attribute is collected
    into a dict; values json cannot encode natively are converted to ISO
    strings (datetime/date/timedelta) or replaced with None.
    """
    if isinstance(obj.__class__, DeclarativeMeta):
        # an SQLAlchemy class
        fields = {}
        for field in [x for x in dir(obj) if not x.startswith('_') and x != 'metadata']:
            data = obj.__getattribute__(field)
            try:
                json.dumps(data)  # this will fail on non-encodable values, like other classes
                fields[field] = data
            except TypeError:
                # datetime must be tested before date: datetime is a date subclass
                if isinstance(data, datetime):
                    fields[field] = data.isoformat()
                elif isinstance(data, date):
                    fields[field] = data.isoformat()
                elif isinstance(data, timedelta):
                    # render the duration as the time-of-day it spans from midnight
                    fields[field] = (datetime.min + data).time().isoformat()
                else:
                    fields[field] = None
        # a json-encodable dict
        return fields
    elif isinstance(obj, date):
        # BUG FIX: returning json.dumps(...) here double-encoded the value
        # (extra quotes in the final document); return the plain string and
        # let the encoder serialize it. Also removed a leftover debug print().
        return date_2_str(obj)
    return json.JSONEncoder.default(self, obj)
def import_coin_info():
    """
    Fetch basic information of globally traded coins into tushare_coin_info;
    on first creation the table is converted to MyISAM and given a
    (coin, en_name) primary key.
    """
    table_name = 'tushare_coin_info'
    table_existed = engine_md.has_table(table_name)
    # column types for the bulk-insert helper
    dtype = {
        'coin': String(60),
        'en_name': String(60),
        'cn_name': String(60),
        'issue_date': Date,
        'amount': DOUBLE,
    }
    end_date_str = date_2_str(date.today(), DATE_FORMAT_STR)
    coin_df = pro.coinlist(start_date='20170101', end_date=end_date_str)
    data_count = bunch_insert_on_duplicate_update(coin_df, table_name, engine_md, dtype)
    logging.info("更新 %s 完成 新增数据 %d 条", table_name, data_count)
    if table_existed or not engine_md.has_table(table_name):
        return
    # the table was just created: switch the storage engine and add the primary key
    alter_table_2_myisam(engine_md, [table_name])
    create_pk_str = """ALTER TABLE {table_name}
        CHANGE COLUMN `coin` `coin` VARCHAR(60) NOT NULL FIRST,
        CHANGE COLUMN `en_name` `en_name` VARCHAR(60) NOT NULL AFTER `coin`,
        ADD PRIMARY KEY (`coin`, `en_name`)""".format(table_name=table_name)
    with with_db_session(engine_md) as session:
        session.execute(create_pk_str)
def stat_fund(date_from, date_to): sql_str = """ SELECT (@rowNum:=@rowNum+1) AS rowNo, t.* FROM ( SELECT date_from.ts_code, basic.name, basic.management, date_from.date_from, nav_to.end_date, nav_from.accum_nav nav_from, nav_to.accum_nav nav_to, nav_to.accum_nav/nav_from.accum_nav pct_chg FROM ( SELECT ts_code, max(end_date) date_from FROM tushare_fund_nav WHERE end_date<= :date_from GROUP BY ts_code ) date_from JOIN ( SELECT ts_code, max(end_date) date_to FROM tushare_fund_nav WHERE end_date<= :date_to GROUP BY ts_code ) date_to ON date_from.ts_code = date_to.ts_code JOIN tushare_fund_nav nav_from ON date_from.ts_code = nav_from.ts_code AND date_from.date_from = nav_from.end_date JOIN tushare_fund_nav nav_to ON date_to.ts_code = nav_to.ts_code AND date_to.date_to = nav_to.end_date JOIN tushare_fund_basic basic ON date_from.ts_code = basic.ts_code WHERE basic.name NOT LIKE '%B%' and basic.name NOT LIKE '%A%' and basic.name NOT LIKE '%C%' HAVING nav_to.accum_nav IS NOT NULL AND nav_from.accum_nav IS NOT NULL and pct_chg != 1 and pct_chg < 2 ORDER BY nav_to.accum_nav/nav_from.accum_nav ) t""" # data_df = pd.read_sql(sql_str, engine_md) with with_db_session(engine_md) as session: session.execute("Select (@rowNum :=0) ;") table = session.execute(sql_str, params={ 'date_from': date_2_str(date_from), 'date_to': date_2_str(date_to) }) data = [[d for d in row] for row in table.fetchall()] data_df = pd.DataFrame(data, columns=[ 'rowNo', 'ts_code', 'name', 'management', 'date_from', 'date_to', 'nav_from', 'nav_to', 'pct_chg' ]) return data_df.describe()['pct_chg']
def get_sectorconstituent(sector_code, sector_name, target_date) -> pd.DataFrame:
    """
    Fetch sector constituents from wind for one date.

    :param sector_code: wind sector id
    :param sector_name: sector name, stored into the result and used in logs
    :param target_date: date whose constituents are requested
    :return: DataFrame with sector_code / sector_name / trade_date /
             stock_name columns, or None when wind returns no data
    """
    target_date_str = date_2_str(target_date)
    logger.info('获取 %s %s %s 板块信息', sector_code, sector_name, target_date)
    sec_df = invoker.wset("sectorconstituent",
                          "date=%s;sectorid=%s" % (target_date_str, sector_code))
    if sec_df is None or sec_df.shape[0] == 0:
        # BUG FIX: wind can return None/empty, which previously crashed on the
        # column assignments below; signal "no data" instead — callers
        # (e.g. get_sectorconstituent_2_dic) already test for None/empty
        return None
    sec_df["sector_code"] = sector_code
    sec_df["sector_name"] = sector_name
    sec_df.rename(columns={
        'date': 'trade_date',
        'sec_name': 'stock_name',
    }, inplace=True)
    return sec_df
def invoke_index_weight(index_code, trade_date):
    """
    Query index constituent weights from tushare for a single trade date.

    :param index_code: tushare index code
    :param trade_date: trade date (converted with STR_FORMAT_DATE_TS)
    :return: whatever pro.index_weight returns (a DataFrame of weights)
    """
    trade_date_str = date_2_str(trade_date, STR_FORMAT_DATE_TS)
    return pro.index_weight(index_code=index_code, trade_date=trade_date_str)
def update_private_fund_nav(chain_param=None, get_df=False, wind_code_list=None):
    """
    Download private-fund NAV series from wind and append them to wind_fund_nav.

    The missing date range of every fund is computed in SQL from fund_info
    (setup/maturity dates) and the latest trade_date already present in
    wind_fund_nav; nav, NAV_acc and NAV_date are then requested from wind and
    written through fund_nav_df_2_sql.

    :param chain_param: used only to pass results between chained celery tasks
    :param get_df: when True, also collect the downloaded frames and return them
    :param wind_code_list: restrict the update to these codes (None = all funds)
    :return: list of downloaded DataFrames (non-empty only when get_df is True)
    """
    table_name = 'wind_fund_nav'
    has_table = engine_md.has_table(table_name)
    if has_table:
        # per fund: resume one day after the latest stored trade_date, bounded
        # by fund_setupdate / fund_maturitydate and capped at yesterday
        fund_info_df = pd.read_sql_query(
            """SELECT DISTINCT fi.wind_code AS wind_code,
            IFNULL(trade_date_from,
                if(trade_date_latest BETWEEN '1900-01-01' AND ADDDATE(CURDATE(), -1),
                    ADDDATE(trade_date_latest,1), fund_setupdate)) date_from,
            if(fund_maturitydate BETWEEN '1900-01-01' AND ADDDATE(CURDATE(), -1),
                fund_maturitydate, ADDDATE(CURDATE(), -1)) date_to
            FROM fund_info fi
            LEFT JOIN
            (
                SELECT wind_code, ADDDATE(max(trade_date),1) trade_date_from
                FROM wind_fund_nav GROUP BY wind_code
            ) wfn
            ON fi.wind_code = wfn.wind_code""", engine_md)
    else:
        logger.warning('wind_fund_nav 不存在,仅使用 fund_info 表进行计算日期范围')
        fund_info_df = pd.read_sql_query(
            """SELECT DISTINCT fi.wind_code AS wind_code, fund_setupdate date_from,
            if(fund_maturitydate BETWEEN '1900-01-01' AND ADDDATE(CURDATE(), -1),
                fund_maturitydate, ADDDATE(CURDATE(), -1)) date_to
            FROM fund_info fi ORDER BY wind_code""", engine_md)
    wind_code_date_frm_to_dic = {
        wind_code: (str_2_date(date_from), str_2_date(date_to))
        for wind_code, date_from, date_to in zip(
            fund_info_df['wind_code'], fund_info_df['date_from'], fund_info_df['date_to'])}
    fund_info_df.set_index('wind_code', inplace=True)
    if wind_code_list is None:
        wind_code_list = list(fund_info_df.index)
    else:
        # only update codes that actually exist in fund_info
        wind_code_list = list(set(wind_code_list) & set(fund_info_df.index))
    fund_nav_all_df = []
    no_data_count = 0
    code_count = len(wind_code_list)
    wind_code_trade_date_latest_dic = {}
    date_gap = timedelta(days=10)
    try:
        for num, wind_code in enumerate(wind_code_list):
            date_begin, date_end = wind_code_date_frm_to_dic[wind_code]
            if date_begin > date_end:
                continue
            # validate both boundaries; skip funds with unusable dates
            if isinstance(date_begin, date):
                if date_begin.year < 1900:
                    continue
                if date_begin > date_end:
                    continue
                date_begin_str = date_begin.strftime('%Y-%m-%d')
            else:
                logger.error("%s date_begin:%s", wind_code, date_begin)
                continue
            if isinstance(date_end, date):
                if date_begin.year < 1900:
                    continue
                if date_begin > date_end:
                    continue
                date_end_str = date_end.strftime('%Y-%m-%d')
            else:
                logger.error("%s date_end:%s", wind_code, date_end)
                continue
            # request NAV data from wind, retrying once; the for-else sets
            # fund_nav_tmp_df to None when both attempts failed
            for k in range(2):
                try:
                    fund_nav_tmp_df = invoker.wsd(
                        codes=wind_code, fields='nav,NAV_acc,NAV_date',
                        beginTime=date_2_str(date_begin_str), endTime=date_2_str(date_end_str),
                        options='Fill=Previous')
                    trade_date_latest = datetime.strptime(date_end_str, '%Y-%m-%d').date() - date_gap
                    wind_code_trade_date_latest_dic[wind_code] = trade_date_latest
                    break
                except APIError as exp:
                    if exp.ret_dic.setdefault('error_code', 0) == -40520007:
                        # "no data" from wind still counts as progress for this fund
                        trade_date_latest = datetime.strptime(date_end_str, '%Y-%m-%d').date() - date_gap
                        wind_code_trade_date_latest_dic[wind_code] = trade_date_latest
                    logger.error("%s Failed, ErrorMsg: %s" % (wind_code, str(exp)))
                    continue
                except Exception as exp:
                    logger.error("%s Failed, ErrorMsg: %s" % (wind_code, str(exp)))
                    continue
            else:
                fund_nav_tmp_df = None
            if fund_nav_tmp_df is None:
                logger.info('%s No data', wind_code)
                no_data_count += 1
                logger.warning('%d funds no data', no_data_count)
            else:
                fund_nav_tmp_df.dropna(how='all', inplace=True)
                df_len = fund_nav_tmp_df.shape[0]
                if df_len == 0:
                    continue
                fund_nav_tmp_df['wind_code'] = wind_code
                trade_date_latest = fund_nav_df_2_sql(table_name, fund_nav_tmp_df, engine_md, is_append=True)
                if trade_date_latest is None:
                    # BUG FIX: the original format string had an extra %d
                    # placeholder without a matching argument
                    logger.error('%s data insert failed', wind_code)
                else:
                    wind_code_trade_date_latest_dic[wind_code] = trade_date_latest
                    logger.info('%d) %s updated, %d funds left', num, wind_code, code_count - num)
            if get_df and fund_nav_tmp_df is not None:
                # BUG FIX: list.append returns None, so the original
                # "fund_nav_all_df = fund_nav_all_df.append(...)" made the
                # accumulator None and crashed on the next iteration;
                # accumulate the frames in place instead
                fund_nav_all_df.append(fund_nav_tmp_df)
            if DEBUG and num > 4:
                # debugging aid: stop after a handful of funds
                break
    finally:
        # NOTE(review): this logs unconditionally on every run; it looks like a
        # leftover from a commented-out try/except around update_fund_mgrcomp_info
        logger.exception('新功能上线前由于数据库表不存在,可能导致更新失败,属于正常现象')
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
    return fund_nav_all_df
def merge_tushare_stock_daily(ths_code_set: set = None, date_from=None):
    """
    Merge A-share daily quotes with financial-report data into the day-level
    table tushare_stock_daily.

    Every daily row is matched with the most recent financial report whose
    announcement date (f_ann_date) is at or before the trade date.

    :param ths_code_set: restrict the merge to these ts_codes (None = all)
    :param date_from: first trade date to merge; defaults to the day after the
        latest `time` already stored in the target table
    """
    table_name = 'tushare_stock_daily'
    logger.info("合成 %s 开始", table_name)
    has_table = engine_md.has_table(table_name)
    if date_from is None and has_table:
        # resume one day after the latest stored row
        sql_str = "select adddate(max(`time`),1) from {table_name}".format(
            table_name=table_name)
        with with_db_session(engine_md) as session:
            date_from = date_2_str(session.execute(sql_str).scalar())
    # day-level quote data
    # TODO: 增加 ths_code_set 参数
    daily_df, dtype_daily = get_tushare_daily_merged_df(ths_code_set, date_from)
    daily_df_g = daily_df.groupby('ts_code')
    ths_code_set_4_daily = set(daily_df_g.size().index)
    # merged financial data
    ifind_fin_df, dtype_fin = get_tushre_merge_stock_fin_df()
    # combined column-type mapping for the insert helper
    dtype = dtype_daily.copy()
    dtype.update(dtype_fin)
    logger.debug("提取财务数据完成")
    # index the first financial row of every (ts_code, announcement date) pair
    report_date_dic_dic = {}
    for num, ((ths_code, report_date), data_df) in enumerate(
            ifind_fin_df.groupby(['ts_code', 'f_ann_date']), start=1):
        if ths_code_set is not None and ths_code not in ths_code_set:
            continue
        if is_nan_or_none(report_date):
            continue
        report_date_dic = report_date_dic_dic.setdefault(ths_code, {})
        # BUG FIX: the original tested membership on the outer dict
        # (report_date_dic_dic) instead of the per-code dict
        if report_date not in report_date_dic:
            if data_df.shape[0] > 0:
                report_date_dic[report_date] = data_df.iloc[0]
    logger.debug("计算财报日期完成")
    # stitch financial values onto the daily rows, report period by period
    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(report_date_dic_dic)
    try:
        for num, (ths_code, report_date_dic) in enumerate(report_date_dic_dic.items(), start=1):
            # TODO: 檢查判斷 ths_code 是否存在在ifind_fin_df_g 裏面,size暫時使用 以後在驚醒改進
            if ths_code not in ths_code_set_4_daily:
                logger.error('fin 表中不存在 %s 的財務數據', ths_code)
                continue
            daily_df_cur_ts_code = daily_df_g.get_group(ths_code)
            logger.debug('%d/%d) 处理 %s %d 条数据', num, for_count, ths_code,
                         daily_df_cur_ts_code.shape[0])
            report_date_list = list(report_date_dic.keys())
            report_date_list.sort()
            report_date_list_len = len(report_date_list)
            for num_sub, (report_date_from, report_date_to) in enumerate(
                    generate_range(report_date_list)):
                logger.debug('%d/%d) %d/%d) 处理 %s [%s - %s]',
                             num, for_count, num_sub, report_date_list_len, ths_code,
                             date_2_str(report_date_from), date_2_str(report_date_to))
                # trade-date window governed by this report
                if report_date_from is None:
                    is_fit = daily_df_cur_ts_code['trade_date'] < report_date_to
                elif report_date_to is None:
                    is_fit = daily_df_cur_ts_code['trade_date'] >= report_date_from
                else:
                    is_fit = (daily_df_cur_ts_code['trade_date'] < report_date_to) & (
                            daily_df_cur_ts_code['trade_date'] >= report_date_from)
                # daily rows inside the window
                ifind_his_ds_df_segment = daily_df_cur_ts_code[is_fit].copy()
                segment_count = ifind_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                fin_s = report_date_dic[report_date_from] if report_date_from is not None else None
                for key in dtype_fin.keys():
                    if key in ('ts_code', 'trade_date'):
                        continue
                    ifind_his_ds_df_segment[key] = fin_s[key] if fin_s is not None and key in fin_s else None
                ifind_his_ds_df_segment['report_date'] = report_date_from
                data_df_list.append(ifind_his_ds_df_segment)
                data_count += segment_count
                if DEBUG and len(data_df_list) > 1:
                    break
                # flush to the database in batches
                if data_count > 10000:
                    data_df = pd.concat(data_df_list)
                    data_count = bunch_insert_on_duplicate_update(
                        data_df, table_name, engine_md, dtype)
                    tot_data_count += data_count
                    data_count, data_df_list = 0, []
    finally:
        # flush whatever is left, then set up the table on first creation
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(
                data_df, table_name, engine_md, dtype)
            tot_data_count += data_count
        logger.info('%s 新增或更新记录 %d 条', table_name, tot_data_count)
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
def import_index_daily_ds(chain_param=None, ths_code_set: set = None, begin_time=None):
    """
    Save index history through the THS_DateSerial interface into
    ifind_index_daily_ds; this supplements the History data with fields such
    as the adjustment factor (af), limit-up/down flags and suspension
    status/reason.

    :param chain_param: only used to pass results between chained celery tasks
    :param ths_code_set: restrict the import to these codes (None = all)
    :param begin_time: earliest date to fetch; merged with each code's range
    :return:
    """
    table_name = 'ifind_index_daily_ds'
    has_table = engine_md.has_table(table_name)
    # build the indicator and parameter strings for the date-serial call
    json_indicator, json_param = unzip_join(
        [(key, val) for key, val, _ in INDICATOR_PARAM_LIST_INDEX_DAILY_DS], sep=';')
    if has_table:
        # per code: resume one day after the latest stored `time`, else the
        # index base date; end yesterday before 16:00, today afterwards
        sql_str = """SELECT ths_code, date_frm, if(NULL<end_date, NULL, end_date) date_to
            FROM
            (
                SELECT info.ths_code, ifnull(trade_date_max_1, ths_index_base_period_index) date_frm, NULL,
                if(hour(now())<16, subdate(curdate(),1), curdate()) end_date
                FROM ifind_index_info info
                LEFT OUTER JOIN
                    (SELECT ths_code, adddate(max(time),1) trade_date_max_1 FROM {table_name} GROUP BY ths_code) daily
                ON info.ths_code = daily.ths_code
            ) tt
            WHERE date_frm <= if(NULL<end_date, NULL, end_date)
            ORDER BY ths_code""".format(table_name=table_name)
    else:
        sql_str = """SELECT ths_code, date_frm, if(NULL<end_date, NULL, end_date) date_to
            FROM
            (
                SELECT info.ths_code, ths_index_base_period_index date_frm, NULL,
                if(hour(now())<16, subdate(curdate(),1), curdate()) end_date
                FROM ifind_index_info info
            ) tt
            WHERE date_frm <= if(NULL<end_date, NULL, end_date)
            ORDER BY ths_code;"""
        logger.warning('%s 不存在,仅使用 ifind_index_info 表进行计算日期范围' % table_name)
    with with_db_session(engine_md) as session:
        # date range to fetch for every index code
        table = session.execute(sql_str)
        code_date_range_dic = {
            ths_code: (date_from if begin_time is None else min([date_from, begin_time]), date_to)
            for ths_code, date_from, date_to in table.fetchall() if
            ths_code_set is None or ths_code in ths_code_set}
    if TRIAL:
        # trial accounts may only fetch the most recent 5 years of data
        date_from_min = date.today() - timedelta(days=(365 * 5))
        code_date_range_dic = {
            ths_code: (max([date_from, date_from_min]), date_to)
            for ths_code, (date_from, date_to) in code_date_range_dic.items() if
            date_to is not None and date_from_min <= date_to}
    data_df_list, data_count, tot_data_count, code_count = [], 0, 0, len(
        code_date_range_dic)
    try:
        for num, (ths_code, (begin_time, end_time)) in enumerate(code_date_range_dic.items(), start=1):
            logger.debug('%d/%d) %s [%s - %s]', num, code_count, ths_code, begin_time, end_time)
            end_time = date_2_str(end_time)
            data_df = invoker.THS_DateSerial(
                ths_code, json_indicator, json_param,
                'Days:Tradedays,Fill:Previous,Interval:D',
                begin_time, end_time)
            if data_df is not None and data_df.shape[0] > 0:
                data_count += data_df.shape[0]
                data_df_list.append(data_df)
            # flush to the database once the buffer grows past the threshold
            if data_count >= 10000:
                data_df_all = pd.concat(data_df_list)
                # data_df_all.to_sql(table_name, engine_md, if_exists='append', index=False, dtype=dtype)
                data_count = bunch_insert_on_duplicate_update(
                    data_df_all, table_name, engine_md, DTYPE_INDEX_DAILY_DS)
                tot_data_count += data_count
                data_df_list, data_count = [], 0
            # debugging aid only
            if DEBUG and len(data_df_list) > 1:
                break
    finally:
        # flush whatever is left, then set up the table on first creation
        if data_count > 0:
            data_df_all = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(
                data_df_all, table_name, engine_md, DTYPE_INDEX_DAILY_DS)
            tot_data_count += data_count
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
        logging.info("更新 %s 完成 新增数据 %d 条", table_name, tot_data_count)
def import_jq_stock_income(chain_param=None, ts_code_set=None):
    """
    Import stock income-statement records published since the latest
    ``pub_date`` stored in jq_stock_income (or since ``BASE_DATE`` when the
    table does not exist yet) up to today.

    :param chain_param: used only to pass results between chained celery tasks
    :param ts_code_set: reserved for filtering by ts_code; currently unused
    :return: None
    """
    # SQLAlchemy column types used by the bulk-insert helper
    dtype = {
        "id": Integer,
        "company_id": Integer,
        "company_name": String(100),
        "code": String(12),
        "a_code": String(12),
        "b_code": String(12),
        "h_code": String(12),
        "pub_date": Date,
        "start_date": Date,
        "end_date": Date,
        "report_date": Date,
        "report_type": Integer,
        "source_id": Integer,
        "source": String(60),
        "total_operating_revenue": DOUBLE,
        "operating_revenue": DOUBLE,
        "total_operating_cost": DOUBLE,
        "operating_cost": DOUBLE,
        "operating_tax_surcharges": DOUBLE,
        "sale_expense": DOUBLE,
        "administration_expense": DOUBLE,
        "exploration_expense": DOUBLE,
        "financial_expense": DOUBLE,
        "asset_impairment_loss": DOUBLE,
        "fair_value_variable_income": DOUBLE,
        "investment_income": DOUBLE,
        "invest_income_associates": DOUBLE,
        "exchange_income": DOUBLE,
        "other_items_influenced_income": DOUBLE,
        "operating_profit": DOUBLE,
        "subsidy_income": DOUBLE,
        "non_operating_revenue": DOUBLE,
        "non_operating_expense": DOUBLE,
        "disposal_loss_non_current_liability": DOUBLE,
        "other_items_influenced_profit": DOUBLE,
        "total_profit": DOUBLE,
        "income_tax": DOUBLE,
        "other_items_influenced_net_profit": DOUBLE,
        "net_profit": DOUBLE,
        "np_parent_company_owners": DOUBLE,
        "minority_profit": DOUBLE,
        "eps": DOUBLE,
        "basic_eps": DOUBLE,
        "diluted_eps": DOUBLE,
        "other_composite_income": DOUBLE,
        "total_composite_income": DOUBLE,
        "ci_parent_company_owners": DOUBLE,
        "ci_minority_owners": DOUBLE,
        "interest_income": DOUBLE,
        "premiums_earned": DOUBLE,
        "commission_income": DOUBLE,
        "interest_expense": DOUBLE,
        "commission_expense": DOUBLE,
        "refunded_premiums": DOUBLE,
        "net_pay_insurance_claims": DOUBLE,
        "withdraw_insurance_contract_reserve": DOUBLE,
        "policy_dividend_payout": DOUBLE,
        "reinsurance_cost": DOUBLE,
        "non_current_asset_disposed": DOUBLE,
        "other_earnings": DOUBLE,
    }
    table_name = 'jq_stock_income'
    logger.info("更新 %s 开始", table_name)
    has_table = engine_md.has_table(table_name)  # does the target table already exist?
    if has_table:
        # resume from the latest publication date already imported
        sql_str = f"""select max(pub_date) from {table_name}"""
        date_start = execute_scalar(engine_md, sql_str)
        logger.info('查询 %s 数据使用起始日期 %s', table_name, date_2_str(date_start))
    else:
        date_start = BASE_DATE
        logger.warning('%s 不存在,使用基础日期 %s', table_name, date_2_str(date_start))
    if date_start is None:
        # BUG FIX: the table exists but is empty, so max(pub_date) is NULL and
        # the comparison below would raise TypeError -> fall back to BASE_DATE
        date_start = BASE_DATE
        logger.warning('%s 为空表,使用基础日期 %s', table_name, date_2_str(date_start))
    # latest pub_date to query
    date_end = date.today()
    if date_start >= date_end:
        # BUG FIX: the original call had a %s placeholder but no argument
        logger.info('%s 已经是最新数据,无需进一步获取', date_start)
        return
    data_count_tot = 0
    try:
        for num, (df, date_from, date_to) in enumerate(
                get_df_iter(date_start, date_end, LOOP_STEP)):
            # logger.debug('%d) [%s ~ %s] 包含 %d 条数据', num, date_from, date_to, df.shape[0])
            data_count = bunch_insert_on_duplicate_update(
                df, table_name, engine_md, dtype=dtype,
                myisam_if_create_table=True, primary_keys=['id'], schema=config.DB_SCHEMA_MD)
            data_count_tot += data_count
    finally:
        # always report how many rows made it into the database
        # (switched from the root `logging` module to the module logger for consistency)
        logger.info("更新 %s 结束 %d 条信息被更新", table_name, data_count_tot)
def import_index_daily(chain_param=None, wind_code_set=None):
    """
    Import wind index daily quotes (OHLC, volume, turnover, ...) into
    wind_index_daily.

    :param chain_param: used only to pass results between chained celery tasks
    :param wind_code_set: restrict the import to these codes (None = all).
        BUG FIX: this name was referenced in the filtering comprehension below
        but missing from the original signature, raising NameError at runtime;
        it is now a backward-compatible keyword parameter.
    :return: None
    """
    table_name = "wind_index_daily"
    has_table = engine_md.has_table(table_name)
    col_name_param_list = [
        ('open', DOUBLE),
        ('high', DOUBLE),
        ('low', DOUBLE),
        ('close', DOUBLE),
        ('volume', DOUBLE),
        ('amt', DOUBLE),
        ('turn', DOUBLE),
        ('free_turn', DOUBLE),
    ]
    wind_indictor_str = ",".join([key for key, _ in col_name_param_list])
    # wind answers with upper-case column names; map them back to lower case
    rename_col_dic = {key.upper(): key.lower() for key, _ in col_name_param_list}
    dtype = {key: val for key, val in col_name_param_list}
    dtype['wind_code'] = String(20)
    # TODO: 'trade_date' 声明为 Date 类型后,插入数据库会报错,目前原因不详,日后再解决
    if has_table:
        # per index: resume one day after the latest stored trade_date, else
        # start from the index base date; end yesterday before 16:00, today afterwards
        sql_str = """SELECT wind_code, date_frm, if(null<end_date, null, end_date) date_to
            FROM
            (
                SELECT info.wind_code, ifnull(trade_date, basedate) date_frm, null,
                if(hour(now())<16, subdate(curdate(),1), curdate()) end_date
                FROM wind_index_info info
                LEFT OUTER JOIN
                    (SELECT wind_code, adddate(max(trade_date),1) trade_date FROM {table_name} GROUP BY wind_code) daily
                ON info.wind_code = daily.wind_code
            ) tt
            WHERE date_frm <= if(null<end_date, null, end_date)
            ORDER BY wind_code""".format(table_name=table_name)
    else:
        logger.warning('%s 不存在,仅使用 wind_index_info 表进行计算日期范围', table_name)
        sql_str = """SELECT wind_code, date_frm, if(null<end_date, null, end_date) date_to
            FROM
            (
                SELECT info.wind_code, basedate date_frm, null,
                if(hour(now())<16, subdate(curdate(),1), curdate()) end_date
                FROM wind_index_info info
            ) tt
            WHERE date_frm <= if(null<end_date, null, end_date)
            ORDER BY wind_code;"""
    with with_db_session(engine_md) as session:
        # date range to fetch for every index
        table = session.execute(sql_str)
        begin_time = None
        wind_code_date_from_dic = {
            wind_code: (date_from if begin_time is None else min([date_from, begin_time]), date_to)
            for wind_code, date_from, date_to in table.fetchall()
            if wind_code_set is None or wind_code in wind_code_set}
    data_len = len(wind_code_date_from_dic)
    logger.info('%d indexes will been import', data_len)
    for data_num, (wind_code, (date_from, date_to)) in enumerate(wind_code_date_from_dic.items()):
        if str_2_date(date_from) > date_to:
            logger.warning("%d/%d) %s %s - %s 跳过", data_num, data_len, wind_code, date_from, date_to)
            continue
        try:
            temp = invoker.wsd(wind_code, wind_indictor_str, date_from, date_to)
        except APIError as exp:
            logger.exception("%d/%d) %s 执行异常", data_num, data_len, wind_code)
            # recoverable wind errors: skip this index, anything else aborts the loop
            if exp.ret_dic.setdefault('error_code', 0) in (
                    -40520007,  # no data available
                    -40521009,  # data decode failure; check inputs such as month-end dates / short February
            ):
                continue
            else:
                break
        temp.reset_index(inplace=True)
        temp.rename(columns={'index': 'trade_date'}, inplace=True)
        temp.rename(columns=rename_col_dic, inplace=True)
        temp.trade_date = temp.trade_date.apply(str_2_date)
        temp['wind_code'] = wind_code
        bunch_insert_on_duplicate_update(temp, table_name, engine_md, dtype=dtype)
        logger.info('更新指数 %s 至 %s 成功', wind_code, date_2_str(date_to))
    if not has_table and engine_md.has_table(table_name):
        # the table was just created: switch the storage engine and add the key
        alter_table_2_myisam(engine_md, [table_name])
        build_primary_key([table_name])
def import_coin_daily(chain_param=None, id_set=None, begin_time=None):
    """Import daily OHLCV history for coins into ``cmc_coin_v1_daily``.

    Uses the CoinMarketCap v1 scraper (``CmcScraperV1``); per the original
    author's note this v1 interface may expire at the end of December 2018.
    Rows are accumulated in memory and bulk-upserted in batches of ~10000.

    :param chain_param: unused here; kept for task-chain call compatibility
    :param id_set: optional set of coin ids; when given, only those coins are imported
    :param begin_time: optional earliest date; each coin's start date is lowered to it
    :return: None
    """
    table_name = "cmc_coin_v1_daily"
    info_table_name = "cmc_coin_v1_info"
    logging.info("更新 %s 开始", table_name)
    # Whether the target table already exists decides which date-range SQL to use.
    has_table = engine_md.has_table(table_name)
    if has_table:
        # Resume each coin from the day after its last stored date
        # (falling back to 2013-04-28 when it has no rows yet).
        sql_str = """SELECT id, symbol, date_frm, if(delist_date<end_date, delist_date, end_date) date_to
            FROM
            (
                SELECT info.id, symbol, ifnull(trade_date,date('2013-04-28')) date_frm, null delist_date,
                if(hour(now())<16, subdate(curdate(),1), curdate()) end_date
                FROM {info_table_name} info
                LEFT OUTER JOIN
                    (SELECT id, adddate(max(date),1) trade_date FROM {table_name} GROUP BY id) daily
                ON info.id = daily.id
            ) tt
            WHERE date_frm <= if(delist_date<end_date, delist_date, end_date)
            ORDER BY id""".format(table_name=table_name, info_table_name=info_table_name)
    else:
        logger.warning('%s 不存在,仅使用 %s 表进行计算日期范围', table_name, info_table_name)
        # No daily table yet: derive the date range from the info table alone
        # (date_frm is NULL, so the scraper will fetch full history).
        sql_str = """SELECT id, symbol, date_frm, if(delist_date<end_date, delist_date, end_date) date_to
            FROM
            (
                SELECT id, symbol, null date_frm, null delist_date,
                if(hour(now())<16, subdate(curdate(),1), curdate()) end_date
                FROM {info_table_name} info
            ) tt
            ORDER BY id""".format(info_table_name=info_table_name)
    with with_db_session(engine_md) as session:
        # Fetch, per coin, the date range for which daily data is still needed.
        table = session.execute(sql_str)
        stock_date_dic = {
            (coin_id, symbol): (date_from if begin_time is None else min([date_from, begin_time]), date_to)
            for coin_id, symbol, date_from, date_to in table.fetchall()
            if id_set is None or coin_id in id_set}
    # Column dtypes for table creation / upsert.
    dtype = {
        'id': String(60),
        'date': Date,
        'open': DOUBLE,
        'high': DOUBLE,
        'low': DOUBLE,
        'close': DOUBLE,
        'volume': DOUBLE,
        'market_cap': DOUBLE,
    }
    col_names = dtype.keys()
    data_df_list = []
    dic_count = len(stock_date_dic)
    data_count = 0
    # Fetch data from the scraper, coin by coin.
    logger.info('%d coins will been import into %s', dic_count, table_name)
    try:
        for data_num, ((coin_id, symbol), (date_from, date_to)) in enumerate(stock_date_dic.items(), start=1):
            logger.debug('%d/%d) %s[%s] [%s - %s]', data_num, dic_count, coin_id, symbol, date_from, date_to)
            date_from_str = None
            try:
                if date_from is None:
                    # No start date known: scrape full history.
                    scraper = CmcScraperV1(symbol, coin_id)
                else:
                    # Convert to the date format the CMC scraper expects.
                    date_from_str = date_2_str(str_2_date(date_from, DATE_FORMAT_STR), DATE_FORMAT_STR_CMC)
                    scraper = CmcScraperV1(symbol, coin_id, start_date=date_from_str)
                data_df = scraper.get_dataframe()
            except Exception as exp:
                # Best-effort: log and skip this coin, keep importing the rest.
                logger.exception("scraper('%s', '%s', start_date='%s')", symbol, coin_id, date_from_str)
                continue
            if data_df is None or data_df.shape[0] == 0:
                logger.warning('%d/%d) %s has no data during %s %s', data_num, dic_count, coin_id, date_from, date_to)
                continue
            # Normalize scraped column names to the table's column names.
            data_df.rename(columns={col_name: rename_by_dic(col_name, col_names) for col_name in data_df.columns},
                           inplace=True)
            data_df.rename(columns={'market cap': 'market_cap'}, inplace=True)
            # The scraper reports missing numbers as strings (e.g. '-'); coerce them to 0.
            data_df['market_cap'] = data_df['market_cap'].apply(lambda x: 0 if isinstance(x, str) else x)
            data_df['volume'] = data_df['volume'].apply(lambda x: 0 if isinstance(x, str) else x)
            logger.info('%d/%d) %d data of %s between %s and %s', data_num, dic_count, data_df.shape[0], coin_id,
                        data_df['date'].min(), data_df['date'].max())
            data_df['id'] = coin_id
            data_df_list.append(data_df)
            data_count += data_df.shape[0]
            # For debugging only: stop early after a few coins.
            if DEBUG and len(data_df_list) > 10:
                break
            # Flush to the database once the in-memory batch exceeds the threshold.
            if data_count > 10000:
                data_df_all = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(data_df_all, table_name, engine_md, dtype=dtype)
                logging.info("%s %d 条信息被更新", table_name, data_count)
                data_df_list, data_count = [], 0
    finally:
        # Flush whatever remains, even when the loop above raised or broke out.
        if len(data_df_list) > 0:
            data_df_all = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(data_df_all, table_name, engine_md, dtype=dtype)
            logging.info("更新 %s 结束 %d 条信息被更新", table_name, data_count)
        # NOTE(review): placed inside finally per the codebase's usual pattern —
        # original indentation was lost; confirm against version control.
        if not has_table and engine_md.has_table(table_name):
            # Table was created by this run: convert to MyISAM and add the PK.
            alter_table_2_myisam(engine_md, [table_name])
            # build_primary_key([table_name])
            create_pk_str = """ALTER TABLE {table_name}
                CHANGE COLUMN `id` `id` VARCHAR(60) NOT NULL FIRST ,
                CHANGE COLUMN `date` `date` DATE NOT NULL AFTER `id`,
                ADD PRIMARY KEY (`id`, `date`)""".format(table_name=table_name)
            with with_db_session(engine_md) as session:
                session.execute(create_pk_str)
def import_coinbar_on_freq_min(freq, code_set=None, base_begin_time=None):
    """Fetch sub-daily bar data (one of [60min, 30min, 15min, 5min, 1min]) into
    the per-frequency table ``tushare_coin_md_<freq>`` via the tushare ``coinbar`` API.

    Also maintains the ``trade_date_latest_<freq>`` bookkeeping column of
    ``tushare_coin_pair_info`` so subsequent runs resume where this one stopped.

    :param freq: frequency suffix, e.g. '60min', '30min', '15min', '5min', '1min'
    :param code_set: optional set of (exchange, symbol) pairs to restrict the import to
    :param base_begin_time: optional earliest date; each pair's start date is lowered to it
    :return: None
    """
    # Normalize base_begin_time to a date object when passed as a string.
    if base_begin_time is not None and not isinstance(base_begin_time, date):
        base_begin_time = str_2_date(base_begin_time)
    table_name = 'tushare_coin_md_' + freq
    info_table_name = 'tushare_coin_pair_info'
    has_table = engine_md.has_table(table_name)
    if has_table:
        # Resume from day after max stored date, else day after the recorded
        # trade_date_latest_<freq> (default 2010-01-01); end at yesterday
        # (or the day before, when running before 08:00).
        sql_str = """SELECT exchange, exchange_pair, date_frm, if(delist_date<end_date, delist_date, end_date) date_to
            FROM
            (
                SELECT info.exchange, info.exchange_pair,
                ifnull(trade_date_max_1, adddate(trade_date_latest,1)) date_frm, delist_date,
                if(hour(now())<8, subdate(curdate(),2), subdate(curdate(),1)) end_date
                FROM
                (
                    select exchange, exchange_pair,
                    ifnull(trade_date_latest_{freq},'2010-01-01') trade_date_latest,
                    delist_date_{freq} delist_date
                    from {info_table_name}
                ) info
                LEFT OUTER JOIN
                    (SELECT exchange, symbol, adddate(max(`date`),1) trade_date_max_1
                    FROM {table_name} GROUP BY exchange, symbol) daily
                ON info.exchange = daily.exchange
                AND info.exchange_pair = daily.symbol
            ) tt
            WHERE date_frm <= if(delist_date<end_date, delist_date, end_date)
            ORDER BY exchange, exchange_pair""".format(
            table_name=table_name, info_table_name=info_table_name, freq=freq)
    else:
        # No md table yet: derive the range from the info table alone.
        sql_str = """SELECT exchange, exchange_pair, date_frm, if(delist_date<end_date, delist_date, end_date) date_to
            FROM
            (
                SELECT exchange, exchange_pair,
                ifnull(trade_date_latest_{freq},date('2010-01-01')) date_frm,
                delist_date_{freq} delist_date,
                if(hour(now())<8, subdate(curdate(),2), subdate(curdate(),1)) end_date
                FROM {info_table_name} info
                ORDER BY exchange, exchange_pair
            ) tt
            WHERE date_frm <= if(delist_date<end_date, delist_date, end_date)
            ORDER BY exchange, exchange_pair""".format(
            info_table_name=info_table_name, freq=freq)
        logger.warning('%s 不存在,仅使用 %s 表进行计算日期范围', table_name, info_table_name)
    with with_db_session(engine_md) as session:
        # Fetch, per (exchange, pair), the date range still needing bar data.
        table = session.execute(sql_str)
        code_date_range_dic = {
            (exchange, symbol): (date_from if base_begin_time is None else min([date_from, base_begin_time]), date_to)
            for exchange, symbol, date_from, date_to in table.fetchall()
            if code_set is None or (exchange, symbol) in code_set}
    # Column dtypes for table creation / upsert.
    dtype = {
        'exchange': String(60),
        'symbol': String(60),
        'date': Date,
        'datetime': DateTime,
        'open': DOUBLE,
        'high': DOUBLE,
        'low': DOUBLE,
        'close': DOUBLE,
        'vol': DOUBLE,
    }
    # Pending updates for the info table's trade_date_latest bookkeeping field.
    trade_date_latest_list = []
    update_trade_date_latest_str = """UPDATE tushare_coin_pair_info info
        SET info.trade_date_latest_daily = :trade_date_latest
        WHERE info.exchange = :exchange AND exchange_pair=:exchange_pair"""
    data_df_list, data_count, tot_data_count, code_count = [], 0, 0, len(code_date_range_dic)
    try:
        for num, ((exchange, exchange_pair), (begin_time, end_time)) in enumerate(code_date_range_dic.items(),
                                                                                  start=1):
            begin_time_str = date_2_str(begin_time, DATE_FORMAT_STR)
            end_time_str = date_2_str(end_time, DATE_FORMAT_STR)
            logger.debug('%d/%d) %s %s [%s - %s]', num, code_count, exchange, exchange_pair, begin_time, end_time)
            try:
                # Example call:
                # data_df = pro.coinbar(exchange='huobi', symbol='gxsbtc', freq='1min',
                #                       start_date='20180701', end_date='20180801')
                data_df = pro.coinbar(exchange=exchange, symbol=exchange_pair, freq=freq,
                                      start_date=begin_time_str, end_date=end_time_str)
            except Exception as exp:
                if len(exp.args) >= 1 and exp.args[0] == '系统内部错误':
                    # Known server-side error for this pair: mark it with a far-future
                    # placeholder date so later runs skip it, then move on.
                    trade_date_latest_list.append({
                        'exchange': exchange,
                        'exchange_pair': exchange_pair,
                        'trade_date_latest': '2020-02-02',
                    })
                    logger.warning(
                        "coinbar(exchange='%s', symbol='%s', freq='%s', start_date='%s', end_date='%s') 系统内部错误",
                        exchange, exchange_pair, freq, begin_time_str, end_time_str)
                    continue
                # Any other error aborts the whole import (finally still flushes).
                logger.exception(
                    "coinbar(exchange='%s', symbol='%s', freq='%s', start_date='%s', end_date='%s')",
                    exchange, exchange_pair, freq, begin_time_str, end_time_str)
                raise exp from exp
            if data_df is not None and data_df.shape[0] > 0:
                data_count += data_df.shape[0]
                data_df['exchange'] = exchange
                # Keep the full timestamp in 'datetime'; reduce 'date' to the calendar day.
                data_df['datetime'] = data_df['date']
                data_df['date'] = data_df['date'].apply(lambda x: str_2_datetime(x).date())
                data_df_list.append(data_df)
            # Record the pair's new latest trade date.
            # NOTE(review): placed at loop level (also recorded for empty results) —
            # original indentation was lost; confirm against version control.
            trade_date_latest_list.append({
                'exchange': exchange,
                'exchange_pair': exchange_pair,
                'trade_date_latest': end_time_str,
            })
            # Flush to the database once the in-memory batch exceeds the threshold.
            if data_count >= 10000:
                data_df_all = pd.concat(data_df_list)
                # data_df_all.to_sql(table_name, engine_md, if_exists='append', index=False, dtype=dtype)
                data_count = bunch_insert_on_duplicate_update(data_df_all, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_df_list, data_count = [], 0
                # Persist the trade_date_latest bookkeeping gathered so far.
                with with_db_session(engine_md) as session:
                    result = session.execute(update_trade_date_latest_str, params=trade_date_latest_list)
                    update_count = result.rowcount
                    session.commit()
                logger.info('更新 %d 条交易对的最新交易 %s 信息', update_count, freq)
                trade_date_latest_list = []
            # For debugging only: stop early after a couple of pairs.
            if DEBUG and len(data_df_list) > 1:
                break
    finally:
        # Flush whatever remains, even when the loop above raised or broke out.
        if data_count > 0:
            data_df_all = pd.concat(data_df_list)
            # data_df_all.to_sql(table_name, engine_md, if_exists='append', index=False, dtype=dtype)
            data_count = bunch_insert_on_duplicate_update(data_df_all, table_name, engine_md, dtype)
            tot_data_count += data_count
        # Persist any remaining trade_date_latest bookkeeping.
        if len(trade_date_latest_list) > 0:
            with with_db_session(engine_md) as session:
                result = session.execute(update_trade_date_latest_str, params=trade_date_latest_list)
                update_count = result.rowcount
                session.commit()
            logger.info('更新 %d 条交易对的最新交易日信息', update_count)
        if not has_table and engine_md.has_table(table_name):
            # Table was created by this run: convert to MyISAM and add the PK.
            alter_table_2_myisam(engine_md, [table_name])
            # build_primary_key([table_name])
            create_pk_str = """ALTER TABLE {table_name}
                CHANGE COLUMN `exchange` `exchange` VARCHAR(60) NOT NULL FIRST,
                CHANGE COLUMN `symbol` `symbol` VARCHAR(60) NOT NULL AFTER `exchange`,
                CHANGE COLUMN `datetime` `datetime` DATETIME NOT NULL AFTER `symbol`,
                ADD PRIMARY KEY (`exchange`, `symbol`, `datetime`)""".format(
                table_name=table_name)
            with with_db_session(engine_md) as session:
                session.execute(create_pk_str)
        logging.info("更新 %s 完成 新增数据 %d 条", table_name, tot_data_count)