Beispiel #1
0
    def get_factors_with_labels(self, md_df):
        """
        Build the factor matrix for ``md_df`` together with its label matrix.

        The first factor column is passed to ``calc_label_with_future_value``
        (with the bounds -0.01 and 0.01) to produce the labels.  Both arrays
        are then truncated right after the row index returned by
        ``get_last_idx`` for the predicate "row sums to zero".  As a side
        effect ``self.input_size`` is set to the number of factor columns.

        :param md_df: market data, forwarded unchanged to ``self.get_factors``
        :return: tuple ``(factors, labels)`` cut to the same number of rows
        """
        factor_arr = self.get_factors(md_df)
        first_column = factor_arr[:, 0]
        self.input_size = factor_arr.shape[1]
        label_arr = self.calc_label_with_future_value(first_column, -0.01, 0.01)
        # NOTE(review): presumably get_last_idx yields the last row index that
        # satisfies the predicate -- confirm against its definition.
        row_count = get_last_idx(label_arr, lambda row: row.sum() == 0) + 1
        return factor_arr[:row_count, :], label_arr[:row_count, :]
Beispiel #2
0
def transfer_report_2_daily(table_name_report: str, table_name_daily: str,
                            table_name_trade_date: str, dtype_daily: dict,
                            accumulation_col_name_list):
    """
    Expand (quarterly) financial report data into daily-level rows.

    For every stock code, each report row is replicated onto every trade date
    from its publication date (or the day after the latest date already stored
    in the daily table, whichever is later) up to the last trade date before
    the next report's publication date (or today for the newest report).
    Rows are saved per stock code through ``save_data_2_daily_table``.

    :param table_name_report: source table holding the report data
    :param table_name_daily: target table receiving the daily-level data
    :param table_name_trade_date: table providing the ``trade_date`` calendar
    :param dtype_daily: mapping column name -> column type of the daily table;
        its keys select the columns read from the report table.  It is updated
        in place with the seasonal columns produced by ``fill_season_data``.
    :param accumulation_col_name_list: columns passed to ``fill_season_data``
        to convert period-accumulated values into per-season values
    :return: None
    """
    if not engine_md.has_table(table_name_report):
        logger.info('%s 不存在,无需转化成日级别数据', table_name_report)
        # Nothing to convert; the queries below would fail on a missing table.
        return
    today = datetime.date.today()

    has_table = engine_md.has_table(table_name_daily)
    # Load all trade dates up to today and, if the daily table already exists,
    # the latest stored trade date of every stock code.
    with with_db_session(engine_md) as session:
        sql_str = f"""select trade_date from {table_name_trade_date} where trade_date<=:today order by trade_date"""
        table = session.execute(sql_str, params={"today": today})
        trade_date_list = [_[0] for _ in table.fetchall()]
        if has_table:
            sql_str = f"""select code, max(trade_date) from {table_name_daily} group by code"""
            table = session.execute(sql_str)
            code_date_latest_dic = dict(table.fetchall())
        else:
            code_date_latest_dic = {}

    # Load the report rows: for each (code, pub_date) keep only the row with
    # the latest report_date.
    col_name_list = list(dtype_daily.keys())
    col_name_list_str = ','.join(
        [f'report.`{col_name}` {col_name}' for col_name in col_name_list])
    sql_str = f"""SELECT {col_name_list_str} FROM {table_name_report} report inner join 
        (
            select code, pub_date, max(report_date) report_date 
            from {table_name_report} where report_type=0 group by code, pub_date
        ) base_date
        where report.report_type=0
        and report.code = base_date.code
        and report.pub_date = base_date.pub_date
        and report.report_date = base_date.report_date
        order by code, pub_date"""
    dfg_by_code = pd.read_sql(sql_str, engine_md).set_index(
        'report_date', drop=False).sort_index().groupby('code')
    dfg_len = len(dfg_by_code)
    data_new_s_list = []
    # Process the reports stock by stock.
    for num, (code, df_by_code) in enumerate(dfg_by_code, start=1):
        # The frame is already sorted by report_date (sort_index above).
        df_by_code = df_by_code.copy()
        df_len = df_by_code.shape[0]
        # Attach the following report's dates to every row.
        df_by_code.loc[:, 'pub_date_next'] = df_by_code['pub_date'].shift(-1)
        df_by_code.loc[:, 'report_date_next'] = \
            df_by_code['report_date'].shift(-1)
        # Convert period-accumulated columns into per-season columns.
        for col_name in accumulation_col_name_list:
            df_by_code, col_name_season = fill_season_data(
                df_by_code, col_name)
            # Register the new seasonal column in dtype_daily.
            if col_name_season not in dtype_daily:
                dtype_daily[col_name_season] = dtype_daily[col_name]

        trade_date_latest = code_date_latest_dic.get(code)
        for num_sub, (report_date, data_s) in enumerate(df_by_code.T.items(),
                                                        start=1):
            pub_date = data_s['pub_date']
            pub_date_next = data_s['pub_date_next']
            # Skip reports that are already fully covered: the latest stored
            # trade date lies beyond the next report's publication date.
            if not pd.isnull(trade_date_latest) and not pd.isnull(
                    pub_date_next) and trade_date_latest > pub_date_next:
                continue
            # Determine the trade date range this report has to be copied to.
            if pd.isnull(trade_date_latest):
                date_from_idx = get_first_idx(trade_date_list,
                                              lambda x: x >= pub_date)
            else:
                # Resume after the last stored day, but never before this
                # report was published; otherwise a later report would be
                # written onto days that belong to the previous report.
                date_from_idx = get_first_idx(
                    trade_date_list,
                    lambda x: x > trade_date_latest and x >= pub_date)
            if pd.isnull(pub_date_next):
                date_to_idx = get_last_idx(trade_date_list,
                                           lambda x: x <= today)
            else:
                date_to_idx = get_last_idx(trade_date_list,
                                           lambda x: x < pub_date_next)

            if date_from_idx is None:
                logger.warning(
                    '%s %d/%d) %d/%d) %s 没有找到有效的起始日期 pub_date: %s trade_date_latest: %s ',
                    table_name_report, num, dfg_len, num_sub, df_len, code,
                    pub_date, trade_date_latest)
                continue
            if date_to_idx is None:
                logger.warning(
                    '%s %d/%d) %d/%d) %s 没有找到有效的截至日期 today: %s pub_date_next: %s ',
                    table_name_report, num, dfg_len, num_sub, df_len, code,
                    today, pub_date_next)
                continue
            if date_from_idx > date_to_idx:
                logger.warning(
                    '%s %d/%d) %d/%d) %s %s > %s 不匹配 pub_date: %s trade_date_latest: %s today: %s pub_date_next: %s ',
                    table_name_report, num, dfg_len, num_sub, df_len, code,
                    trade_date_list[date_from_idx],
                    trade_date_list[date_to_idx], pub_date, trade_date_latest,
                    today, pub_date_next)
                continue

            logger.debug('%s %d/%d) %d/%d) %s [%s, %s) 预计转化 %d 条日级别数据,报告日:%s,',
                         table_name_report, num, dfg_len, num_sub, df_len,
                         code, trade_date_list[date_from_idx],
                         trade_date_list[date_to_idx],
                         date_to_idx - date_from_idx + 1, report_date)
            # Emit one copy of the report row per trade date in the range.
            for trade_date in trade_date_list[date_from_idx:(date_to_idx + 1)]:
                data_new_s = data_s.copy()
                data_new_s['trade_date'] = trade_date
                data_new_s_list.append(data_new_s)

        # Flush the rows collected for this stock code.
        if len(data_new_s_list) > 0:
            data_count = save_data_2_daily_table(data_new_s_list,
                                                 table_name_daily, dtype_daily)
            logger.info("%s %d/%d) %s %d 条记录被保存", table_name_report, num,
                        dfg_len, code, data_count)
            data_new_s_list = []
def import_data(table_name, dtype, invoke_api,
                primary_keys=["index_symbol", "trade_date", "jq_code"], ts_code_set=None, is_debug=False, is_monthly=False):
    """
    Incrementally import per-index data into ``table_name``.

    The date range of every index is derived from ``jq_index_info`` and, when
    ``table_name`` already exists, from the latest ``trade_date`` stored for
    that index.  The upper bound is yesterday before 16:00 (database time) and
    today afterwards.  ``invoke_api`` is called once per index and trade date;
    the returned frames are buffered and written with ``bunch_insert`` whenever
    at least 1000 rows have accumulated, and once more at the end.

    :param table_name: target table
    :param dtype: column type mapping forwarded to ``bunch_insert``
    :param invoke_api: callable ``invoke_api(index_symbol=..., trade_date=...)``
        returning a DataFrame or None
    :param primary_keys: primary key columns forwarded to ``bunch_insert``.
        The default list is shared between calls and is not modified here.
    :param ts_code_set: optional collection of index codes; when given, only
        these codes are processed
    :param is_debug: when true, stop iterating an index's dates as soon as
        more than one frame is buffered
    :param is_monthly: when true, only the last trade date of each month in
        the range is requested
    :return: None
    """
    info_table = 'jq_index_info'
    logger.info("更新 %s 开始", table_name)
    has_table = engine_md.has_table(table_name)
    # Build the date range query depending on whether the target table exists.
    if has_table:
        sql_str = f"""
            SELECT jq_code, date_from, if(date_to<end_date, date_to, end_date) date_to
            FROM
            (
            SELECT info.jq_code, ifnull(trade_date, start_date) date_from, end_date date_to,
            if(hour(now())<16, subdate(curdate(),1), curdate()) end_date
            FROM 
                {info_table} info 
            LEFT OUTER JOIN
                (SELECT index_symbol, adddate(max(trade_date),1) trade_date 
                FROM {table_name} GROUP BY index_symbol) daily
            ON info.jq_code = daily.index_symbol
            ) tt
            WHERE date_from <= if(date_to<end_date, date_to, end_date) 
            ORDER BY jq_code"""
    else:
        sql_str = f"""
            SELECT jq_code, date_from, if(date_to<end_date, date_to, end_date) date_to
            FROM
              (
                SELECT info.jq_code, start_date date_from, end_date date_to,
                if(hour(now())<16, subdate(curdate(),1), curdate()) end_date
                FROM {info_table} info 
              ) tt
            WHERE date_from <= if(date_to<end_date, date_to, end_date) 
            ORDER BY jq_code"""
        # Report the table that is actually queried (the message used to name
        # an unrelated table).
        logger.warning('%s 不存在,仅使用 %s 表进行计算日期范围', table_name, info_table)

    sql_trade_date_str = """
       SELECT trade_date FROM jq_trade_date trddate 
       WHERE trade_date <= if(hour(now())<16, subdate(curdate(),1), curdate()) 
       ORDER BY trade_date"""

    with with_db_session(engine_md) as session:
        table = session.execute(sql_trade_date_str)
        trade_date_list = [row[0] for row in table.fetchall()]
        trade_date_list.sort()
        # Date range (date_from, date_to) to fetch for every index code,
        # optionally restricted to ts_code_set.
        table = session.execute(sql_str)
        code_date_range_dic = {
            jq_code: (date_from, date_to)
            for jq_code, date_from, date_to in table.fetchall()
            if ts_code_set is None or jq_code in ts_code_set}

    data_df_list, data_count, all_data_count, data_len = [], 0, 0, len(code_date_range_dic)
    logger.info('%d records will been import into %s', data_len, table_name)

    try:
        for num, (index_symbol, (date_from_tmp, date_to_tmp)) in enumerate(code_date_range_dic.items(), start=1):
            # Map the requested range onto the trade calendar.
            date_from_idx = get_first_idx(trade_date_list, lambda x: x >= date_from_tmp)
            date_to_idx = get_last_idx(trade_date_list, lambda x: x <= date_to_tmp)
            if date_from_idx is None or date_to_idx is None or date_from_idx > date_to_idx:
                logger.debug('%d/%d) %s [%s - %s] 跳过', num, data_len, index_symbol,
                             trade_date_list[date_from_idx] if date_from_idx is not None else None,
                             trade_date_list[date_to_idx] if date_to_idx is not None else None)
                continue
            date_sample = trade_date_list[date_from_idx: (date_to_idx + 1)]
            if is_monthly:
                # Keep only the last trade date of every month.
                date_sample = list(
                    pd.Series(date_sample, index=pd.DatetimeIndex(date_sample)
                              ).resample(rule='M', convention='end').last()
                )

            date_from, date_to = date_sample[0], date_sample[-1]
            trade_date_count = len(date_sample)
            logger.debug('%d/%d) 开始导入 %s [%s - %s] %d 个交易日的数据 %s',
                         num, data_len, index_symbol, date_from, date_to, trade_date_count,
                         '月度更新' if is_monthly else '')
            for trade_date in date_sample:
                data_df = invoke_api(index_symbol=index_symbol, trade_date=trade_date)

                # Buffer non-empty results.
                if data_df is not None and data_df.shape[0] > 0:
                    data_count += data_df.shape[0]
                    data_df_list.append(data_df)

                # Flush the buffer once the row threshold is reached.
                if data_count >= 1000:
                    data_count = bunch_insert(data_df_list, table_name=table_name, dtype=dtype,
                                              primary_keys=primary_keys)
                    all_data_count += data_count
                    data_df_list, data_count = [], 0

                if is_debug and len(data_df_list) > 1:
                    break
    except Exception:
        # Best effort: log the failure and still store what was collected.
        # KeyboardInterrupt/SystemExit are no longer swallowed; the finally
        # block below still flushes the buffer before they propagate.
        logger.exception("%s 获取数据异常", table_name)
    finally:
        # Store whatever is still buffered.
        if len(data_df_list) > 0:
            data_count = bunch_insert(data_df_list, table_name=table_name, dtype=dtype, primary_keys=primary_keys)
            all_data_count += data_count
        # Always report the total, also when the last batch was already flushed.
        logger.info("更新 %s 结束 %d 条信息被更新", table_name, all_data_count)
def import_sectorconstituent(sector_code,
                             sector_name,
                             date_start,
                             chain_param=None,
                             exch_code='SZSE'):
    """
    Import the constituent stocks of sector ``sector_code`` into the table
    ``wind_sectorconstituent``.

    :param sector_code: code of the sector whose constituents are imported
    :param sector_name: sector name; forwarded to the fetch helpers and used
        in log output
    :param date_start: first date to import; normalised with ``str_2_date``
    :param chain_param: not used here; per the original note it exists so
        that celery can pass the previous task's result to this task
    :param exch_code: exchange whose trade calendar is used, default 'SZSE'
    :return: None
    :raises ValueError: if no trade dates are available for ``exch_code``
    """
    # Trade calendar of the exchange.
    trade_date_list_sorted = get_trade_date_list_sorted(exch_code)
    if trade_date_list_sorted is None or len(trade_date_list_sorted) == 0:
        raise ValueError("没有交易日数据")
    # Normalise the date argument.
    date_start = str_2_date(date_start)

    date_constituent_df_dict = {}
    idx_constituent_set_dic = {}
    # Latest constituent list already stored in the database; date_latest is
    # None when nothing has been imported yet.
    date_latest, constituent_df = get_latest_constituent_df(sector_code)
    date_latest = str_2_date(date_latest)
    if date_latest is None or date_latest < date_start:
        idx_start = get_last_idx(trade_date_list_sorted,
                                 lambda x: x <= date_start)
        sec_df, _ = get_sectorconstituent_2_dic(sector_code, sector_name,
                                                date_start, idx_start,
                                                trade_date_list_sorted,
                                                date_constituent_df_dict,
                                                idx_constituent_set_dic)
        # Store the constituents of the start date.
        sec_df.to_sql("wind_sectorconstituent",
                      engine_md,
                      if_exists='append',
                      index=False)
    else:
        # Continue from the latest stored date and seed the caches with it.
        date_start = date_latest
        idx_start = get_last_idx(trade_date_list_sorted,
                                 lambda x: x <= date_start)
        date_constituent_df_dict[date_latest] = constituent_df
        idx_constituent_set_dic[idx_start] = set(constituent_df['wind_code'])

    # The import runs up to the last trade date not later than yesterday.
    yesterday = date.today() - timedelta(days=1)
    idx_end = get_last_idx(trade_date_list_sorted, lambda x: x <= yesterday)
    if idx_start is None or idx_end is None:
        # get_last_idx found no matching trade date; comparing None with an
        # int would raise TypeError, so stop here instead.
        logger.warning(
            "%s: no usable trade date index (idx_start=%s, idx_end=%s), nothing to import",
            sector_name, idx_start, idx_end)
        return
    if idx_start >= idx_end:
        return

    left_or_right = 1
    recursion_get_sectorconstituent(idx_start, idx_end, trade_date_list_sorted,
                                    date_constituent_df_dict,
                                    idx_constituent_set_dic, left_or_right,
                                    sector_code, sector_name)

    # Drop the date_start entry: its data is already in the database.
    del date_constituent_df_dict[date_start]
    # Store all remaining dates.
    for num, (date_cur, sec_df) in enumerate(date_constituent_df_dict.items(),
                                             start=1):
        sec_df.to_sql("wind_sectorconstituent",
                      engine_md,
                      if_exists='append',
                      index=False)
        logger.info("%d) %s %d 条 %s 成分股数据导入数据库", num, date_cur,
                    sec_df.shape[0], sector_name)
        # Debugging only: limit the number of stored dates.
        if DEBUG and num >= 20:
            break
def import_index_constituent(index_code,
                             index_name,
                             date_start,
                             exch_code='SZSE',
                             date_end=None,
                             method='loop'):
    """
    Import the constituent stocks of index ``index_code`` into the table
    ``wind_index_constituent``.

    :param index_code: code of the index whose constituents are imported
    :param index_name: index name; forwarded to the fetch helpers and used in
        log output
    :param date_start: first date to import; normalised with ``str_2_date``
    :param exch_code: exchange whose trade calendar is used, default 'SZSE'
    :param date_end: optional last date; when given, the trade calendar is cut
        after the first trade date on or after it.  None means no cut.
    :param method: 'loop' fetches via ``loop_get_data``, 'recursion' via
        ``recursion_dichotomy_get_data``
    :return: None
    :raises ValueError: if no trade dates are available for ``exch_code`` or
        ``method`` is neither 'loop' nor 'recursion'
    """
    table_name = 'wind_index_constituent'
    param_list = [
        ('trade_date', Date),
        ('weight', DOUBLE),
        ('stock_name', String(80)),
        ('index_code', String(20)),
        ('index_name', String(80)),
    ]
    dtype = dict(param_list)
    # The column is named 'wind_code' (see constituent_df['wind_code'] below);
    # the former key 'wind_cod' never matched it.
    dtype['wind_code'] = String(20)
    # Trade calendar of the exchange.
    trade_date_list_sorted = get_trade_date_list_sorted(exch_code)
    if trade_date_list_sorted is None or len(trade_date_list_sorted) == 0:
        raise ValueError("没有交易日数据")
    # Normalise the date arguments.
    date_start = str_2_date(date_start)
    if date_end is not None:
        date_end = str_2_date(date_end)
        idx_end = get_first_idx(trade_date_list_sorted,
                                lambda x: x >= date_end)
        if idx_end is not None:
            trade_date_list_sorted = trade_date_list_sorted[:(idx_end + 1)]

    date_constituent_df_dict = OrderedDict()
    idx_constituent_set_dic = {}
    # Latest constituent list already stored in the database; date_latest is
    # None when nothing has been imported yet.
    date_latest, constituent_df = get_latest_constituent_df(index_code)
    date_latest = str_2_date(date_latest)
    if date_latest is None or date_latest < date_start:
        idx_start = get_last_idx(trade_date_list_sorted,
                                 lambda x: x <= date_start)
        sec_df, _ = get_index_constituent_2_dic(index_code, index_name,
                                                date_start, idx_start,
                                                date_constituent_df_dict,
                                                idx_constituent_set_dic)
        if sec_df is None or sec_df.shape[0] == 0:
            return
        # Store the constituents of the start date.
        bunch_insert_on_duplicate_update(sec_df,
                                         table_name,
                                         engine_md,
                                         dtype=dtype)
    else:
        # Continue from the latest stored date and seed the caches with it.
        date_start = date_latest
        idx_start = get_last_idx(trade_date_list_sorted,
                                 lambda x: x <= date_start)
        date_constituent_df_dict[date_latest] = constituent_df
        idx_constituent_set_dic[idx_start] = set(constituent_df['wind_code'])

    # The import runs up to the last trade date not later than yesterday.
    yesterday = date.today() - timedelta(days=1)
    idx_end = get_last_idx(trade_date_list_sorted, lambda x: x <= yesterday)
    if idx_start is None or idx_end is None:
        # get_last_idx found no matching trade date; comparing None with an
        # int would raise TypeError, so stop here instead.
        logger.warning(
            "%s: no usable trade date index (idx_start=%s, idx_end=%s), nothing to import",
            index_name, idx_start, idx_end)
        return
    if idx_start >= idx_end:
        return

    if method == 'loop':
        try:
            # NOTE(review): debugging leftover that limits a run to 10 trade
            # dates. It is kept, but clamped so it can no longer point past
            # the last permitted trade date; remove the cap for full imports.
            idx_end = min(idx_start + 10, idx_end)
            loop_get_data(idx_start + 1, idx_end, trade_date_list_sorted,
                          date_constituent_df_dict, idx_constituent_set_dic,
                          index_code, index_name)
        except APIError:
            # Keep going so that the data fetched before the error is stored.
            logger.exception(
                'loop_get_data (idx_start=%d, idx_end=%d, index_code=%s, index_name=%s)',
                idx_start, idx_end, index_code, index_name)
    elif method == 'recursion':
        left_or_right = 1
        recursion_dichotomy_get_data(idx_start, idx_end,
                                     trade_date_list_sorted,
                                     date_constituent_df_dict,
                                     idx_constituent_set_dic, left_or_right,
                                     index_code, index_name)
    else:
        raise ValueError('method = %s error' % method)

    # Drop the date_start entry: its data is already in the database.
    del date_constituent_df_dict[date_start]
    # Store all remaining dates.
    for num, (date_cur, sec_df) in enumerate(date_constituent_df_dict.items(),
                                             start=1):
        bunch_insert_on_duplicate_update(sec_df,
                                         table_name,
                                         engine_md,
                                         dtype=dtype)
        logger.info("%d) %s %d 条 %s 成分股数据导入数据库", num, date_cur,
                    sec_df.shape[0], index_name)