Beispiel #1
0
def get_df_iter(date_start, date_end, step, df_len_limit=3000, deep=0):
    """
    Yield income-statement query results for consecutive date sub-ranges.

    The span between ``date_start`` and ``date_end`` is cut into boundaries by
    ``range_date(date_start, date_end, step)``. Each adjacent pair of
    boundaries is queried with ``pub_date > date_from`` and
    ``pub_date <= date_to``. When a result holds at least ``df_len_limit``
    rows it is treated as possibly truncated, and the sub-range is queried
    again recursively with half the step. Once the step can no longer be
    halved the (possibly incomplete) result is yielded anyway and a warning
    is logged.

    :param date_start: start of the overall date range
    :param date_end: end of the overall date range
    :param step: step handed to ``range_date``; halved on every recursion
    :param df_len_limit: row count at or above which a result is considered
        possibly truncated
    :param deep: recursion depth, used only to indent log messages
    :return: generator of ``(df, date_from, date_to)`` tuples
    """
    date_pairs = iter_2_range(
        range_date(date_start, date_end, step),
        has_left_outer=False, has_right_outer=False)
    for num, (date_from, date_to) in enumerate(date_pairs, start=1):
        q = query(finance.STK_INCOME_STATEMENT).filter(
            finance.STK_INCOME_STATEMENT.pub_date > date_2_str(date_from),
            finance.STK_INCOME_STATEMENT.pub_date <= date_2_str(date_to))

        df = finance.run_query(q)
        df_len = df.shape[0]
        if df_len < df_len_limit:
            logger.debug('%s%d) [%s ~ %s] 包含 %d 条数据', '  ' * deep, num, date_from, date_to, df_len)
            yield df, date_from, date_to
            continue

        if step >= 2:
            logger.warning('%s%d) [%s ~ %s] 包含 %d 条数据,可能已经超越 %d 条提取上限,开始进一步分割日期',
                           '  ' * deep, num, date_from, date_to, df_len, df_len_limit)
            # Propagate the caller's limit; otherwise nested calls would
            # silently fall back to the default of 3000.
            yield from get_df_iter(date_from, date_to, step // 2,
                                   df_len_limit=df_len_limit, deep=deep + 1)
        else:
            # The step cannot be halved any further: hand back what we got
            # and let the operator fetch the remainder manually.
            logger.warning('%s%d) [%s ~ %s] 包含 %d 条数据,可能已经超越 %d 条提取上限且无法再次分割日期范围,手动需要补充提取剩余数据',
                           '  ' * deep, num, date_from, date_to, df_len, df_len_limit)
            yield df, date_from, date_to
Beispiel #2
0
def merge_ifind_stock_daily(ths_code_set: set = None, date_from=None):
    """Merge the iFinD ds / his daily tables and financial data into ``ifind_stock_daily``.

    Steps:

    1. Load ``ifind_stock_daily_his`` and ``ifind_stock_daily_ds`` (rows with
       ``time >= date_from``), the report-date table and the complete
       ``ifind_stock_fin`` table, then outer-join his and ds on
       ``(ths_code, time)``.
    2. For every ``(ths_code, actual report date)`` pair pick one financial
       row whose ``time`` is not later than that report date.
    3. Split each code's daily rows into segments delimited by consecutive
       report dates, stamp every segment with the financial columns of the
       report date that opens it, and upsert the result in batches.

    :param ths_code_set: optional set of codes to process; ``None`` means all
    :param date_from: lower bound for the his / ds rows. When ``None`` and
        the target table already exists, it defaults to the day after the
        latest ``time`` stored in the target table.
    :return: None
    """
    table_name = 'ifind_stock_daily'
    logger.info("合成 %s 开始", table_name)
    has_table = engine_md.has_table(table_name)
    if date_from is None and has_table:
        # Resume from the day after the newest row already merged.
        sql_str = "select adddate(max(`time`),1) from {table_name}".format(
            table_name=table_name)
        with with_db_session(engine_md) as session:
            date_from = date_2_str(session.execute(sql_str).scalar())

    # Load the source tables
    ifind_his_df = get_ifind_daily_df('ifind_stock_daily_his', date_from)
    ifind_ds_df = get_ifind_daily_df('ifind_stock_daily_ds', date_from)
    ifind_report_date_df = get_ifind_report_date_df('ifind_stock_report_date',
                                                    None)
    ifind_fin_df = get_ifind_daily_df('ifind_stock_fin', None)
    ifind_fin_df_g = ifind_fin_df.groupby('ths_code')
    ths_code_set_4_daily = set(ifind_fin_df_g.size().index)

    # Merge ds and his; the outer join leaves NaN where one side has no row
    ifind_his_ds_df = pd.merge(ifind_his_df,
                               ifind_ds_df,
                               how='outer',
                               on=['ths_code', 'time'])
    ifind_his_ds_df_g = ifind_his_ds_df.groupby('ths_code')
    # Codes that actually have his / ds rows; computed once instead of
    # re-running ``groupby.size()`` for every code in the loop below.
    ths_code_set_4_his_ds = set(ifind_his_ds_df_g.size().index)
    logger.debug("提取数据完成")

    # Work out, per code, which financial row belongs to each report date
    report_date_dic_dic = {}
    report_date_g = ifind_report_date_df.groupby(
        ['ths_code', 'ths_regular_report_actual_dd_stock'])
    for (ths_code, report_date), _ in report_date_g:
        if ths_code_set is not None and ths_code not in ths_code_set:
            continue
        if is_nan_or_none(report_date):
            continue
        report_date_dic = report_date_dic_dic.setdefault(ths_code, {})
        if ths_code not in ths_code_set_4_daily:
            logger.error('fin 表中不存在 %s 的財務數據', ths_code)
            continue
        ifind_fin_df_temp = ifind_fin_df_g.get_group(ths_code)
        # Look the date up in this code's own mapping (the outer dict is
        # keyed by ths_code, so testing against it was always true).
        if report_date not in report_date_dic:
            ifind_fin_df_temp = ifind_fin_df_temp[
                ifind_fin_df_temp['time'] <= report_date]
            if ifind_fin_df_temp.shape[0] > 0:
                # NOTE(review): this keeps the EARLIEST row with
                # time <= report_date; confirm whether the latest row
                # (iloc[-1]) was intended.
                report_date_dic[
                    report_date] = ifind_fin_df_temp.sort_values(
                        'time').iloc[0]

    # Column dtypes used when writing to the database
    dtype = {'report_date': Date}
    for dic in [
            DTYPE_STOCK_DAILY_DS, DTYPE_STOCK_REPORT_DATE,
            DTYPE_STOCK_DAILY_FIN, DTYPE_STOCK_DAILY_HIS
    ]:
        dtype.update(dic)

    logger.debug("计算财报日期完成")
    # Assemble the merged frames and write them out in batches
    tot_data_count, data_count, data_df_list = 0, 0, []
    for_count = len(report_date_dic_dic)
    try:
        for num, (ths_code, report_date_dic) in enumerate(
                report_date_dic_dic.items(), start=1):
            if ths_code not in ths_code_set_4_his_ds:
                logger.error('his/ds 表中不存在 %s 的每日數據', ths_code)
                continue
            ifind_his_ds_df_cur_ths_code = ifind_his_ds_df_g.get_group(
                ths_code)
            logger.debug('%d/%d) 处理 %s %d 条数据', num, for_count, ths_code,
                         ifind_his_ds_df_cur_ths_code.shape[0])
            time_s = ifind_his_ds_df_cur_ths_code['time']
            report_date_list = sorted(report_date_dic)
            for report_date_from, report_date_to in iter_2_range(
                    report_date_list):
                logger.debug('%d/%d) 处理 %s [%s - %s]', num, for_count,
                             ths_code, date_2_str(report_date_from),
                             date_2_str(report_date_to))
                # Rows in the half-open interval [report_date_from,
                # report_date_to); a None bound leaves that side open.
                if report_date_from is None:
                    is_fit = time_s < report_date_to
                elif report_date_to is None:
                    is_fit = time_s >= report_date_from
                else:
                    is_fit = (time_s < report_date_to) & (
                        time_s >= report_date_from)
                # Copy so the financial columns can be added safely
                ifind_his_ds_df_segment = ifind_his_ds_df_cur_ths_code[
                    is_fit].copy()
                segment_count = ifind_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                # Rows before the first report date carry no financial data
                fin_s = report_date_dic[
                    report_date_from] if report_date_from is not None else None
                for key in DTYPE_STOCK_DAILY_FIN.keys():
                    if key in ('ths_code', 'time'):
                        continue
                    ifind_his_ds_df_segment[key] = fin_s[
                        key] if fin_s is not None and key in fin_s else None
                ifind_his_ds_df_segment['report_date'] = report_date_from
                data_df_list.append(ifind_his_ds_df_segment)
                data_count += segment_count

            if DEBUG and len(data_df_list) > 1:
                break

            # Flush to the database once enough rows have accumulated
            if data_count > 10000:
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(
                    data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []

    finally:
        # Write whatever is still buffered, even if the loop was interrupted
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(
                data_df, table_name, engine_md, dtype)
            tot_data_count += data_count

        logger.info('%s 新增或更新记录 %d 条', table_name, tot_data_count)
        # The table was created by this run: switch engine and add the key
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
def plot_industry_classified_mid(col_name='ev2_to_ebitda'):
    """Plot, per ``cs*`` sector, the daily median of ``col_name`` over the sector's constituents.

    Sector membership is read from ``fof_ams_dev.wind_sectorconstituent``
    (sector names starting with ``cs``) and the values from
    ``wind_stock_daily``. For each sector and each pair of consecutive
    membership dates, the row-wise median over the constituents recorded on
    the earlier date is computed. The per-sector series are combined into one
    DataFrame, written to ``median.xls`` and plotted.

    :param col_name: column of ``wind_stock_daily`` to aggregate
    :return: None
    """
    # Aggregation sketch kept for reference (not executed):
    # sql_str = """select sector_code, sector_name,base.trade_date, sum(ev2_to_ebitda) tot_val
    #     from (
    #         SELECT * FROM fof_ams_dev.wind_sectorconstituent where sector_name like 'cs%%'
    #     ) base
    #     LEFT JOIN
    #     (
    #     select trade_date, wind_code, ev2_to_ebitda from wind_stock_daily where ev2_to_ebitda is not null
    #     ) val
    #     on base.trade_date = val.trade_date
    #     and base.wind_code = val.wind_code
    #     group by sector_code, base.trade_date
    #     having tot_val is not null"""
    # TODO: once the industry data is fully downloaded, the matching distribution chart can be generated
    sector_sql_str = """SELECT sector_name, trade_date, wind_code FROM fof_ams_dev.wind_sectorconstituent 
        where sector_name like 'cs%'"""
    with with_db_session(engine_md) as session:
        table = session.execute(sector_sql_str)
        # sector_name -> {'trade_date_set': set of dates,
        #                 'trade_date_wind_code_list_dic': date -> [wind_code]}
        sector_trade_date_wind_code_list_dic = defaultdict(dict)
        num = 0  # stays 0 when the query returns no rows
        for num, (sector_name, trade_date, wind_code) in enumerate(table.fetchall(), start=1):
            if sector_name not in sector_trade_date_wind_code_list_dic:
                sector_trade_date_wind_code_list_dic[sector_name] = {
                    'trade_date_set': set(),
                    'trade_date_wind_code_list_dic': defaultdict(list)
                }
            sector_dic = sector_trade_date_wind_code_list_dic[sector_name]
            sector_dic['trade_date_set'].add(trade_date)
            sector_dic['trade_date_wind_code_list_dic'][trade_date].append(wind_code)
    sector_count = len(sector_trade_date_wind_code_list_dic)
    logger.debug('获取行业数据 %d 条 %d 个行业', num, sector_count)

    stock_sql_str = f"""select wind_code, trade_date, `{col_name}` from wind_stock_daily 
        where `{col_name}` is not null"""
    data_df = pd.read_sql(stock_sql_str, engine_md)
    logger.debug('获取行情数据 %d 条', data_df.shape[0])
    # One row per trade date, one column per stock
    pivot_df = data_df.pivot(index='trade_date', columns='wind_code', values=col_name).sort_index()
    logger.debug('转换数据 %s', pivot_df.shape)

    sector_trade_date_val_dic = {}
    logger.debug('计算 %d 个行业中位数', sector_count)
    for num, (sector_name, data_dic) in enumerate(sector_trade_date_wind_code_list_dic.items(), start=1):
        trade_date_list = sorted(data_dic['trade_date_set'])
        logger.debug('%d/%d) %s %d 个交易日', num, sector_count, sector_name, len(trade_date_list))
        trade_date_wind_code_list_dic = data_dic['trade_date_wind_code_list_dic']
        val_s_list = []
        for trade_date_from, trade_date_to in iter_2_range(trade_date_list, has_left_outer=False):
            wind_code_list = trade_date_wind_code_list_dic[trade_date_from]
            try:
                tmp_df = pivot_df.loc[trade_date_from:trade_date_to, wind_code_list]
            except KeyError:
                # Requested labels are not present in the pivot table
                continue
            if tmp_df.shape[0] == 0:
                continue
            # Median across the constituents for every trade date
            val_s = tmp_df.median(axis=1)
            if trade_date_to is not None:
                # Drop the last day; the next interval starts there
                val_s = val_s.iloc[:-1]
            val_s_list.append(val_s)

        # Combine this sector's pieces into a single Series
        if val_s_list:
            logger.debug('%s %d 个交易日合并数据', sector_name, len(trade_date_list))
            sector_trade_date_val_dic[sector_name] = pd.concat(val_s_list)

    # Combine all sectors into one DataFrame (one column per sector)
    logger.debug('合并 %d 个行业数据', sector_count)
    data_df = pd.DataFrame(sector_trade_date_val_dic)
    # ``to_excel`` has no ``legend`` argument; passing it raised TypeError
    # before anything was written or plotted.
    # NOTE(review): writing the legacy .xls format needs an engine that
    # recent pandas releases may not ship -- confirm for the target environment.
    data_df.to_excel('median.xls')
    data_df.plot()
    plt.show()
        for key, v in col_merge_rule_dic.items()]
    for _, data_s in data_df.T.items():
        data_list.append({key: handler(data_s) for key, handler in col_handler_list})

    return pd.DataFrame(data_list)


def get_ifind_daily_df(table_name, date_from) -> pd.DataFrame:
    """Read ``table_name`` into a DataFrame, optionally restricted by ``time``.

    :param table_name: table to select from; inserted into the SQL text as-is
    :param date_from: when not ``None``, only rows with ``time >= date_from``
        are returned (passed as a bound query parameter)
    :return: the query result as a DataFrame
    """
    select_all = "select * from {table_name}".format(table_name=table_name)
    if date_from is None:
        # No lower bound requested: read the whole table
        return pd.read_sql(select_all, engine_md)
    return pd.read_sql(select_all + " where time >= %s", engine_md, params=[date_from])


def get_wind_daily_df(table_name, date_from) -> pd.DataFrame:
    """Read ``table_name`` into a DataFrame, optionally restricted by ``time``.

    :param table_name: table to select from; inserted into the SQL text as-is
    :param date_from: when not ``None``, only rows with ``time >= date_from``
        are returned (passed as a bound query parameter)
    :return: the query result as a DataFrame
    """
    sql_str = f"select * from {table_name}"
    read_kwargs = {}
    if date_from is not None:
        # Add the lower bound and supply its value as a query parameter
        sql_str += " where time >= %s"
        read_kwargs['params'] = [date_from]
    return pd.read_sql(sql_str, engine_md, **read_kwargs)


if __name__ == "__main__":
    # Quick manual check: print every item iter_2_range yields for a short list
    for item in iter_2_range([1, 2, 3]):
        print(item)