Example #1
0
def merge_ifind_stock_daily(ths_code_set: set = None, date_from=None):
    """将ds his 以及财务数据合并为 daily 数据"""
    table_name = 'ifind_stock_daily'
    logging.info("合成 %s 开始", table_name)
    has_table = engine_md.has_table(table_name)
    if date_from is None and has_table:
        sql_str = "select adddate(max(`time`),1) from {table_name}".format(
            table_name=table_name)
        with with_db_session(engine_md) as session:
            date_from = date_2_str(session.execute(sql_str).scalar())
    # 獲取各個表格數據
    ifind_his_df = get_ifind_daily_df('ifind_stock_daily_his', date_from)
    ifind_ds_df = get_ifind_daily_df('ifind_stock_daily_ds', date_from)
    ifind_report_date_df = get_ifind_report_date_df('ifind_stock_report_date',
                                                    None)
    ifind_fin_df = get_ifind_daily_df('ifind_stock_fin', None)
    ifind_fin_df_g = ifind_fin_df.groupby('ths_code')
    ths_code_set_4_daily = set(ifind_fin_df_g.size().index)
    # 合并 ds his 数据
    ifind_his_ds_df = pd.merge(ifind_his_df,
                               ifind_ds_df,
                               how='outer',
                               on=['ths_code', 'time'])  # 拼接後續有nan,無數據
    ifind_his_ds_df_g = ifind_his_ds_df.groupby('ths_code')
    logger.debug("提取数据完成")
    # 计算 财报披露时间
    report_date_dic_dic = {}
    for report_date_g in [
            ifind_report_date_df.groupby(
                ['ths_code', 'ths_regular_report_actual_dd_stock'])
    ]:
        for num, ((ths_code, report_date), data_df) in enumerate(report_date_g,
                                                                 start=1):
            if ths_code_set is not None and ths_code not in ths_code_set:
                continue
            if is_nan_or_none(report_date):
                continue
            report_date_dic = report_date_dic_dic.setdefault(ths_code, {})
            if ths_code not in ths_code_set_4_daily:
                logger.error('fin 表中不存在 %s 的財務數據', ths_code)
                continue
            ifind_fin_df_temp = ifind_fin_df_g.get_group(ths_code)
            if report_date not in report_date_dic_dic:
                ifind_fin_df_temp = ifind_fin_df_temp[
                    ifind_fin_df_temp['time'] <= report_date]
                if ifind_fin_df_temp.shape[0] > 0:
                    report_date_dic[
                        report_date] = ifind_fin_df_temp.sort_values(
                            'time').iloc[0]

    # # 设置 dtype
    dtype = {'report_date': Date}
    for dic in [
            DTYPE_STOCK_DAILY_DS, DTYPE_STOCK_REPORT_DATE,
            DTYPE_STOCK_DAILY_FIN, DTYPE_STOCK_DAILY_HIS
    ]:
        for key, val in dic.items():
            dtype[key] = val

    logger.debug("计算财报日期完成")
    # 整理 data_df 数据
    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(
        report_date_dic_dic)
    try:
        for num, (ths_code,
                  report_date_dic) in enumerate(report_date_dic_dic.items(),
                                                start=1):  # key:ths_code
            # TODO: 檢查判斷 ths_code 是否存在在ifind_fin_df_g 裏面,,size暫時使用  以後在驚醒改進
            if ths_code not in ifind_his_ds_df_g.size():
                logger.error('fin 表中不存在 %s 的財務數據', ths_code)
                continue
            # open low  等 is NAN 2438
            ifind_his_ds_df_cur_ths_code = ifind_his_ds_df_g.get_group(
                ths_code)  # shape[1] 30
            logger.debug('%d/%d) 处理 %s %d 条数据', num, for_count, ths_code,
                         ifind_his_ds_df_cur_ths_code.shape[0])
            report_date_list = list(report_date_dic.keys())
            report_date_list.sort()
            for report_date_from, report_date_to in iter_2_range(
                    report_date_list):
                logger.debug('%d/%d) 处理 %s [%s - %s]', num, for_count,
                             ths_code, date_2_str(report_date_from),
                             date_2_str(report_date_to))
                # 计算有效的日期范围
                if report_date_from is None:
                    is_fit = ifind_his_ds_df_cur_ths_code[
                        'time'] < report_date_to
                elif report_date_to is None:
                    is_fit = ifind_his_ds_df_cur_ths_code[
                        'time'] >= report_date_from
                else:
                    is_fit = (ifind_his_ds_df_cur_ths_code['time'] <
                              report_date_to) & (
                                  ifind_his_ds_df_cur_ths_code['time'] >=
                                  report_date_from)
                # 获取日期范围内的数据
                ifind_his_ds_df_segment = ifind_his_ds_df_cur_ths_code[
                    is_fit].copy()
                segment_count = ifind_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                fin_s = report_date_dic[
                    report_date_from] if report_date_from is not None else None
                for key in DTYPE_STOCK_DAILY_FIN.keys():
                    if key in ('ths_code', 'time'):
                        continue
                    ifind_his_ds_df_segment[key] = fin_s[
                        key] if fin_s is not None and key in fin_s else None
                ifind_his_ds_df_segment['report_date'] = report_date_from
                # 添加数据到列表
                data_df_list.append(ifind_his_ds_df_segment)
                data_count += segment_count

            if DEBUG and len(data_df_list) > 1:
                break

            # 保存数据库
            if data_count > 10000:
                # 保存到数据库
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(
                    data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []

    finally:
        # 保存到数据库
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(
                data_df, table_name, engine_md, dtype)
            tot_data_count += data_count

        logger.info('%s 新增或更新记录 %d 条', table_name, tot_data_count)
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
Example #2
0
def merge_daily_balancesheet(ts_code_set: set=None):
    """
    合並 數據日表與 季度balancesheet 表格數據
    :param data_from:
    :return:
    """
    table_name = 'tushare_merge_daily_bala'
    data_balancesheet_df = get_tushare_daily_df('tushare_stock_balancesheet', None)
    tushare_daily_df = get_tushare_daily_df('tushare_stock_daily', None)
    # data_merge_two_df = pd.merge(data_balancesheet_df, tushare_daily_df)
    tushare_daily_df_g = tushare_daily_df.groupby('ts_code')
    data_balancesheet_df_g = data_balancesheet_df.groupby('ts_code')
    report_date_dic_dic = {}

    for report_date_g in [data_balancesheet_df.groupby(['ts_code', "f_ann_date"])]:  # one date one date('ocde'.'open')
        for num, ((ts_code, report_date), data_df) in enumerate(report_date_g, start=1):
            if is_nan_or_none(report_date):
                continue
            if ts_code_set is not None and ts_code not in ts_code_set:
                continue
            report_date_dic = report_date_dic_dic.setdefault(ts_code, {})
            temp = data_balancesheet_df_g.get_group(ts_code)
            report_date_dic[report_date] = temp.sort_values('f_ann_date').iloc[0]

    # report_date = data_balancesheet_df.groupby("f_ann_date").count().index
    # ts_code = [ts_code for ts_code in data_balancesheet_df.groupby("f_ann_date").count()['ts_code']]
    # # 设置 dtype
    dtype = {}
    for dic in [DTYPE_TUSHARE_SUSPEND, DTYPE_TUSHARE_DAILY_BASIC, DTYPE_TUSHARE_DAILY]:
        for key, val in dic.items():
            dtype[key] = val

    logging.debug('數據整理')
    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(report_date_dic_dic)
    try:
        for num, (ts_code, report_date_dic) in enumerate(report_date_dic_dic.items(), start=1):  # key:ts_code
            # open low  等 is NAN 2438
            if ts_code not in tushare_daily_df_g.size():
                continue
            tushare_his_ds_df_cur_ts_code = tushare_daily_df_g.get_group(ts_code)  # shape[1] 30
            logger.debug('%d/%d) 处理 %s %d 条数据', num, for_count, ts_code, tushare_his_ds_df_cur_ts_code.shape[0])
            report_date_list = list(report_date_dic.keys())
            report_date_list.sort()
            for report_date_from, report_date_to in generate_range(report_date_list):
                logger.debug('%d/%d) 处理 %s [%s - %s]',
                             num, for_count, ts_code, date_2_str(report_date_from), date_2_str(report_date_to))
                if report_date_from is None:
                    is_fit = tushare_his_ds_df_cur_ts_code['trade_date'] < report_date_to
                elif report_date_to is None:
                    is_fit = tushare_his_ds_df_cur_ts_code['trade_date'] >= report_date_from
                else:
                    is_fit = (tushare_his_ds_df_cur_ts_code['trade_date'] < report_date_to) & (
                            tushare_his_ds_df_cur_ts_code['trade_date'] >= report_date_from)
                    # 获取日期范围内的数据
                ifind_his_ds_df_segment = tushare_his_ds_df_cur_ts_code[is_fit].copy()
                segment_count = ifind_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                fin_s = report_date_dic[report_date_from] if report_date_from is not None else None
                for key in DTYPE_TUSHARE_STOCK_BALABCESHEET.keys():
                    if key in ('ths_code', 'time'):
                        continue
                    ifind_his_ds_df_segment[key] = fin_s[key] if fin_s is not None and key in fin_s else None
                ifind_his_ds_df_segment['report_date'] = report_date_from
                # 添加数据到列表
                data_df_list.append(ifind_his_ds_df_segment)

                data_count += segment_count
                if report_date_from is '2018-08-16':
                    break

            if DEBUG and len(data_df_list) > 1:
                break

                # 保存数据库
            if data_count > 10000:
                # 保存到数据库
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []

    finally:
        # 保存到数据库
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(data_df, table_name, engine_md, dtype)
            tot_data_count += data_count
        #
        logger.info('%s 新增或更新记录 %d 条', table_name, tot_data_count)
def merge_tushare_daily(ths_code_set: set = None, date_from=None):
    table_name = 'tushare_daily'
    logging.info("合成 %s 开始", table_name)
    has_table = engine_md.has_table(table_name)
    if date_from is None and has_table:
        sql_str = "select adddate(max(`trade_date`),1) from {table_name}".format(
            table_name=table_name)
        with with_db_session(engine_md) as session:
            date_from = date_2_str(session.execute(sql_str).scalar())
    # 獲取各個表格數據
    # daily
    tushare_his_df = get_tushare_daily_df('tushare_stock_daily', date_from)
    tushare_ds_df = get_tushare_daily_df('tushare_daily_basic', date_from)
    tushare_suspend_df = get_tushare_daily_df('tushare_suspend', None)
    # quarterly

    tushare_merge_df = get_tushare_daily_df('tushare_stock_cashflow', None)

    # tushare_balancesheet_df = get_tushare_daily_df('tushare_stock_balancesheet', None)
    # tushare_icome_df = get_tushare_daily_df('tushare_stock_income', None)
    # tushare_indicator_df = get_tushare_daily_df('tushare_stock_fina_indicator', None)
    # tushare_merge_quarterly_df = merge_tushare_quarterly(None)
    # #
    # tushare_merge_df_one = pd.merge(tushare_cashflow_df, tushare_balancesheet_df,
    #                                 how='outer', on=['ts_code', 'end_date'])
    # tushare_merge_df_two = pd.merge(tushare_merge_df_one, tushare_icome_df,
    #                                 how='outer', on=['ts_code', 'end_date'])
    # tushare_merge_df = pd.merge(tushare_merge_df_two, tushare_indicator_df,
    #                             how='outer', on=['ts_code', 'end_date'])

    tushare_his_dis_df = pd.merge(tushare_his_df,
                                  tushare_ds_df,
                                  how='outer',
                                  on=[
                                      'ts_code',
                                      'trade_date',
                                  ])
    tushare_his_ds_df = pd.merge(tushare_his_dis_df,
                                 tushare_suspend_df,
                                 how='outer',
                                 on='ts_code')

    tushare_merge_df_g = tushare_merge_df.groupby('ts_code')
    tushare_his_ds_df_g = tushare_his_ds_df.groupby('ts_code')
    logging.debug('提取数据完成')
    merge_date_dic_dic = {}
    # # 計算財務紕漏事件
    for suspend_date_g in [tushare_merge_df.groupby(['ts_code', 'ann_date'])]:
        for num, ((ts_code, ann_date), date_df) in enumerate(suspend_date_g):
            if ths_code_set is not None and ts_code not in ths_code_set:
                continue
            if is_nan_or_none(ann_date):
                continue
            suspend_date_dic = merge_date_dic_dic.setdefault(ts_code, {})
            if ts_code not in tushare_merge_df_g.size():
                logger.error('hebing 表中不存在 %s 的財務數據', ts_code)
                continue
            tushare_merge_df_temp = tushare_merge_df_g.get_group(ts_code)
            if ann_date not in merge_date_dic_dic:
                tushare_merge_df_temp = tushare_merge_df_temp[
                    tushare_merge_df_temp['f_ann_date'] <= ann_date]
                if tushare_merge_df_temp.shape[0] > 0:
                    suspend_date_dic[
                        ann_date] = tushare_merge_df_temp.sort_values(
                            'f_ann_date').iloc[0]
    # # 設置dtype
    dtype = {'ann_date': Date}

    for dic in [
            DTYPE_TUSHARE_STOCK_DAILY_MD,
            DTYPE_TUSHARE_STOCK_DAILY_BASIC,
            DTYPE_TUSHARE_SUSPEND,
            # DTYPE_TUSHARE_STOCK_INCOME,
            # DTYPE_TUSHARE_STOCK_BALABCESHEET,
            DTYPE_TUSHARE_CASHFLOW
    ]:
        for key, val in dic.items():  # len(dic)12
            dtype[key] = val
    logging.debug("计算财报日期完成")

    # 整理 data_df 数据
    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(
        merge_date_dic_dic)
    try:
        for num, (ts_code, report_date_dic) in enumerate(
                merge_date_dic_dic.items(),
                start=1):  # key:ts_code nan 較多  列明: nan
            # TODO: size暫時使用  以後在驚醒改進
            if ts_code not in tushare_his_ds_df_g.size():
                logger.error('suspend 表中不存在 %s 的財務數據', ts_code)
                continue
            tushare_his_ds_df_cur_ths_code = tushare_his_ds_df_g.get_group(
                ts_code)  # shape[1] 30
            logger.debug('%d/%d) 处理 %s %d 条数据', num, for_count, ts_code,
                         tushare_his_ds_df_cur_ths_code.shape[0])
            report_date_list = list(suspend_date_dic.keys())
            report_date_list.sort()
            for report_date_from, report_date_to in generate_range(
                    report_date_list):
                logger.debug('%d/%d) 处理 %s [%s - %s]', num, for_count, ts_code,
                             date_2_str(report_date_from),
                             date_2_str(report_date_to))
                # 计算有效的日期范围
                if report_date_from is None:
                    is_fit = tushare_his_ds_df_cur_ths_code[
                        'trade_date'] < report_date_to
                elif report_date_to is None:
                    is_fit = tushare_his_ds_df_cur_ths_code[
                        'trade_date'] >= report_date_from
                else:
                    is_fit = (tushare_his_ds_df_cur_ths_code['trade_date'] <
                              report_date_to) & (
                                  tushare_his_ds_df_cur_ths_code['trade_date']
                                  >= report_date_from)
                # 获取日期范围内的数据
                tushare_his_ds_df_segment = tushare_his_ds_df_cur_ths_code[
                    is_fit].copy()
                segment_count = tushare_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                fin_s = report_date_dic[
                    report_date_from] if report_date_from is not None else None
                #################################################
                for key in (
                        DTYPE_TUSHARE_STOCK_DAILY_MD.keys()
                        and DTYPE_TUSHARE_STOCK_DAILY_BASIC.keys()
                        and DTYPE_TUSHARE_SUSPEND.keys()
                        and DTYPE_TUSHARE_CASHFLOW.keys()
                        # and
                        #  DTYPE_TUSHARE_STOCK_BALABCESHEET.keys() and DTYPE_TUSHARE_STOCK_INCOME.keys()
                ):
                    if key in ('ts_code', 'trade_date'):
                        continue
                    tushare_his_ds_df_segment[key] = fin_s[
                        key] if fin_s is not None and key in fin_s else None
                tushare_his_ds_df_segment['ann_date'] = report_date_from
                # 添加数据到列表
                data_df_list.append(tushare_his_ds_df_segment)
                data_count += segment_count

            if DEBUG and len(data_df_list) > 1:
                break

            # 保存数据库
            if data_count > 10000:
                # 保存到数据库
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(
                    data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []

    finally:
        # 保存到数据库   report_date
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(
                data_df, table_name, engine_md, dtype)
            tot_data_count += data_count

        logger.info('%s 新增或更新记录 %d 条', table_name, tot_data_count)
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])
Example #4
0
def merge_tushare_stock_daily(ths_code_set: set = None, date_from=None):
    """A股行情数据、财务信息 合并成为到 日级别数据"""
    table_name = 'tushare_stock_daily'
    logging.info("合成 %s 开始", table_name)
    has_table = engine_md.has_table(table_name)
    if date_from is None and has_table:
        sql_str = "select adddate(max(`time`),1) from {table_name}".format(
            table_name=table_name)
        with with_db_session(engine_md) as session:
            date_from = date_2_str(session.execute(sql_str).scalar())

    # 获取日级别数据
    # TODO: 增加 ths_code_set 参数
    daily_df, dtype_daily = get_tushare_daily_merged_df(
        ths_code_set, date_from)

    daily_df_g = daily_df.groupby('ts_code')
    ths_code_set_4_daily = set(daily_df_g.size().index)

    # 获取合并后的财务数据
    ifind_fin_df, dtype_fin = get_tushre_merge_stock_fin_df()

    # 整理 dtype
    dtype = dtype_daily.copy()
    dtype.update(dtype_fin)
    logging.debug("提取财务数据完成")
    # 计算 财报披露时间
    report_date_dic_dic = {}
    for num, ((ths_code, report_date), data_df) in enumerate(
            ifind_fin_df.groupby(['ts_code', 'f_ann_date']), start=1):
        if ths_code_set is not None and ths_code not in ths_code_set:
            continue
        if is_nan_or_none(report_date):
            continue
        report_date_dic = report_date_dic_dic.setdefault(ths_code, {})
        if report_date not in report_date_dic_dic:
            if data_df.shape[0] > 0:
                report_date_dic[report_date] = data_df.iloc[0]

    logger.debug("计算财报日期完成")
    # 整理 data_df 数据
    tot_data_count, data_count, data_df_list, for_count = 0, 0, [], len(
        report_date_dic_dic)
    try:
        for num, (ths_code,
                  report_date_dic) in enumerate(report_date_dic_dic.items(),
                                                start=1):  # key:ths_code
            # TODO: 檢查判斷 ths_code 是否存在在ifind_fin_df_g 裏面,,size暫時使用  以後在驚醒改進
            if ths_code not in ths_code_set_4_daily:
                logger.error('fin 表中不存在 %s 的財務數據', ths_code)
                continue

            daily_df_cur_ts_code = daily_df_g.get_group(ths_code)
            logger.debug('%d/%d) 处理 %s %d 条数据', num, for_count, ths_code,
                         daily_df_cur_ts_code.shape[0])
            report_date_list = list(report_date_dic.keys())
            report_date_list.sort()
            report_date_list_len = len(report_date_list)
            for num_sub, (report_date_from, report_date_to) in enumerate(
                    generate_range(report_date_list)):
                logger.debug('%d/%d) %d/%d) 处理 %s [%s - %s]', num, for_count,
                             num_sub, report_date_list_len, ths_code,
                             date_2_str(report_date_from),
                             date_2_str(report_date_to))
                # 计算有效的日期范围
                if report_date_from is None:
                    is_fit = daily_df_cur_ts_code['trade_date'] < report_date_to
                elif report_date_to is None:
                    is_fit = daily_df_cur_ts_code[
                        'trade_date'] >= report_date_from
                else:
                    is_fit = (daily_df_cur_ts_code['trade_date'] <
                              report_date_to) & (
                                  daily_df_cur_ts_code['trade_date'] >=
                                  report_date_from)
                # 获取日期范围内的数据
                ifind_his_ds_df_segment = daily_df_cur_ts_code[is_fit].copy()
                segment_count = ifind_his_ds_df_segment.shape[0]
                if segment_count == 0:
                    continue
                fin_s = report_date_dic[
                    report_date_from] if report_date_from is not None else None
                for key in dtype_fin.keys():
                    if key in ('ts_code', 'trade_date'):
                        continue
                    ifind_his_ds_df_segment[key] = fin_s[
                        key] if fin_s is not None and key in fin_s else None

                ifind_his_ds_df_segment['report_date'] = report_date_from
                # 添加数据到列表
                data_df_list.append(ifind_his_ds_df_segment)
                data_count += segment_count

            if DEBUG and len(data_df_list) > 1:
                break

            # 保存数据库
            if data_count > 10000:
                # 保存到数据库
                data_df = pd.concat(data_df_list)
                data_count = bunch_insert_on_duplicate_update(
                    data_df, table_name, engine_md, dtype)
                tot_data_count += data_count
                data_count, data_df_list = 0, []

    finally:
        # 保存到数据库
        if len(data_df_list) > 0:
            data_df = pd.concat(data_df_list)
            data_count = bunch_insert_on_duplicate_update(
                data_df, table_name, engine_md, dtype)
            tot_data_count += data_count

        logger.info('%s 新增或更新记录 %d 条', table_name, tot_data_count)
        if not has_table and engine_md.has_table(table_name):
            alter_table_2_myisam(engine_md, [table_name])
            build_primary_key([table_name])