Example 1
def get_secs_name_from_sql(sec_ids=[], index_code="A"):
    """
    获取最新日期A股或港股股票名称,数据格式为 {股票代码:股票名称}

    @sec_id (list): 股票列表 默认为空 表示最新日期的所有A股
    @index_code (str): 数据库名称
    :return: {sec_id: sec_name}
    """

    last_date = get_schema('index_contents')[index_code]['end date']

    dbpath = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(last_date[:4]))
    with SqliteProxy(log=False) as proxy:
        proxy.connect(dbpath)
        if len(sec_ids) == 0:  # default: take all securities
            query = "SELECT sec_id, sec_name FROM [{}] WHERE date = '{}'".format(
                index_code, last_date)
        elif len(sec_ids) == 1:
            query = "SELECT sec_id, sec_name FROM [{}] WHERE date = '{}' AND sec_id = '{}'".format(
                index_code, last_date, sec_ids[0])
        else:
            query = "SELECT sec_id, sec_name FROM [{}] WHERE date = '{}' AND sec_id in {}".format(
                index_code, last_date, tuple(sec_ids))
        df = proxy.query_as_dataframe(query).set_index("sec_id")

        if len(df) == 0:
            Logger.warn("Empty result for query contents from {} on {}".format(
                index_code, last_date))

        output = {
            sec: df.at[sec, "sec_name"]
            for sec in sec_ids if sec in df.index
        }
        return output
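A minimal usage sketch for get_secs_name_from_sql; the import path and the security codes below are illustrative assumptions, not part of the original source:

# Hypothetical usage -- the import path is assumed for illustration.
from data_api import get_secs_name_from_sql

# Names of two A-share codes on the latest date stored in index_contents.
names = get_secs_name_from_sql(sec_ids=["600000.SH", "000001.SZ"], index_code="A")
print(names)  # e.g. {"600000.SH": "...", "000001.SZ": "..."}

# An empty sec_ids list returns all A-shares on the latest date.
all_names = get_secs_name_from_sql(index_code="A")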
Example 2
def get_index_weights(index_code, date=""):
    """
    读取单个日期指数成分股的权重

    @index_code (str): 指数代码,目前支持 ['000016.SH', '000300.SH', '000905.SH']
    @date (%Y-%m-%d): 单个日期
    :return: {sec_id: weight}
    """

    if not date:
        Logger.error("Empty date")
        raise ValueError

    if index_code not in ['000016.SH', '000300.SH', '000905.SH']:
        Logger.error("Invalid index code: {}".format(index_code))
        raise ValueError

    dbpath = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(date[:4]))
    with SqliteProxy(log=False) as proxy:
        proxy.connect(dbpath)
        query = "SELECT sec_id, weight FROM [{}] WHERE date = '{}' ".format(
            index_code, date)
        df = proxy.query_as_dataframe(query)

        if len(df) == 0:
            Logger.warn("Empty result when reading {} at {}".format(
                index_code, date))
            output = {}
        else:
            output = {
                df.at[i, 'sec_id']: df.at[i, 'weight']
                for i in range(len(df))
            }

        return output
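A minimal usage sketch; the import path and the example date are assumptions:

# Hypothetical usage -- import path and date are assumed.
from data_api import get_index_weights

weights = get_index_weights("000300.SH", date="2018-06-29")
# Sanity check: number of constituents and the total weight.
print(len(weights), sum(weights.values()))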
Example 3
def get_factor_return_daily(factor_return_name, trading_days=[]):
    """
    从本地数据库中获取某段日期某个factor_return的日收益率

    @factor_return_name (str): factor名称
    @trading_days (['%Y-%m-%d']): 日期列表
    :return: DataFrame, index: date, columns: [sec_id, group01-group10, factor]
    """

    if factor_return_name not in get_schema(
            "factor_return"):  # check that the given factor_return exists in the local factor library
        Logger.error(
            "{} is not in FACTOR_RETURN library".format(factor_return_name))
        return
    else:
        filepath = os.path.join(DB_FACTOR_RETURN_PATH,
                                "{}.csv".format(factor_return_name))
        df_info = open_csv_as_df(filepath, validate=True)

        if not trading_days:
            output = df_info.copy()
        else:
            output = df_info[df_info.date.isin(trading_days)]
            not_found_dates = set(trading_days) - set(output["date"].tolist())
            if not_found_dates:
                Logger.warn(
                    "Following dates are invalid: {}".format(not_found_dates))
                return
        output = output.set_index(['date'])
        return output
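A minimal usage sketch; the import path and the factor_return name are assumptions for illustration:

# Hypothetical usage -- import path and factor name are assumed.
from data_api import get_factor_return_daily

df = get_factor_return_daily("pe", trading_days=["2018-06-28", "2018-06-29"])
if df is not None:  # the function returns None for unknown names or invalid dates
    print(df.columns.tolist())
    print(df.head())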
Example 4
def update_index_contents_to_sql(index_code,
                                 trading_days,
                                 override,
                                 log=False):
    with SqliteProxy(log=log) as proxy:
        date_classifier = classify_dates_by_year(trading_days)

        for year, date_list in date_classifier.items():
            path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(year))
            proxy.connect(path)
            if index_code not in proxy.list_tables:
                create_table(proxy, "index_contents", index_code)

            # check which dates already exist in the database
            query = "SELECT DISTINCT(date) FROM [{}]".format(index_code)
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())

            for date in date_list:
                if date in lookup and not override:  # the date already exists; skip it unless overriding
                    if log:
                        Logger.warn("{} records on {} already exist.".format(
                            index_code, date))
                    continue

                try:
                    loader = LOADER_MAP[index_code]
                    df = loader(index_code, date)
                    df['date'] = date
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        index_code, date))
                    raise ValueError

                if df is not None:  # data was fetched from Wind successfully
                    try:
                        if date in lookup and override:  # delete existing records when overriding
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}'".format(
                                    index_code, date))

                        proxy.write_from_dataframe(df, index_code)
                    except Exception:
                        Logger.error(
                            "Error occurred when writing {} on {}".format(
                                index_code, date))
                        traceback.print_exc()
                        raise ValueError

                    Logger.info("{} on {} is updated successfully".format(
                        index_code, date))
                else:  # failed to fetch data from Wind
                    Logger.error("Fail to fetch {} data on {}".format(
                        index_code, date))
                    raise ValueError

    update_schema('index_contents', index_code)
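A minimal usage sketch for the updater above; the import path is an assumption, while get_trading_days appears elsewhere in this codebase:

# Hypothetical usage -- import path is assumed.
from data_api import update_index_contents_to_sql, get_trading_days

days = get_trading_days("2018-06-01", "2018-06-30")
# Re-download CSI 300 constituents for June 2018, overwriting any rows already stored.
update_index_contents_to_sql("000300.SH", days, override=True, log=True)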
Example 5
def get_index_contents_on_multidays(index_code, trading_days=[], log=False):
    """
    读取多个日期某指数全部股票列表

    @index_code (str): 指数代码,目前支持 ['A', '000905.SH', '000300.SH', '000016.SH']
    @trading_days (['%Y-%m-%d']): 日期列表
    @log (Bool): 是否打印log
    :return: ({date: list}), key为date value为 股票代码列表
    """

    if log:
        Logger.info(
            "Reading all {} records between trading_days ...".format(
                index_code), "green")

    if len(trading_days) == 0:
        Logger.error("Empty date")
        raise ValueError
    elif len(trading_days) == 1:
        date = trading_days[0]
        return {date: get_index_contents(index_code, date, log=False)}

    output = {}
    if index_code in IDXCONT_AS_SQL:
        with SqliteProxy(log=log) as proxy:
            for year, date_list in classify_dates_by_year(
                    trading_days).items():
                path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(year))
                proxy.connect(path)

                query = "SELECT date, sec_id FROM [{}] WHERE date IN {}".format(
                    index_code, tuple(date_list))
                try:
                    df = proxy.query_as_dataframe(query)
                except Exception:
                    Logger.error(
                        "Empty result when reading {} from {} to {}".format(
                            index_code, trading_days[0], trading_days[-1]))
                    traceback.print_exc()
                    raise ValueError

                if len(df) == 0:
                    Logger.warn(
                        "Empty result when reading {} from {} to {}".format(
                            index_code, trading_days[0], trading_days[-1]))

                for date in date_list:
                    output[date] = df[df.date == date]['sec_id'].tolist()
    elif index_code in IDXCONT_AS_CSV:
        info = get_index_contents_from_csv(index_code)
        output = {date: info for date in trading_days}
    else:
        Logger.error("Unrecognized index code: {}".format(index_code))
        raise ValueError
    return output
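A minimal usage sketch; the import path and dates are assumptions:

# Hypothetical usage -- import path is assumed.
from data_api import get_index_contents_on_multidays

contents = get_index_contents_on_multidays("000905.SH",
                                           trading_days=["2018-06-28", "2018-06-29"])
for date, sec_ids in contents.items():
    print(date, len(sec_ids))  # date -> number of constituents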
Example 6
def update_indicators(indicators=[],
                      trading_days=[],
                      sec_ids=[],
                      override=False,
                      log=False):
    """
    更新多个indicator的指定日期列表的数据

    @indicators (list): indicator的名称构成的列表
    @trading_days ([%Y-%m-%d]): 日期列表
    @override (Bool): 是否覆盖原记录 默认为False 表示不覆盖
    @log (Bool): 是否打印log
    """

    SCHEMA = get_schema('indicator')
    if not indicators:
        indicators = list(SCHEMA.keys())

    start = trading_days[0]
    end = trading_days[-1]

    update_days_map = {
        "财报数据": set(get_report_days(start, end)),
        "时间序列": set(get_trading_days(start, end)),
    }

    for ind in indicators:
        if ind in SCHEMA:
            # intersect the requested dates with the valid update days for this indicator type
            itype = SCHEMA[ind]['type']
            update_days = [
                t for t in trading_days if t in update_days_map[itype]
            ]
            if not update_days:
                Logger.warn("No valid days to update!")
            else:
                update_single_indicator(indicator=ind,
                                        trading_days=update_days,
                                        sec_ids=sec_ids,
                                        override=override,
                                        log=log)
        else:
            Logger.error("Unrecognized indicator: {}".format(ind))
Example 7
def calculate_factor(factor, date):
    """
    通过对indicator的计算得到因子的值

    :param: factor (str): 该factor的名字
    :param: date (%Y-%m-%d): 日期
    :return: dataframe 处理后的因子值
    """
    func = getattr(formula, "calculate_raw_{}".format(factor), None)
    if func is None:
        Logger.error("Formula not implemented: {}".format(factor))
        raise ValueError
    context, df_today, missing_flag = load_context(factor, date)
    last_day = get_previous_existed_day_in_table(date, DB_FACTOR, factor)
    if missing_flag == 1:
        if last_day is None:  # no earlier record available
            Logger.error("Too many values are missing on the current date and there is no earlier record to copy from")
            raise ValueError
        else:
            Logger.warn("Too many values missing on {}; copying directly from {}".format(date, last_day))
            try:
                df_last = get_secs_factor(factor,
                                          sec_ids=[],
                                          date=last_day,
                                          log=False)
            except Exception:
                traceback.print_exc()
                Logger.warn("无法提取 {} 上个记录日的数据".format(factor))
                raise ValueError
            value = df_today.merge(df_last,
                                   how="left",
                                   left_on='sec_id',
                                   right_index=True)
            return value
    else:
        data_raw = func(context)
        data_final = statistical_process(  # data processing: split off missing values, winsorize, standardize
            data=data_raw,
            var=factor,
            winsor_LB=WINSORIZE_LB,
            winsor_UB=WINSORIZE_UB)
        return data_final
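A minimal usage sketch; the import path and factor name are assumptions:

# Hypothetical usage -- import path and factor name are assumed.
from data_api import calculate_factor

# Either freshly computed (winsorized and standardized) factor values,
# or values copied from the previous recorded day when too much data is missing.
df = calculate_factor("pe", "2018-06-29")
print(df.head())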
Example 8
def get_index_contents_from_sql(index_code, date="", log=False):
    path = os.path.join(DB_INDEX_CONTENTS, '{}.db'.format(date[:4]))
    with SqliteProxy(log=log) as proxy:
        proxy.connect(path)
        query = "SELECT sec_id FROM [{}] WHERE date = '{}'".format(
            index_code, date)
        try:
            df = proxy.query_as_dataframe(query)
        except Exception:
            Logger.error("Error occurred when reading {} at {}".format(
                index_code, date))
            traceback.print_exc()
            raise ValueError

    if len(df) == 0:
        Logger.warn("Empty result when reading {} at {}".format(
            index_code, date))
        return []

    return df["sec_id"].tolist()
Example 9
def update_index_std(index, cp=3, log=False):
    """
    更新index_std
    更新原理: 无需指定trading_days 更新全部index中有的日期但在index_std中没有的日期
    @index <str>: index名称 不是index_std名称
    @cp <int>: winsorize的临界值
    """

    trading_days = get_unique_datelist_from_table("index", index)
    existed_days = get_unique_datelist_from_table("index_std", "{}_std".format(index))
    update_days = sorted(list(set(trading_days) - set(existed_days)))
    if len(update_days) == 0:
        Logger.warn("All given dates has existed. No need to update!!")
        return
    output = process_ts_index(index, update_days, cp)
    if len(output) == 0:
        Logger.error("Failed to process {} on the given dates".format(index))
        return
    df2mysql(USER, PASSWORD, "index_std", index + '_std', output)
    del output, trading_days, update_days
    gc.collect()
    Logger.info("Updated successfully!!")
Example 10
def get_secs_name(sec_ids=[]):
    """
    获取最新日期股票名称,自动处理A股和H股,数据格式为 {股票代码:股票名称}

    @sec_id (list): 股票列表 默认为空 表示最新日期的所有A股和H股
    :return: {sec_id: sec_name}
    """

    classifier = classify_equity(sec_ids)
    output = {}

    if classifier["A股"]:
        output.update(get_secs_name_from_sql(classifier["A股"], "A"))

    if classifier["港股"]:
        output.update(get_secs_name_from_csv(classifier["港股"], "H"))

    if classifier["其他"]:
        Logger.warn("Unrecognized sec_ids: {}".format(classifier["其他"]))

    return output
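A minimal usage sketch; the import path and the code formats below are assumptions:

# Hypothetical usage -- import path and security codes are assumed.
from data_api import get_secs_name

# A-share and HK codes are classified and dispatched automatically;
# anything unrecognized is logged as a warning and left out of the result.
names = get_secs_name(["600000.SH", "00700.HK", "NOT_A_CODE"])
print(names)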
Example 11
def get_secs_industry(industry_code, sec_ids=[], date=""):
    """
    获取某日期某些股票的的行业分类信息,数据格式 {股票代码:行业分类}

    @industry_code (str): 子数据库名称,目前支持 ["A_SWL1", "H_SWL1", "H_GICSL1"]
    @sec_ids: (list) 股票列表
    @date: ("%Y-%m-%d") 单个日期
    return: {sec_id: industry},不存在则忽略
    """

    if len(sec_ids) == 0:
        Logger.warn("Empty sec_ids when reading {} on {}!".format(industry_code, date))
        return {}

    if industry_code in INDUSTRY_AS_SQL:
        output = get_secs_industry_from_sql(industry_code, sec_ids, date)
    elif industry_code in INDUSTRY_AS_JSON:
        output = get_secs_industry_from_json(industry_code, sec_ids)
    else:
        Logger.error("Unrecognized industry code: {}".format(industry_code))
        raise ValueError
    return output
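A minimal usage sketch; the import path, codes, and date are assumptions:

# Hypothetical usage -- import path is assumed.
from data_api import get_secs_industry

industries = get_secs_industry("A_SWL1",
                               sec_ids=["600000.SH", "000001.SZ"],
                               date="2018-06-29")
print(industries)  # {sec_id: industry}; securities without a record are simply absent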
Example 12
def get_secs_industry_SWL1(sec_ids=[], date=""):
    """
    获取某日期某些股票的的申万一级行业分类信息,自动处理A股和H股,数据格式 {股票代码:行业分类}

    @sec_id: (list) 股票列表
    @date: (%Y-%m-%d) 单个日期
    """

    if len(sec_ids) == 0:
        Logger.warn("Empty sec_ids when reading SWL1 on {}!".format(date))
        return {}

    classifier = classify_equity(sec_ids)

    output_A = {}
    if classifier['A股']:
        output_A = get_secs_industry(industry_code="A_SWL1", sec_ids=classifier['A股'], date=date)

    output_H = {}
    if classifier['港股']:
        output_H = get_secs_industry(industry_code="H_SWL1", sec_ids=classifier['港股'], date=date)

    output_A.update(output_H)
    return output_A
Example 13
def update_single_indicator(indicator,
                            sec_ids=[],
                            trading_days=[],
                            override=False,
                            log=False):
    """
    更新单个indicator的指定日期列表的数据

    @indicator (str): 单个indicator的名称
    @sec_ids<list> : 股票代码列表
    @trading_days ([%Y-%m-%d]): 日期列表
    @override (Bool): 是否覆盖原记录 默认为False 表示不覆盖
    @log (Bool): 是否打印log
    """

    if log:
        Logger.info("Updating indicator {}".format(indicator), "green")

    if indicator not in get_schema('indicator'):
        Logger.error("Unrecognized indicator: {}".format(indicator))
        raise ValueError

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    with SqliteProxy(log=log) as proxy:
        date_classifier = classify_dates_by_year(trading_days)

        for year, date_list in date_classifier.items():
            path = os.path.join(DB_INDICATOR, '{}.db'.format(year))
            proxy.connect(path)

            if indicator not in proxy.list_tables:
                create_table(proxy, "indicator", indicator)

            # check which dates already exist in the database
            if len(date_list) == 1:
                query = "SELECT DISTINCT(date) FROM {} WHERE date = '{}'".format(
                    indicator, date_list[0])
            else:
                query = "SELECT DISTINCT(date) FROM {} WHERE date in {}".format(
                    indicator, tuple(date_list))
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())

            for date in date_list:
                if date in lookup and not override:  # the date already exists; skip it unless overriding
                    if log:
                        Logger.warn("{} records on {} already exist.".format(
                            indicator, date))
                    continue

                try:
                    df = load_single_indicator_on_single_day_from_wind(
                        indicator=indicator, sec_ids=sec_ids, date=date)
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        indicator, date))
                    raise ValueError

                if df is not None:  # data was fetched from Wind successfully
                    if date in lookup and override:  # delete existing records when overriding
                        if len(sec_ids) == 0:
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}'".format(
                                    indicator, date))
                        elif len(sec_ids) == 1:
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}' and sec_id = '{}'"
                                .format(indicator, date, sec_ids[0]))
                        else:
                            proxy.execute(
                                "DELETE FROM [{}] WHERE date = '{}' and sec_id in {}"
                                .format(indicator, date, tuple(sec_ids)))
                    df['date'] = date
                    try:
                        proxy.write_from_dataframe(df, indicator)
                    except Exception:
                        Logger.error(
                            "Error occurred when writing {} on {}".format(
                                indicator, date))
                        traceback.print_exc()
                        raise ValueError
                    if log:
                        Logger.info("{} on {} is updated successfully".format(
                            indicator, date))

                else:  # failed to fetch data from Wind
                    Logger.error("Fail to fetch {} data on {}".format(
                        indicator, date))
                    raise ValueError

    update_schema(db_name="indicator", sub_name=indicator)

    if log:
        Logger.info("indicator {} is updated.".format(indicator),
                    color="green")
        Logger.info("------------------------------------------")
Example 14
def update_single_factor(factor, trading_days=[], override=False, log=False):
    """
    更新单个factor的指定日期列表的数据

    @factor (str): factor名称
    @trading_days ([%Y-%m-%d]): 日期列表
    @override (Bool): 是否覆盖原记录,默认为False,表示不覆盖
    @log (Bool): 是否打印log
    """

    Logger.info("Updating factor {}".format(factor), "green")

    _n_updated_date = 0

    if factor not in get_schema('factor'):
        Logger.error("Unrecognized factor: {}".format(factor))
        raise ValueError

    if not trading_days:
        Logger.error("Empty date")
        raise ValueError

    with SqliteProxy(log=log) as proxy:
        date_classifier = classify_dates_by_year(trading_days)

        for year, date_list in date_classifier.items():
            path = os.path.join(DB_FACTOR, '{}.db'.format(year))
            proxy.connect(path)

            if factor not in proxy.list_tables:
                create_table(proxy, "factor", factor)

            # check which dates already exist in the database
            if len(date_list) == 1:
                query = "SELECT DISTINCT(date) FROM {} WHERE date = '{}'".format(
                    factor, date_list[0])
            else:
                query = "SELECT DISTINCT(date) FROM {} WHERE date in {}".format(
                    factor, tuple(date_list))
            lookup = proxy.query_as_dataframe(query)
            lookup = set(lookup['date'].tolist())

            for date in date_list:
                if date in lookup and not override:  # the date already exists; skip it unless overriding
                    if log:
                        Logger.warn("{} records on {} already exist.".format(
                            factor, date))
                    continue

                try:
                    df = load_single_factor_on_single_day(factor=factor,
                                                          date=date)
                except Exception:
                    Logger.error("Error occurred when loading {} on {}".format(
                        factor, date))
                    traceback.print_exc()
                    continue

                if df is not None:  # factor data loaded successfully
                    if date in lookup and override:  # delete existing records when overriding
                        proxy.execute(
                            "DELETE FROM [{}] WHERE date = '{}'".format(
                                factor, date))

                    df['date'] = date
                    try:
                        proxy.write_from_dataframe(df, factor)
                    except Exception:
                        Logger.error(
                            "Error occurred when writing {} on {}".format(
                                factor, date))
                        traceback.print_exc()
                        raise ValueError

                    if log:
                        Logger.info("{} on {} is updated successfully".format(
                            factor, date))
                    _n_updated_date += 1
                else:  # failed to fetch data from Wind
                    Logger.error("Fail to fetch {} data on {}".format(
                        factor, date))
                    raise ValueError

    update_schema(db_name="factor", sub_name=factor)

    if log:
        _n_all_date = len(trading_days)
        _n_existed_date = _n_all_date - _n_updated_date
        Logger.info("传入日期数:{}  已经存在个数:{}  实际写入次数:{}".format(
            _n_all_date, _n_existed_date, _n_updated_date))
        Logger.info("factor {} is updated.".format(factor), color="green")
        Logger.info("------------------------------------------")
Example 15
def update_single_factor_return(factor_return,
                                trading_days=[],
                                group_num=10,
                                log=True):
    """
    根据trading_days更新factor_return数据

    @factor_return (<str>): factor的名称
    @trading_days (<[%Y-%m-%d]>) : 日期列表
    @group_num (<int>): 分组个数
    """

    if log:
        Logger.info("Updating factor_return {}...".format(factor_return))

    if factor_return not in get_schema("factor_return"):
        Logger.error("Unrecognized factor_return: {}".format(factor_return))
        return
    factor_path = DB_PATH_LIB['factor']
    factor_exist_dates = get_date_lists_in_table(factor_path, factor_return)
    not_found_date = list(set(trading_days) - set(factor_exist_dates))
    if len(not_found_date) != 0 and log:
        Logger.warn(
            "Failed to update factor returns on the following dates because the factor data is missing: {}"
            .format(not_found_date))

    trading_days = list(set(trading_days) - set(not_found_date))
    if len(trading_days) == 0:
        Logger.error("No valid date to update")
        return

    trading_days = sorted(trading_days)
    db_factor_return_path = os.path.join(DB_PATH, "factor_return")
    filepath = os.path.join(db_factor_return_path,
                            '{}.csv'.format(factor_return))
    df_new = load_single_factor_return_on_multidays(factor_return,
                                                    trading_days, group_num)

    _n_updated_date = len(df_new)

    if not os.path.exists(filepath):  # no previously updated records
        Logger.info("Updating {} data for the first time".format(factor_return))
        output = df_new.copy()
        output.to_csv(filepath, encoding="utf-8")
    else:
        df_old = pd.read_csv(filepath, encoding="utf-8")  # all previously stored return data
        min_exist_date = normalize(df_old['date'].min(), "%Y-%m-%d")
        max_exist_date = normalize(df_old['date'].max(), "%Y-%m-%d")
        max_update_date = trading_days[-1]
        min_update_date = trading_days[1]  # returns can only be computed from the second date onward

        if (max(min_update_date, max_update_date) < min_exist_date) or \
           (min(min_update_date, max_update_date) > max_exist_date):
            Logger.error("非法更新:待更新时间段孤立于现有的时间段")
            Logger.error("开始更新日期:{}  结束更新日期:{}".format(min_update_date,
                                                       max_update_date))
            Logger.error("原有开始日期:{}  原有结束日期:{}".format(min_exist_date,
                                                       max_exist_date))
            return

        if (min_update_date < min_exist_date) and \
           (max_update_date <= max_exist_date) and \
           (max_update_date >= min_exist_date):
            Logger.info("左更新:更新之前记录")

        if (min_update_date >= min_exist_date) and (max_update_date <=
                                                    max_exist_date):
            Logger.info("存量更新:更新当前已经有的记录")

        if (min_update_date >= min_exist_date) and \
           (min_update_date <= max_exist_date) and \
           (max_update_date > max_exist_date):
            Logger.info("右更新:更新未来的记录")

        if (min_update_date < min_exist_date) and \
           (max_update_date > max_exist_date):
            Logger.info("全更新:当前已经存在的日期是待更新日期的子集")

        df_old['date'] = df_old['date'].apply(
            lambda x: normalize(x, "%Y-%m-%d"))
        df_new['date'] = df_new['date'].apply(
            lambda x: normalize(x, "%Y-%m-%d"))
        bool_list = df_old['date'].isin(df_new['date']).apply(
            lambda x: not x)  # True where an existing date is not among the update dates
        # Keep existing dates that fall outside this update; they are copied as-is without recomputation.
        df_old = df_old[bool_list]
        output = df_old.append(df_new).sort_values(by=['date'])

    output = output.set_index(['date'])
    format_var_name_list = [
        'group{:0>2}'.format(i) for i in range(1, group_num + 1)
    ]
    format_var_name_list.append('{}'.format(factor_return))
    output = output.reindex(columns=format_var_name_list)
    output.to_csv(filepath, encoding="utf-8")

    update_factor_return_schema(factor_return)
    if log:
        _n_all_date = len(output)
        _n_existed_date = _n_all_date - _n_updated_date

        Logger.info("传入日期数:{}  已经存在个数:{}  实际写入次数:{}".format(
            _n_all_date, _n_existed_date, _n_updated_date))
        Logger.info("factor_return {} is updated.".format(factor_return),
                    color="green")
        Logger.info("------------------------------------------")