Esempio n. 1
0
def _generate_path(code, freq, tdx_code):
    """Build the path of the TDX data file holding bars for one security.

    :param code: security code as it appears in the file name.
    :param freq: bar frequency of the file; one of 'D', '5min' or '1min'.
    :param tdx_code: 'sz' / 'sh' select ``_SZ_DIR`` / ``_SH_DIR``; any other
        value is treated as an extended-market code, which lives under
        ``_DS_DIR`` and uses a ``<tdx_code>#<code>`` file name.
    :return: the file path, or None (after logging) when ``freq`` is not
        one of the supported keys.
    """
    extensions = {
        'D': '.day',
        '5min': '.lc5',
        '1min': '.lc1',
    }

    # Sub-directory per frequency. The 1-minute entry used to be '.minline';
    # the leading dot looked like a slip copied from the extension table and
    # made every 1-minute path point at a different directory name than the
    # 'lday' / 'fzline' siblings.
    sub_dirs = {
        'D': 'lday',
        '5min': 'fzline',
        '1min': 'minline',
    }

    try:
        sub_dir = sub_dirs[freq]
        extension = extensions[freq]
    except KeyError:
        util_log_info('Not supported Frequency {}!'.format(freq))
        return None

    if tdx_code == 'sz':
        root = _SZ_DIR
        stem = tdx_code + code
    elif tdx_code == 'sh':
        root = _SH_DIR
        stem = tdx_code + code
    else:
        root = _DS_DIR
        stem = tdx_code + '#' + code

    dir_name = '{}{}{}'.format(root, os.sep, sub_dir)
    return os.path.join(dir_name, stem + extension)
Esempio n. 2
0
def _get_tdx_code_from_security_dataframe(code, exchange):
    """Look up the ``tdx_code`` of a security in ``SECURITY_DATAFRAME``.

    :param code: index label to look up in ``SECURITY_DATAFRAME``.
    :param exchange: value of the ``exchange`` column used to disambiguate
        when ``code`` is listed more than once.
    :return: a single ``tdx_code`` value, or None (after logging) when
        ``code`` is not in the index. When the code is ambiguous and
        ``exchange`` does not narrow it down to exactly one row, a message
        is logged and the first candidate is returned.
    """
    try:
        recorder = SECURITY_DATAFRAME.loc[code]
    except (KeyError, TypeError):
        util_log_info("Can't get tdx_code from {}".format(code))
        return None

    # A unique index label yields a Series (one row); nothing to disambiguate.
    if isinstance(recorder, pd.Series):
        return recorder['tdx_code']

    candidates = recorder['tdx_code']
    if 'exchange' in recorder.columns:
        matched = recorder.loc[recorder['exchange'] == exchange, 'tdx_code']
        if len(matched) == 1:
            return matched.iloc[0]
        if len(matched) > 1:
            # Still ambiguous, but at least stay within the asked exchange.
            candidates = matched

    util_log_info(
        'Not only one {} in the list , please provide exchange or instrument'
        .format(code))
    # .iloc: every row shares the same index label (the code), so a plain
    # [0] would rely on the deprecated positional fallback of Series[int].
    return candidates.iloc[0]
Esempio n. 3
0
def parse_frequency_str(freq: str) -> str:
    """Normalise a frequency string such as ``'2h'`` or ``'5min'``.

    The string is an optional integer followed by a unit. Recognised units
    and the value returned for each:

    * 'Y', 'y', 'year'                      -> 'Y'
    * 'Q', 'q', 'quarter'                   -> 'Q'
    * 'M', 'month'                          -> 'M'
    * 'W', 'w', 'weeks', 'week'             -> 'w'
    * 'D', 'd', 'days', 'day'               -> 'D'
    * 'H', 'hours', 'hour', 'hr', 'h'       -> '<number * 60>min'
    * 'm', 'minute', 'min', 'minutes', 'T'  -> '<number>min'

    A missing number counts as 1 for the hour and minute units; it is
    ignored for the calendar units. Second and sub-second units are not
    supported.

    :param freq: frequency string to parse.
    :return: the normalised frequency string.
    :raises ValueError: when ``freq`` does not start with an optional number
        followed by letters, or the unit is not recognised.
    """
    try:
        matched = re.match(r"^(?P<number>\d*)(?P<unit>[a-zA-Z]+)", freq)
    except TypeError:
        # Non-string input: report it the same way as a malformed string.
        matched = None

    if matched is None:
        util_log_info('Wrong frequency format: {}'.format(freq))
        raise ValueError('Wrong frequency format: {}'.format(freq))

    number = matched.group('number')
    unit = matched.group('unit')

    if unit in ('Y', 'y', 'year'):
        return 'Y'
    if unit in ('Q', 'q', 'quarter'):
        return 'Q'
    if unit in ('M', 'month'):
        return 'M'
    if unit in ('W', 'w', 'weeks', 'week'):
        return 'w'
    if unit in ('D', 'd', 'days', 'day'):
        return 'D'
    if unit in ('H', 'hours', 'hour', 'hr', 'h'):
        # number is a digit *string*: convert before multiplying, otherwise
        # '2' * 60 repeats the character instead of computing 120.
        minutes = int(number) * 60 if number else 60
        return '{}min'.format(minutes)
    if unit in ('m', 'minute', 'min', 'minutes', 'T'):
        return '{}min'.format(number or 1)

    util_log_info('Wrong frequency format: {}'.format(freq))
    raise ValueError('Wrong frequency format: {}'.format(freq))
Esempio n. 4
0
def get_financial_scores():
    """Score every company from its financial reports of the last 4 years.

    For each code found in the reports, up to 12 rows of
    ``FinancialStruct(df).financial_factor`` are converted into per-indicator
    scores using the limits in ``threshold_dict``; the rows are weighted
    linearly (first row heaviest, weights summing to 10) and summed into the
    'finance' score. The 'holders' score is the sum of the institutional
    holding ratio columns of the first ``holders_factor`` row.

    :return: DataFrame indexed by code with columns 'finance' and 'holders'.
        An empty DataFrame with those columns is returned when
        ``fetch_financial_report`` finds no report.
    """
    def _profit_score(value, limit):
        # 1 above the limit, 0 when merely positive, -1 otherwise (NaN too).
        if value > limit:
            return 1
        return 0 if value > 0 else -1

    def _growth_score(value, limit):
        # Like _profit_score, with an extra -2 step at or below -limit.
        if value > limit:
            return 1
        if value > 0:
            return 0
        return -1 if value > -limit else -2

    def _interest_score(value, bounds):
        # 0 strictly inside (bounds[0], bounds[1]), 1 otherwise.
        # Negative values therefore score 1, which is questionable but was
        # judged by the original author to matter little in practice.
        return 0 if bounds[0] < value < bounds[1] else 1

    holder_columns = [
        'QFIISharesRatio', 'brokerSharesRatio', 'securitySharesRatio',
        'fundsSharesRatio', 'socialSecuritySharesRatio',
        'privateEquitySharesRatio', 'financialCompanySharesRatio',
        'pensionInsuranceAgencySharesRatio'
    ]

    today = datetime.today()
    start = datetime(today.year - 4, today.month,
                     today.day).strftime('%Y-%m-%d')
    total_reports_df = fetch_financial_report(start=start)
    if total_reports_df is None:
        # fetch_financial_report returns None when the query matches nothing.
        util_log_info('No financial report found since {}'.format(start))
        return pd.DataFrame(columns=['finance', 'holders'])

    code_list = total_reports_df.index.get_level_values(
        level=1).drop_duplicates()
    scores_df = pd.DataFrame(index=code_list, columns=['finance', 'holders'])

    for code in code_list:
        try:
            df = total_reports_df.loc[(slice(None), code), :]
        except KeyError:
            continue

        util_log_info("Calculate {} financial scores!".format(code))

        findata = FinancialStruct(df)
        length = min(len(df), 12)
        factor = findata.financial_factor.iloc[:length].reset_index(level=1,
                                                                    drop=True)

        # Linearly decreasing weights (length, length-1, ..., 1) scaled so
        # that they add up to 10, aligned with the factor rows.
        weight_list = list(range(length, 0, -1))
        weight = pd.Series(weight_list, name='weight') * 10 / sum(weight_list)
        weight.index = factor.index

        financial_score_df = pd.DataFrame(index=factor.index,
                                          columns=factor.columns)

        for name in ('ROIC', 'grossProfitMargin', 'netProfitMargin',
                     'netProfitCashRatio'):
            financial_score_df[name] = factor[name].apply(
                _profit_score, args=(threshold_dict[name],))

        for name in ('operatingIncomeGrowth', 'continuedProfitGrowth'):
            financial_score_df[name] = factor[name].apply(
                _growth_score, args=(threshold_dict[name],))

        # Lower is better for these two ratios.
        for name in ('assetsLiabilitiesRatio', 'inventoryRatio'):
            limit = threshold_dict[name]
            financial_score_df[name] = factor[name].apply(
                lambda x, limit=limit: 1 if x < limit else 0)

        cash_limit = threshold_dict['cashRatio']
        financial_score_df['cashRatio'] = factor['cashRatio'].apply(
            lambda x: 1 if x > cash_limit else 0)

        financial_score_df['interestCoverageRatio'] = factor[
            'interestCoverageRatio'].apply(
                _interest_score,
                args=(threshold_dict['interestCoverageRatio'],))

        scores_df.loc[code,
                      'finance'] = (financial_score_df.T * weight).sum().sum()

        holder = findata.holders_factor.iloc[0][holder_columns]
        scores_df.loc[code, 'holders'] = holder.sum()

    return scores_df
Esempio n. 5
0
def save_financial_files():
    """Store the gpcw financial archives found in ``_CW_DIR`` into MongoDB.

    Only archives named ``gpcw<YYYYMMDD>.zip`` are considered, and among
    those only the ones modified after midnight of the day on which the most
    recently modified archive was touched. Each selected archive is read with
    ``HistoryFinancialReader`` and upserted row by row into the ``financial``
    collection, keyed by ``(code, report_date)``.

    Failures while reading or writing one archive are logged and do not stop
    the remaining archives from being processed.
    """
    coll = QA_DATABASE.financial
    coll.create_index([("code", ASCENDING), ("report_date", ASCENDING)],
                      unique=True)

    pattern = re.compile(r"^(gpcw)(?P<date>\d{8})\.zip")  # gpcw20210930.zip

    # (filename, report date as int, modification time) per matching archive.
    archives = []
    for filename in os.listdir(_CW_DIR):
        matched = pattern.match(filename)
        if matched is None:
            continue
        modified = pd.to_datetime(
            os.path.getmtime(os.path.join(_CW_DIR, filename)), unit='s')
        archives.append((filename, int(matched.group('date')), modified))

    if not archives:
        util_log_info('No gpcw archive found in {}'.format(_CW_DIR))
        return

    # Midnight of the day of the newest modification; only archives touched
    # after that instant are (re)imported.
    newest = max(modified for _, _, modified in archives)
    cutoff = pd.to_datetime(newest.strftime('%Y-%m-%d'))

    for filename, date, modified in archives:
        if not modified > cutoff:
            continue

        util_log_info('NOW SAVING {}'.format(date))
        util_log_info('在数据库中的条数 {}'.format(
            coll.count_documents({'report_date': date})))
        try:
            report_df = HistoryFinancialReader().get_df(
                os.path.join(_CW_DIR, filename))

            # Rename every column but the first to a zero-padded 3-character
            # code built from the text after the column's first 3 characters.
            renamed = {
                name: '00{}'.format(name[3:])[-3:]
                for name in report_df.columns.to_list()[1:]
            }
            report_df.rename(columns=renamed, inplace=True)

            data = util_to_json_from_pandas(
                report_df.reset_index().drop_duplicates(
                    subset=['code', 'report_date']).sort_index())
            util_log_info('即将更新的条数 {}'.format(len(data)))
            try:
                for d in data:
                    coll.update_one(
                        {
                            'code': d['code'],
                            'report_date': d['report_date']
                        }, {'$set': d},
                        upsert=True)
            except MemoryError:
                coll.insert_many(data, ordered=True)
            except pymongo.bulk.BulkWriteError:
                pass
            except Exception as e:
                # Previously swallowed silently; keep going but leave a trace.
                util_log_info('Failed to update {}: {!r}'.format(date, e))
        except Exception as e:
            util_log_info('似乎没有数据')
            util_log_info('Failed to import {}: {!r}'.format(filename, e))

    util_log_info('SUCCESSFULLY SAVE/UPDATE FINANCIAL DATA')
Esempio n. 6
0
def fetch_financial_report(code=None,
                           start=None,
                           end=None,
                           report_date=None,
                           ltype='EN',
                           db=QA_DATABASE):
    """
    Fetch financial reports from the ``financial`` collection.

    :param code: a stock code or a list of codes; None selects every code.
    :param start: first report date (only the first 10 characters of its
        string form are used); defaults to '1990-01-01' when only ``end``
        is given.
    :param end: last report date, same format; defaults to today when only
        ``start`` is given.
    :param report_date: a report date (8-digit int or date string) or a list
        of them. Ignored when ``start`` or ``end`` is given.
    :param ltype: how columns are named: 'EN' for the values of
        ``financial_dict``, 'CH' / 'CN' for the text after the first 3
        characters of its keys; any other value leaves the stored names.
    :param db: database object exposing the ``financial`` collection.
    :return: DataFrame indexed by (report_date, code); None when nothing
        matches or when ``start`` / ``end`` is not a valid date.
    """

    if isinstance(code, str):
        code = [code]
    if isinstance(report_date, str):
        report_date = [util_date_str2int(report_date)]
    elif isinstance(report_date, int):
        report_date = [report_date]
    elif isinstance(report_date, list):
        report_date = [util_date_str2int(item) for item in report_date]

    collection = db.financial
    num_columns = [item[:3] for item in list(financial_dict.keys())]
    CH_columns = [item[3:] for item in list(financial_dict.keys())]
    EN_columns = list(financial_dict.values())

    query = {}
    projection = {"_id": 0}

    if code is not None:
        query.update(code={'$in': code})

    if start or end:
        start = '1990-01-01' if start is None else str(start)[0:10]
        end = datetime.today().strftime(
            '%Y-%m-%d') if end is None else str(end)[0:10]

        if not util_date_valid(end):
            util_log_info('Something wrong with end date {}'.format(end))
            return None

        if not util_date_valid(start):
            util_log_info('Something wrong with start date {}'.format(start))
            return None

        query.update(report_date={
            "$lte": util_date_str2int(end),
            "$gte": util_date_str2int(start)
        })
    elif report_date is not None:
        query.update(report_date={'$in': report_date})

    collection.create_index([('report_date', -1), ('code', 1)])

    data = list(
        collection.find(
            filter=query,
            projection=projection,
            batch_size=10000,
        ))

    if len(data) == 0:
        return None

    res_pd = pd.DataFrame(data)

    # Equality, not identity: `ltype is 'EN'` only worked by accident of
    # string interning.
    if ltype in ['CH', 'CN']:
        display_names = CH_columns
    elif ltype == 'EN':
        display_names = EN_columns
    else:
        display_names = None

    if display_names is not None:
        name_map = dict(zip(num_columns, display_names))
        name_map['code'] = 'code'
        name_map['report_date'] = 'report_date'
        # A lambda (rather than passing the dict) keeps the KeyError on a
        # stored column that has no display name.
        res_pd.columns = res_pd.columns.map(lambda x: name_map[x])

    if res_pd.report_date.dtype == numpy.int64:
        res_pd.report_date = pd.to_datetime(
            res_pd.report_date.apply(util_date_int2str))
    else:
        res_pd.report_date = pd.to_datetime(res_pd.report_date)

    # -4.039810335e+34 is replaced by NaN, i.e. treated as a missing value.
    return res_pd.replace(-4.039810335e+34,
                          numpy.nan).set_index(['report_date', 'code'])
Esempio n. 7
0
def _get_sh_sz_list():
    """
    List the daily-bar files of the Shanghai and Shenzhen quote directories.

    Every file in ``<_SH_DIR>/lday`` must be named ``sh<6 digits>.day`` and
    every file in ``<_SZ_DIR>/lday`` ``sz<6 digits>.day``. Instruments are
    classified by ``sse_code_classify`` / ``szse_code_classify``.

    Code-prefix notes carried over from the original author (the actual
    classification is delegated to the two helpers above):

    'sse'   Shanghai Stock Exchange, tdx_code 'sh', 6-digit codes
            "60"                     A shares
            "90"                     B shares
            "00", "88", "99"         indices
            "50", "51"               funds
            "01", "10" .. "14"       bonds (overlap with Shenzhen)
            110 / 111 / 113          convertible bonds of 600 / 601 / 603
            118                      convertible bonds, STAR market

    'szse'  Shenzhen Stock Exchange, tdx_code 'sz', 6-digit codes
            "00", "30"               A shares
            "20"                     (not described in the original notes)
            "39"                     indices
            "15", "16"               funds
            "10" .. "14"             bonds (overlap with the other exchange)
            123 / 128 / 127          convertible bonds of 300 / 002 / 000

    :return: DataFrame with columns tdx_code, code, exchange, instrument,
        filename and last_modified (Shanghai rows first), or None (after
        logging) when a file name in either directory does not match.
    """
    def _scan(market_dir, pattern, exchange, classify):
        # Build the frame for one market; None if any file name is foreign.
        lday_dir = '{}{}{}'.format(market_dir, os.sep, 'lday')
        filenames = os.listdir(lday_dir)

        matches = [re.match(pattern, name) for name in filenames]
        if any(m is None for m in matches):
            # The old message ended in a stray '}', which made str.format
            # raise ValueError inside the error handler.
            util_log_info("{} can't be analyzed by pattern ({})".format(
                market_dir, pattern))
            return None

        # Explicit columns keep the frame usable when the directory is empty.
        frame = pd.DataFrame([m.groupdict() for m in matches],
                             columns=['tdx_code', 'code'])
        frame['exchange'] = exchange
        frame['instrument'] = frame.code.apply(classify)
        frame['filename'] = filenames
        frame['last_modified'] = frame['filename'].apply(
            lambda x: os.path.getmtime(os.path.join(lday_dir, x)))
        return frame

    sh_df = _scan(_SH_DIR, r"^(?P<tdx_code>sh)(?P<code>\d{6})\.day", 'sse',
                  sse_code_classify)
    if sh_df is None:
        return None

    sz_df = _scan(_SZ_DIR, r"^(?P<tdx_code>sz)(?P<code>\d{6})\.day", 'szse',
                  szse_code_classify)
    if sz_df is None:
        return None

    return pd.concat([sh_df, sz_df])
Esempio n. 8
0
def get_bar(code, start=None, end=None, freq='day', exchange=None):
    """
    Read bars for one security from the local TDX data files.

    Stock volume is expressed in lots of 100 shares.

    :param code: security code; upper-cased before the lookup.
    :param start: optional lower bound (inclusive) on the bar index.
    :param end: optional upper bound (inclusive) on the bar index.
    :param freq: frequency string understood by ``parse_frequency_str``.
        Daily and coarser frequencies read the daily file (weekly and
        coarser are then resampled); '1min' reads the 1-minute file;
        '5min', '30min' and '60min' read the 5-minute file.
        NOTE(review): no minute resampling happens here, so '30min' and
        '60min' currently return the raw 5-minute bars.
    :param exchange: optional exchange name used to disambiguate ``code``.
    :return: DataFrame of bars with extra 'date', 'code' and 'exchange'
        columns, or None when the security, the frequency or the file
        cannot be resolved, or the file holds no bars.
    :raises ValueError: propagated from ``parse_frequency_str``.
    """
    code = code.upper()
    standard_freq = parse_frequency_str(freq)

    try:
        tdx_code = _get_tdx_code_from_security_dataframe(code, exchange)
    except Exception:
        util_log_info("Can't get tdx_code from {}".format(code))
        return None
    if tdx_code is None:
        # Unknown code; the lookup helper has already logged it.
        return None

    is_daily = standard_freq in ('D', 'w', 'M', 'Q', 'Y')
    if is_daily:
        file_freq = 'D'
    elif standard_freq == '1min':
        # Must be tested before the 5-minute group: it used to be listed
        # there as well, which made this branch unreachable.
        file_freq = '1min'
    elif standard_freq in ('5min', '30min', '60min'):
        file_freq = '5min'
    else:
        util_log_info('Not supported frequency {}'.format(freq))
        return None

    file_path = _generate_path(code, file_freq, tdx_code)
    if file_path is None or not os.path.exists(file_path):
        util_log_info('=={}== {} file is not exists!'.format(code, file_path))
        return None

    # Pick the reader: minute files share one format, daily files differ
    # between the sh/sz markets and the extended markets.
    if not is_daily:
        reader = TdxLCMinBarReader()
    elif tdx_code in ('sh', 'sz'):
        reader = TdxDailyBarReader()
    else:
        reader = TdxExHqDailyBarReader()
    df = reader.get_df(file_path)

    if len(df) < 1:
        return None

    recorder = SECURITY_DATAFRAME.loc[code]

    if isinstance(recorder, pd.DataFrame):
        # Several listings share this code: keep the one read from disk.
        same_market = recorder.loc[recorder['tdx_code'] == tdx_code]
        instrument = same_market.loc[code, 'instrument']
        exchange = same_market.loc[code, 'exchange']
    else:
        instrument = recorder['instrument']
        exchange = recorder['exchange']

    if instrument in ['future', 'option']:
        df.rename(columns={
            'amount': "position",
            "jiesuan": "settle"
        },
                  inplace=True)

    if start:
        start = pd.to_datetime(start)
        df = df[df.index >= start]

    if end:
        end = pd.to_datetime(end)
        df = df[df.index <= end]

    # assign() returns a new frame, so nothing is written into a slice.
    df = df.assign(date=df.index, code=code, exchange=exchange)

    if standard_freq in ['w', 'M', 'Q', 'Y']:
        df = resample_from_daily_data(df, standard_freq)
    return df
Esempio n. 9
0
def _get_ds_list():
    """
    List the daily-bar files of the extended-market quote directory.

    Files in ``<_DS_DIR>/lday`` are named ``<tdx_code>#<code>.day`` where
    ``tdx_code`` is 1 to 3 digits, for example (from the original notes):

        47#TS2009.day    future
        7#IO760795.day   option
        5#V 7C0D49.day   option; the code contains a space
        102#980001.day   index (market 102)

    ``tdx_code`` is mapped to an exchange and instrument type through the
    table below; unknown codes get None for both.

    :return: DataFrame with columns tdx_code, code, exchange, instrument,
        filename and last_modified, or None (after logging) when a file name
        does not match the pattern.
    """
    DS_CODE_TO_TYPE = {
        '4': {'exchange': 'czce', 'instrument': 'option'},
        '5': {'exchange': 'dce', 'instrument': 'option'},
        '6': {'exchange': 'shfe', 'instrument': 'option'},
        '7': {'exchange': 'cffex', 'instrument': 'option'},
        '8': {'exchange': 'sse', 'instrument': 'option'},
        '9': {'exchange': 'szse', 'instrument': 'option'},
        # Hong Kong indices
        '27': {'exchange': 'hkse', 'instrument': 'index'},
        '28': {'exchange': 'czce', 'instrument': 'future'},
        '29': {'exchange': 'dce', 'instrument': 'future'},
        '30': {'exchange': 'shfe', 'instrument': 'future'},
        # Hong Kong main board
        '31': {'exchange': 'hkse', 'instrument': 'stock'},
        # open-ended funds
        '33': {'exchange': 'sse szse', 'instrument': 'OEF'},
        # money-market funds
        '34': {'exchange': 'sse szse', 'instrument': 'MMF'},
        # NEEQ share-transfer system
        '44': {'exchange': 'neeq', 'instrument': 'stock'},
        '47': {'exchange': 'cffex', 'instrument': 'future'},
        # Hong Kong growth enterprise market
        '48': {'exchange': 'hkse', 'instrument': 'stock'},
        # Hong Kong trust funds
        '49': {'exchange': 'hkse', 'instrument': 'TF'},
        # CSI indices
        '62': {'exchange': 'csindex', 'instrument': 'index'},
        # Hong Kong Stock Connect instruments
        '71': {'exchange': 'hkconnect', 'instrument': 'stock'},
        '102': {'exchange': 'sse szse', 'instrument': 'index'},
    }
    ds_dir = '{}{}{}'.format(_DS_DIR, os.sep, 'lday')
    ds_list = os.listdir(ds_dir)

    pattern = r"^(?P<tdx_code>\d{1,3})#(?P<code>.+)\.day"
    matches = [re.match(pattern, name) for name in ds_list]
    if any(m is None for m in matches):
        # The old message ended in a stray '}', which made str.format raise
        # ValueError inside the error handler.
        util_log_info("{} can't be analyzed by pattern ({})".format(
            _DS_DIR, pattern))
        return None

    # Explicit columns keep the frame usable when the directory is empty.
    ds_df = pd.DataFrame([m.groupdict() for m in matches],
                         columns=['tdx_code', 'code'])

    ds_df['exchange'] = ds_df.tdx_code.apply(
        lambda x: DS_CODE_TO_TYPE.get(x, {}).get('exchange'))
    ds_df['instrument'] = ds_df.tdx_code.apply(
        lambda x: DS_CODE_TO_TYPE.get(x, {}).get('instrument'))

    ds_df['filename'] = ds_list
    ds_df['last_modified'] = ds_df['filename'].apply(
        lambda x: os.path.getmtime(os.path.join(ds_dir, x)))

    return ds_df