def _generate_path(code, freq, tdx_code):
    """Build the path of the TDX data file for one security and frequency.

    Args:
        code: Security code exactly as it appears in the file name.
        freq: One of 'D', '5min' or '1min'.
        tdx_code: 'sz' or 'sh' select _SZ_DIR / _SH_DIR and are prefixed
            directly to the code; any other value selects _DS_DIR and is
            joined to the code with '#'.

    Returns:
        The file path, or None (after logging) when ``freq`` is not one of
        the supported keys.
    """
    extensions = {
        'D': '.day',
        '5min': '.lc5',
        '1min': '.lc1',
    }
    # Sub-directory per frequency. The 1-minute directory is 'minline',
    # without a leading dot, parallel to 'lday' and 'fzline'.
    sub_dirs = {
        'D': 'lday',
        '5min': 'fzline',
        '1min': 'minline',
    }

    try:
        extension = extensions[freq]
        sub_dir = sub_dirs[freq]
    except KeyError:
        util_log_info('Not supported Frequency {}!'.format(freq))
        return None

    if tdx_code == 'sz':
        root_dir = _SZ_DIR
        filename = tdx_code + code + extension
    elif tdx_code == 'sh':
        root_dir = _SH_DIR
        filename = tdx_code + code + extension
    else:
        root_dir = _DS_DIR
        filename = tdx_code + '#' + code + extension

    dir_name = '{}{}{}'.format(root_dir, os.sep, sub_dir)
    return os.path.join(dir_name, filename)
def _get_tdx_code_from_security_dataframe(code, exchange):
    """Look up the ``tdx_code`` of ``code`` in SECURITY_DATAFRAME.

    Args:
        code: Index label to look up in SECURITY_DATAFRAME.
        exchange: Value compared against the 'exchange' column to pick one
            row when the label occurs more than once.

    Returns:
        The ``tdx_code`` value, or None (after logging) when the label is
        not present. When the label is duplicated and ``exchange`` does not
        select exactly one row, a message is logged and the ``tdx_code`` of
        the first row is returned.
    """
    try:
        recorder = SECURITY_DATAFRAME.loc[code]
    except (KeyError, TypeError):
        util_log_info("Can't get tdx_code from {}".format(code))
        return None

    # A unique label yields a single row (Series); a duplicated label
    # yields a DataFrame holding every row that carries it.
    if isinstance(recorder, pd.Series):
        return recorder['tdx_code']

    matched = recorder.loc[recorder['exchange'] == exchange, 'tdx_code']
    if len(matched) == 1:
        return matched.iloc[0]

    util_log_info(
        'Not only one {} in the list , please provide exchange or instrument'
        .format(code))
    # Positional access: every row shares the same index label, so a
    # label-based ``[0]`` lookup would be ambiguous.
    return recorder['tdx_code'].iloc[0]
def parse_frequency_str(freq: str):
    """Normalize a frequency string such as '5min', '2h' or 'day'.

    The string is an optional integer followed by a unit. Recognized units:

    * 'Y', 'y', 'year'
    * 'Q', 'q', 'quarter'
    * 'M', 'month'
    * 'W', 'w', 'weeks', 'week'
    * 'D', 'd', 'days', 'day'
    * 'H', 'hours', 'hour', 'hr', 'h'
    * 'm', 'minute', 'min', 'minutes', 'T'

    Only the leading ``<number><letters>`` part of the string is examined.

    Args:
        freq: Frequency string to parse.

    Returns:
        'Y', 'Q', 'M', 'w' or 'D' for calendar units (the number is
        ignored), or '<n>min' for hour and minute units, where hours are
        converted to minutes and a missing number counts as 1.

    Raises:
        ValueError: If ``freq`` does not match the expected format or the
            unit is not recognized.
    """
    pattern = r"^(?P<number>\d*)(?P<unit>[a-zA-Z]+)"
    try:
        matched = re.match(pattern, freq)
    except TypeError:
        # Not a string at all.
        matched = None
    if matched is None:
        util_log_info('Wrong frequency format: {}'.format(freq))
        raise ValueError('Wrong frequency format: {}'.format(freq))

    number = matched.group('number')
    unit = matched.group('unit')
    # The regex captures the number as text; convert before doing arithmetic.
    multiple = int(number) if number else 1

    if unit in ('Y', 'y', 'year'):
        return 'Y'
    if unit in ('Q', 'q', 'quarter'):
        return 'Q'
    if unit in ('M', 'month'):
        return 'M'
    if unit in ('W', 'w', 'weeks', 'week'):
        return 'w'
    if unit in ('D', 'd', 'days', 'day'):
        return 'D'
    if unit in ('H', 'hours', 'hour', 'hr', 'h'):
        return '{}min'.format(multiple * 60)
    if unit in ('m', 'minute', 'min', 'minutes', 'T'):
        return '{}min'.format(multiple)

    util_log_info('Wrong frequency format: {}'.format(freq))
    raise ValueError('Wrong frequency format: {}'.format(freq))
def _score_against_floor(value, floor):
    """Return 1 when ``value`` exceeds ``floor``, 0 when merely positive, else -1."""
    if value > floor:
        return 1
    return 0 if value > 0 else -1


def _score_growth(value, floor):
    """Like _score_against_floor, but a value at or below ``-floor`` scores -2."""
    if value > floor:
        return 1
    if value > 0:
        return 0
    return -1 if value > -floor else -2


def get_financial_scores():
    """Score every company from the financial reports of the last four years.

    For each code, up to 12 rows of ``FinancialStruct(...).financial_factor``
    are scored against ``threshold_dict`` and combined with linearly
    decreasing weights (the first row weighs most; the weights sum to 10).
    The 'holders' score is the sum of the institutional share ratios in the
    first row of ``holders_factor``.

    Returns:
        DataFrame indexed by code with columns 'finance' and 'holders'. It
        is empty when no financial report could be fetched.
    """
    today = datetime.today()
    year = today.year - 4
    start = datetime(year, today.month, today.day).strftime('%Y-%m-%d')
    total_reports_df = fetch_financial_report(start=start)

    score_columns = ['finance', 'holders']
    if total_reports_df is None:
        # fetch_financial_report returns None when nothing matches.
        return pd.DataFrame(columns=score_columns)

    code_list = total_reports_df.index.get_level_values(
        level=1).drop_duplicates()
    scores_df = pd.DataFrame(index=code_list, columns=score_columns)

    floor_factors = ('ROIC', 'grossProfitMargin', 'netProfitMargin',
                     'netProfitCashRatio')
    growth_factors = ('operatingIncomeGrowth', 'continuedProfitGrowth')
    holder_columns = [
        'QFIISharesRatio', 'brokerSharesRatio', 'securitySharesRatio',
        'fundsSharesRatio', 'socialSecuritySharesRatio',
        'privateEquitySharesRatio', 'financialCompanySharesRatio',
        'pensionInsuranceAgencySharesRatio'
    ]

    for code in code_list:
        try:
            df = total_reports_df.loc[(slice(None), code), :]
        except (KeyError, TypeError):
            continue
        util_log_info("Calculate {} financial scores!".format(code))

        findata = FinancialStruct(df)
        length = min(len(df), 12)
        factor = findata.financial_factor.iloc[:length].reset_index(
            level=1, drop=True)

        # Weights length, length-1, ..., 1 scaled so that they sum to 10.
        weight_list = list(range(length, 0, -1))
        weight = pd.Series(weight_list, name='weight') * 10 / sum(weight_list)
        weight.index = factor.index

        financial_score_df = pd.DataFrame(index=factor.index,
                                          columns=factor.columns)
        for name in floor_factors:
            financial_score_df[name] = factor[name].apply(
                lambda x: _score_against_floor(x, threshold_dict[name]))
        for name in growth_factors:
            financial_score_df[name] = factor[name].apply(
                lambda x: _score_growth(x, threshold_dict[name]))

        financial_score_df['assetsLiabilitiesRatio'] = factor[
            'assetsLiabilitiesRatio'].apply(
                lambda x: 1
                if x < threshold_dict['assetsLiabilitiesRatio'] else 0)
        financial_score_df['cashRatio'] = factor['cashRatio'].apply(
            lambda x: 1 if x > threshold_dict['cashRatio'] else 0)
        financial_score_df['inventoryRatio'] = factor['inventoryRatio'].apply(
            lambda x: 1 if x < threshold_dict['inventoryRatio'] else 0)
        # NOTE: values below the lower bound (including negative ones) also
        # score 1 here, which is questionable but rarely matters in practice.
        financial_score_df['interestCoverageRatio'] = factor[
            'interestCoverageRatio'].apply(
                lambda x: 0 if (threshold_dict['interestCoverageRatio'][0] < x
                                < threshold_dict['interestCoverageRatio'][1])
                else 1)

        # Transpose so that the columns are the report rows, then weight
        # each row and add everything up.
        scores_df.loc[code, 'finance'] = (financial_score_df.T *
                                          weight).sum().sum()

        holder = findata.holders_factor.iloc[0][holder_columns]
        scores_df.loc[code, 'holders'] = holder.sum()

    return scores_df
def save_financial_files():
    """Store the TDX ``gpcw`` financial archives found in _CW_DIR in MongoDB.

    Only archives modified after midnight of the most recent modification
    day in the directory are processed. Each record is upserted into
    ``QA_DATABASE.financial`` keyed by (code, report_date).
    """
    coll = QA_DATABASE.financial
    coll.create_index([("code", ASCENDING), ("report_date", ASCENDING)],
                      unique=True)

    pattern = r"^(gpcw)(?P<date>\d{8})\.zip"  # e.g. gpcw20210930.zip
    files = pd.DataFrame(os.listdir(_CW_DIR), columns=['filename'])
    files['re'] = files['filename'].apply(lambda x: re.match(pattern, x))
    # Names that do not match the pattern carry None and are dropped here.
    files = files.dropna()
    if files.empty:
        # Nothing to import; taking the newest entry below would fail.
        util_log_info('No gpcw financial file found in {}'.format(_CW_DIR))
        return

    files['date'] = files['re'].apply(lambda x: int(x.groupdict()['date']))
    files['last_modified'] = files['filename'].apply(
        lambda x: pd.to_datetime(os.path.getmtime(os.path.join(_CW_DIR, x)),
                                 unit='s'))

    # Midnight of the day on which the newest archive was modified.
    last_modified = files.sort_values(
        by='last_modified', ascending=[False])['last_modified'].iloc[0]
    last_modified = pd.to_datetime(last_modified.strftime('%Y-%m-%d'))
    recent = files[files['last_modified'] > last_modified]

    # to_list() yields plain Python ints for the report dates.
    for filename, date in zip(recent['filename'].to_list(),
                              recent['date'].to_list()):
        util_log_info('NOW SAVING {}'.format(date))
        util_log_info('在数据库中的条数 {}'.format(
            coll.count_documents({'report_date': date})))
        try:
            report_df = HistoryFinancialReader().get_df(
                os.path.join(_CW_DIR, filename))
            # Rename every column but the first to a zero-padded
            # three-character key taken from the text after its first
            # three characters.
            renamed = {
                name: '00{}'.format(name[3:])[-3:]
                for name in report_df.columns.to_list()[1:]
            }
            report_df.rename(columns=renamed, inplace=True)
            data = util_to_json_from_pandas(
                report_df.reset_index().drop_duplicates(
                    subset=['code', 'report_date']).sort_index())
            util_log_info('即将更新的条数 {}'.format(len(data)))
            try:
                for d in data:
                    coll.update_one(
                        {
                            'code': d['code'],
                            'report_date': d['report_date']
                        }, {'$set': d},
                        upsert=True)
            except MemoryError:
                coll.insert_many(data, ordered=True)
            except pymongo.bulk.BulkWriteError:
                pass
            # Any other error reaches the handler below and is logged
            # instead of being dropped silently.
        except Exception:
            util_log_info('似乎没有数据')

    util_log_info('SUCCESSFULLY SAVE/UPDATE FINANCIAL DATA')
def fetch_financial_report(code=None,
                           start=None,
                           end=None,
                           report_date=None,
                           ltype='EN',
                           db=QA_DATABASE):
    """Fetch financial reports from the ``financial`` collection of ``db``.

    Args:
        code: A stock code or a list of codes; None selects every code.
        start: First report date (only the first 10 characters of its
            string form are used). Defaults to '1990-01-01' when only
            ``end`` is given.
        end: Last report date, handled like ``start``. Defaults to today
            when only ``start`` is given.
        report_date: An 8-digit date (int or str) or a list of them. Used
            only when neither ``start`` nor ``end`` is given.
        ltype: Column naming. 'EN' maps the stored keys to the values of
            ``financial_dict``; 'CH' or 'CN' maps them to the text after
            the first three characters of its keys; any other value keeps
            the stored keys.
        db: Database handle exposing a ``financial`` collection.

    Returns:
        DataFrame indexed by (report_date, code), or None when ``start`` or
        ``end`` is not a valid date or no document matches.
    """
    if isinstance(code, str):
        code = [code]

    if isinstance(report_date, str):
        report_date = [util_date_str2int(report_date)]
    elif isinstance(report_date, int):
        report_date = [report_date]
    elif isinstance(report_date, list):
        report_date = [util_date_str2int(item) for item in report_date]

    collection = db.financial
    query = {}
    if code is not None:
        query['code'] = {'$in': code}

    if start or end:
        start = '1990-01-01' if start is None else str(start)[0:10]
        end = datetime.today().strftime(
            '%Y-%m-%d') if end is None else str(end)[0:10]
        if not util_date_valid(end):
            util_log_info('Something wrong with end date {}'.format(end))
            return None
        if not util_date_valid(start):
            util_log_info('Something wrong with start date {}'.format(start))
            return None
        query['report_date'] = {
            "$lte": util_date_str2int(end),
            "$gte": util_date_str2int(start)
        }
    elif report_date is not None:
        query['report_date'] = {'$in': report_date}

    collection.create_index([('report_date', -1), ('code', 1)])
    data = list(
        collection.find(filter=query,
                        projection={"_id": 0},
                        batch_size=10000))
    if not data:
        return None

    res_pd = pd.DataFrame(data)

    # The first three characters of each financial_dict key are the stored
    # column name; the rest of the key and the value are the display names.
    if ltype in ('CH', 'CN'):
        names = {key[:3]: key[3:] for key in financial_dict}
    elif ltype == 'EN':
        names = {key[:3]: value for key, value in financial_dict.items()}
    else:
        names = None
    if names is not None:
        names['code'] = 'code'
        names['report_date'] = 'report_date'
        res_pd.columns = res_pd.columns.map(lambda x: names[x])

    if res_pd.report_date.dtype == numpy.int64:
        res_pd.report_date = pd.to_datetime(
            res_pd.report_date.apply(util_date_int2str))
    else:
        res_pd.report_date = pd.to_datetime(res_pd.report_date)

    # -4.039810335e+34 is replaced by NaN; presumably it is the source
    # files' marker for a missing value (TODO confirm).
    return res_pd.replace(-4.039810335e+34,
                          numpy.nan).set_index(['report_date', 'code'])
def _read_lday_listing(root_dir, pattern, exchange, classify):
    """List ``<root_dir>/lday`` and describe every file, or return None on a bad name."""
    day_dir = '{}{}{}'.format(root_dir, os.sep, 'lday')
    names = os.listdir(day_dir)
    matches = [re.match(pattern, name) for name in names]
    if any(match is None for match in matches):
        util_log_info("{} can't be analyzed by pattern ({})".format(
            root_dir, pattern))
        return None

    # Explicit columns keep the frame usable when the directory is empty.
    listing = pd.DataFrame([match.groupdict() for match in matches],
                           columns=['tdx_code', 'code'])
    listing['exchange'] = exchange
    listing['instrument'] = listing.code.apply(classify)
    listing['filename'] = names
    listing['last_modified'] = listing['filename'].apply(
        lambda x: os.path.getmtime(os.path.join(day_dir, x)))
    return listing


def _get_sh_sz_list():
    """List the daily files of the Shanghai and Shenzhen quote directories.

    Each file name (for example ``sh000015.day``) is split into its market
    prefix and 6-digit code, and the code is classified with
    ``sse_code_classify`` / ``szse_code_classify``.

    Notes on the first digits of the 6-digit codes:

    'sse' (Shanghai Stock Exchange, prefix ``sh``):
        "60" A shares; "90" B shares; "00", "88", "99" indices;
        "50", "51" funds; "01", "10", "11", "12", "13", "14" bonds
        (overlapping with Shenzhen);
        110 convertible bonds of 600 stocks, 111 of 601 stocks,
        113 of 603 stocks, 118 of STAR Market stocks.

    'szse' (Shenzhen Stock Exchange, prefix ``sz``):
        "00", "30" A shares; "20", "39" indices; "15", "16" funds;
        "10", "11", "12", "13", "14" bonds (overlapping with the other
        exchange);
        123 convertible bonds of 300 stocks, 128 of 002 stocks,
        127 of 000 stocks.

    Returns:
        DataFrame with columns tdx_code, code, exchange, instrument,
        filename and last_modified (Shanghai rows first), or None (after
        logging) when a file name in either directory does not match the
        expected pattern.
    """
    sh_df = _read_lday_listing(_SH_DIR,
                               r"^(?P<tdx_code>sh)(?P<code>\d{6})\.day",
                               'sse', sse_code_classify)
    if sh_df is None:
        return None

    sz_df = _read_lday_listing(_SZ_DIR,
                               r"^(?P<tdx_code>sz)(?P<code>\d{6})\.day",
                               'szse', szse_code_classify)
    if sz_df is None:
        return None

    return pd.concat([sh_df, sz_df])
def get_bar(code, start=None, end=None, freq='day', exchange=None):
    """Read the bars of one security from the local TDX data files.

    Stock volume is expressed in units of 100 shares.

    Args:
        code: Security code; it is upper-cased before the lookup.
        start: Optional first timestamp to keep (anything accepted by
            ``pd.to_datetime``).
        end: Optional last timestamp to keep.
        freq: Frequency string understood by ``parse_frequency_str``.
            Weekly and longer frequencies are resampled from the daily
            file. '1min' reads the 1-minute file; '5min', '30min' and
            '60min' all read the 5-minute file as it is stored.
        exchange: Used to pick a row when the code is listed more than
            once in SECURITY_DATAFRAME.

    Returns:
        DataFrame of bars with added 'date', 'code' and 'exchange'
        columns, or None when the code is unknown, the frequency is not
        supported, the data file is missing or the file holds no rows.

    Raises:
        ValueError: Propagated from ``parse_frequency_str`` for a malformed
            frequency string.
    """
    code = code.upper()
    standard_freq = parse_frequency_str(freq)

    try:
        tdx_code = _get_tdx_code_from_security_dataframe(code, exchange)
    except Exception:
        util_log_info("Can't get tdx_code from {}".format(code))
        return None
    if tdx_code is None:
        # The lookup already logged that the code is unknown.
        return None

    is_daily = standard_freq in ('D', 'w', 'M', 'Q', 'Y')
    if is_daily:
        file_path = _generate_path(code, 'D', tdx_code)
    elif standard_freq == '1min':
        # Checked before the 5-minute group so that it is reachable.
        file_path = _generate_path(code, '1min', tdx_code)
    elif standard_freq in ('5min', '30min', '60min'):
        file_path = _generate_path(code, '5min', tdx_code)
    else:
        util_log_info('Not supported frequency {}'.format(freq))
        return None

    if not os.path.exists(file_path):
        util_log_info('=={}== {} file is not exists!'.format(code, file_path))
        return None

    # Pick the reader matching the file type.
    if not is_daily:
        reader = TdxLCMinBarReader()
    elif tdx_code in ('sh', 'sz'):
        reader = TdxDailyBarReader()
    else:
        reader = TdxExHqDailyBarReader()
    df = reader.get_df(file_path)

    if len(df) < 1:
        return None

    recorder = SECURITY_DATAFRAME.loc[code]
    if isinstance(recorder, pd.DataFrame):
        # Duplicated code: keep the first row of the resolved market.
        recorder = recorder.loc[recorder['tdx_code'] == tdx_code].iloc[0]
    instrument = recorder['instrument']
    exchange = recorder['exchange']

    if instrument in ['future', 'option']:
        df = df.rename(columns={'amount': "position", "jiesuan": "settle"})

    if start:
        start = pd.to_datetime(start)
        df = df[df.index >= start]

    if end:
        end = pd.to_datetime(end)
        df = df[df.index <= end]

    # assign() returns a new frame, so a filtered slice is never mutated.
    df = df.assign(date=df.index, code=code, exchange=exchange)

    if standard_freq in ('w', 'M', 'Q', 'Y'):
        df = resample_from_daily_data(df, standard_freq)

    return df
def _get_ds_list():
    """List the daily files of the extended-market quote directory.

    File names have the form ``<market number>#<code>.day``, for example:

    * ``47#TS2009.day``   future
    * ``7#IO760795.day``  option
    * ``5#V 7C0D49.day``  option whose code contains a space
    * ``102#980001.day``  CNI index (market 102)

    The market number is mapped to an exchange and an instrument type with
    the table below; unknown numbers get None for both.

    Returns:
        DataFrame with columns tdx_code, code, exchange, instrument,
        filename and last_modified, or None (after logging) when a file
        name does not match the expected pattern.
    """
    DS_CODE_TO_TYPE = {
        '4': {'exchange': 'czce', 'instrument': 'option'},
        '5': {'exchange': 'dce', 'instrument': 'option'},
        '6': {'exchange': 'shfe', 'instrument': 'option'},
        '7': {'exchange': 'cffex', 'instrument': 'option'},
        '8': {'exchange': 'sse', 'instrument': 'option'},
        '9': {'exchange': 'szse', 'instrument': 'option'},
        # Hong Kong indices
        '27': {'exchange': 'hkse', 'instrument': 'index'},
        '28': {'exchange': 'czce', 'instrument': 'future'},
        '29': {'exchange': 'dce', 'instrument': 'future'},
        '30': {'exchange': 'shfe', 'instrument': 'future'},
        # Hong Kong main board
        '31': {'exchange': 'hkse', 'instrument': 'stock'},
        # Open-ended funds
        '33': {'exchange': 'sse szse', 'instrument': 'OEF'},
        # Money-market funds
        '34': {'exchange': 'sse szse', 'instrument': 'MMF'},
        # NEEQ (National Equities Exchange and Quotations)
        '44': {'exchange': 'neeq', 'instrument': 'stock'},
        '47': {'exchange': 'cffex', 'instrument': 'future'},
        # Hong Kong GEM board
        '48': {'exchange': 'hkse', 'instrument': 'stock'},
        # Hong Kong trust funds
        '49': {'exchange': 'hkse', 'instrument': 'TF'},
        # CSI indices
        '62': {'exchange': 'csindex', 'instrument': 'index'},
        # Hong Kong Stock Connect securities
        '71': {'exchange': 'hkconnect', 'instrument': 'stock'},
        '102': {'exchange': 'sse szse', 'instrument': 'index'},
    }
    unknown_market = {'exchange': None, 'instrument': None}

    ds_dir = '{}{}{}'.format(_DS_DIR, os.sep, 'lday')
    ds_list = os.listdir(ds_dir)
    pattern = r"^(?P<tdx_code>\d{1,3})#(?P<code>.+)\.day"
    matches = [re.match(pattern, name) for name in ds_list]
    if any(match is None for match in matches):
        # To find the offending names, log ds_list[i] for every i whose
        # match is None.
        util_log_info("{} can't be analyzed by pattern ({})".format(
            _DS_DIR, pattern))
        return None

    # Explicit columns keep the frame usable when the directory is empty.
    ds_df = pd.DataFrame([match.groupdict() for match in matches],
                         columns=['tdx_code', 'code'])
    ds_df['exchange'] = ds_df.tdx_code.apply(
        lambda x: DS_CODE_TO_TYPE.get(x, unknown_market)['exchange'])
    ds_df['instrument'] = ds_df.tdx_code.apply(
        lambda x: DS_CODE_TO_TYPE.get(x, unknown_market)['instrument'])
    ds_df['filename'] = ds_list
    ds_df['last_modified'] = ds_df['filename'].apply(
        lambda x: os.path.getmtime(os.path.join(ds_dir, x)))
    return ds_df