def __init__(self,
             entity_type: EntityType = EntityType.Stock,
             exchanges=None,
             entity_ids=None,
             codes=None,
             batch_size=10,
             force_update=False,
             sleeping_time=5,
             default_size=findy_config['batch_size'],
             real_time=False,
             fix_duplicate_way='add',
             start_timestamp=None,
             end_timestamp=None,
             close_hour=0,
             close_minute=0,
             share_para=None) -> None:
    self.default_size = default_size
    self.real_time = real_time
    self.close_hour = close_hour
    self.close_minute = close_minute
    self.fix_duplicate_way = fix_duplicate_way
    self.start_timestamp = to_pd_timestamp(start_timestamp)
    self.end_timestamp = to_pd_timestamp(end_timestamp)

    super().__init__(entity_type, exchanges, entity_ids, codes, batch_size,
                     force_update, sleeping_time, share_para=share_para)
def format(self, content, exchange):
    df = pd.DataFrame(content)
    if df is not None:
        df.rename(columns={'symbol': 'code',
                           'ipoyear': 'list_date',
                           'marketCap': 'market_cap'},
                  inplace=True)

        timestamp_str = self.get_original_time_field()
        df[timestamp_str] = df[timestamp_str].apply(lambda x: to_pd_timestamp(x))
        df.fillna({timestamp_str: to_pd_timestamp('1980-01-01')}, inplace=True)

        df['timestamp'] = df[timestamp_str]
        df['entity_type'] = EntityType.Stock.value
        df['exchange'] = exchange
        df['is_active'] = True
        df['code'] = df['code'].str.strip()
        df['id'] = self.generate_domain_id(exchange, df)
        df['entity_id'] = df['id']
        df.drop_duplicates(subset=['id'], keep='last', inplace=True)

    return df
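# Illustrative input (comments only; field names follow the rename map above):
#   [{'symbol': 'AAPL ', 'ipoyear': '1980-12-12', 'marketCap': 2.0e12}, ...]
# comes out with code/list_date/market_cap renamed, codes stripped of
# whitespace, missing list dates backfilled to 1980-01-01, and id/entity_id
# generated per exchange.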
def format(self, entity, df):
    df.rename(columns={'symbol': 'code',
                       'market': 'sector',
                       'province': 'state',
                       'employees': 'fulltime_employees',
                       'reg_capital': 'market_cap',
                       'setup_date': 'date_of_establishment'},
              inplace=True)

    timestamp_str = self.get_original_time_field()
    df[timestamp_str] = df[timestamp_str].apply(lambda x: to_pd_timestamp(x))
    df.fillna({timestamp_str: to_pd_timestamp('1980-01-01')}, inplace=True)

    df['timestamp'] = df[timestamp_str]
    df['entity_type'] = EntityType.StockDetail.value
    df['code'] = df['code'].str.strip()
    df['id'] = self.generate_domain_id(entity, df)
    df['entity_id'] = df['id']
    df.drop_duplicates(subset=['id'], keep='last', inplace=True)
    return df
def init_timestamps(self, entity, http_session):
    param = {
        "color": "w",
        "fc": get_fc(entity),
        "DataType": self.data_type
    }

    if self.finance_report_type in ('LiRunBiaoList', 'XianJinLiuLiangBiaoList'):
        param['ReportType'] = 1

    timestamp_json_list = call_eastmoney_api(
        http_session,
        url=self.timestamps_fetching_url,
        path_fields=self.timestamp_list_path_fields,
        params=param)

    if timestamp_json_list is not None and self.timestamp_path_fields:
        timestamps = [get_from_path_fields(data, self.timestamp_path_fields)
                      for data in timestamp_json_list]
    else:
        return []

    return [to_pd_timestamp(t) for t in timestamps]
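# Illustrative result (comments only): each element of the JSON list carries
# a report date at self.timestamp_path_fields; the method returns those dates
# as pandas Timestamps, e.g. [Timestamp('2021-03-31'), Timestamp('2020-12-31')].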
def eval_size_of_timestamp(self,
                           start_timestamp: pd.Timestamp,
                           end_timestamp: pd.Timestamp,
                           level: IntervalLevel,
                           one_day_trading_minutes):
    assert end_timestamp is not None

    time_delta = end_timestamp - to_pd_timestamp(start_timestamp)
    one_day_trading_seconds = one_day_trading_minutes * 60

    if level == IntervalLevel.LEVEL_1DAY:
        return time_delta.days
    if level == IntervalLevel.LEVEL_1WEEK:
        return int(math.ceil(time_delta.days / 7))
    if level == IntervalLevel.LEVEL_1MON:
        return int(math.ceil(time_delta.days / 30))

    if time_delta.days > 0:
        seconds = (time_delta.days + 1) * one_day_trading_seconds
        return int(math.ceil(seconds / level.to_second()))
    else:
        seconds = time_delta.total_seconds()
        return min(int(math.ceil(seconds / level.to_second())),
                   one_day_trading_seconds / level.to_second())
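# Illustrative sizing (comments only; the 5-minute level is a hypothetical
# member with to_second() == 300): for a 10-day span at LEVEL_1DAY the size is
# time_delta.days == 10; for a 2-hour intraday span at the 5-minute level with
# 240 trading minutes per day, size == min(ceil(7200 / 300), 14400 / 300)
# == min(24, 48) == 24.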
async def persist_index(self, df) -> None:
    df['timestamp'] = df['timestamp'].apply(lambda x: to_pd_timestamp(x))
    df['list_date'] = df['list_date'].apply(lambda x: to_pd_timestamp(x))
    df['id'] = df['code'].apply(lambda code: f'index_cn_{code}')
    df['entity_id'] = df['id']
    df['exchange'] = 'cn'
    df['entity_type'] = EntityType.Index.value

    df = df.dropna(axis=0, how='any')
    df = df.drop_duplicates(subset='id', keep='last')

    db_session = get_db_session(self.region, self.provider, Index)
    await df_to_db(region=self.region,
                   provider=self.provider,
                   data_schema=Index,
                   db_session=db_session,
                   df=df)
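# Illustrative id layout (comments only): a row with code '000001' persists
# with id == entity_id == 'index_cn_000001', exchange 'cn', and entity_type
# set to EntityType.Index.value.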
async def init_main_index(region: Region, provider=Provider.Exchange):
    if region == Region.CHN:
        for item in CHINA_STOCK_MAIN_INDEX:
            item['timestamp'] = to_pd_timestamp(item['timestamp'])
        df = pd.DataFrame(CHINA_STOCK_MAIN_INDEX)
    elif region == Region.US:
        for item in US_STOCK_MAIN_INDEX:
            item['timestamp'] = to_pd_timestamp(item['timestamp'])
        df = pd.DataFrame(US_STOCK_MAIN_INDEX)
    else:
        print("index not initialized, in file: init_main_index")
        df = pd.DataFrame()

    if pd_valid(df):
        await df_to_db(region=region,
                       provider=provider,
                       data_schema=Index,
                       db_session=get_db_session(region, provider, Index),
                       df=df)
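# Illustrative bootstrap (comments only): seed the main-index table per region.
#   await init_main_index(Region.CHN)  # China main indices, default provider
#   await init_main_index(Region.US)   # US main indices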
def common_filter(query: Query,
                  data_schema,
                  ids: List[str] = None,
                  entity_ids: List[str] = None,
                  entity_id: str = None,
                  codes: List[str] = None,
                  code: str = None,
                  start_timestamp=None,
                  end_timestamp=None,
                  filters=None,
                  order=None,
                  limit: int = None,
                  time_field='timestamp'):
    assert data_schema is not None

    if entity_id is not None:
        query = query.filter(data_schema.entity_id == entity_id)
    if entity_ids is not None:
        query = query.filter(data_schema.entity_id.in_(entity_ids))
    if code is not None:
        query = query.filter(data_schema.code == code)
    if codes is not None:
        query = query.filter(data_schema.code.in_(codes))
    if ids is not None:
        query = query.filter(data_schema.id.in_(ids))

    # getattr is safer and clearer than eval for looking up the time column
    time_col = getattr(data_schema, time_field)
    if start_timestamp:
        query = query.filter(time_col >= to_pd_timestamp(start_timestamp))
    if end_timestamp:
        query = query.filter(time_col <= to_pd_timestamp(end_timestamp))

    if filters:
        for fltr in filters:
            query = query.filter(fltr)
    if order is not None:
        query = query.order_by(order)
    if limit:
        query = query.limit(limit)
    return query
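# Illustrative usage (Stock1dKdata is a hypothetical schema standing in for
# any mapped class; not defined in this module): narrow a SQLAlchemy query to
# one code within a date range.
#
#   query = db_session.query(Stock1dKdata)
#   query = common_filter(query, Stock1dKdata,
#                         code='600000',
#                         start_timestamp='2021-01-01',
#                         end_timestamp='2021-12-31',
#                         order=Stock1dKdata.timestamp.asc(),
#                         limit=100)
#   records = query.all()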
def to_report_period_type(report_date):
    the_date = to_pd_timestamp(report_date)
    if the_date.month == 3 and the_date.day == 31:
        return ReportPeriod.season1.value
    if the_date.month == 6 and the_date.day == 30:
        return ReportPeriod.half_year.value
    if the_date.month == 9 and the_date.day == 30:
        return ReportPeriod.season3.value
    if the_date.month == 12 and the_date.day == 31:
        return ReportPeriod.year.value
    return None
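# Illustrative mapping (comments only): quarter-end dates resolve to a period
# type, anything else to None.
#   to_report_period_type('2021-03-31')  # ReportPeriod.season1.value
#   to_report_period_type('2021-12-31')  # ReportPeriod.year.value
#   to_report_period_type('2021-07-15')  # None (not a report date)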
def to_jq_report_period(timestamp):
    the_date = to_pd_timestamp(timestamp)
    report_period = to_report_period_type(timestamp)
    if report_period == ReportPeriod.year.value:
        return f'{the_date.year}'
    if report_period == ReportPeriod.season1.value:
        return f'{the_date.year}q1'
    if report_period == ReportPeriod.half_year.value:
        return f'{the_date.year}q2'
    if report_period == ReportPeriod.season3.value:
        return f'{the_date.year}q3'
    # a non-report date here is a programming error
    assert False
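# Illustrative mapping to joinquant-style period strings (comments only):
#   to_jq_report_period('2021-06-30')  # '2021q2'
#   to_jq_report_period('2021-12-31')  # '2021'
# Any date that is not a quarter end trips the assert above.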
def format(self, resp, exchange):
    df = None
    if exchange == ChnExchange.SSE.value:
        # df = pd.read_excel(io.BytesIO(resp.content), sheet_name='主板A股', dtype=str, parse_dates=['上市日期'])
        df = pd.read_csv(io.BytesIO(resp), sep='\t', encoding='GB2312',
                         dtype=str, parse_dates=['上市日期'])
        if df is not None:
            df.columns = [column.strip() for column in df.columns]
            df = df.loc[:, ['公司代码', '公司简称', '上市日期']]
    elif exchange == ChnExchange.SZSE.value:
        df = pd.read_excel(io.BytesIO(resp), sheet_name='A股列表',
                           dtype=str, parse_dates=['A股上市日期'])
        if df is not None:
            df = df.loc[:, ['A股代码', 'A股简称', 'A股上市日期']]

    if df is not None:
        df.columns = ['code', 'name', 'list_date']

        timestamp_str = self.get_original_time_field()

        # handle the dirty data
        # 600996,贵广网络,2016-12-26,2016-12-26,sh,stock,stock_sh_600996,,次新股,贵州,,
        df.loc[df['code'] == '600996', timestamp_str] = '2016-12-26'
        # print(df[df['list_date'] == '-'])
        df[timestamp_str] = df[timestamp_str].apply(lambda x: to_pd_timestamp(x))
        df.fillna({timestamp_str: to_pd_timestamp('1980-01-01')}, inplace=True)

        df['timestamp'] = df[timestamp_str]
        df['entity_type'] = EntityType.Stock.value
        df['exchange'] = exchange
        df['is_active'] = True
        df['code'] = df['code'].str.strip()
        df['id'] = self.generate_domain_id(exchange, df)
        df['entity_id'] = df['id']
        df = df.drop_duplicates(subset=['id'], keep='last')

    return df
def init_timestamps(self, entity, http_session): param = {"color": "w", "fc": get_fc(entity)} timestamp_json_list = call_eastmoney_api( http_session, url=self.timestamps_fetching_url, path_fields=self.timestamp_list_path_fields, params=param) if self.timestamp_path_fields and timestamp_json_list: timestamps = [ get_from_path_fields(data, self.timestamp_path_fields) for data in timestamp_json_list ] return [to_pd_timestamp(t) for t in timestamps] return []
def get_recent_report_date(the_date, step=0):
    the_date = to_pd_timestamp(the_date)
    assert step >= 0

    if the_date.month >= 10:
        recent = f"{the_date.year}-09-30"
    elif the_date.month >= 7:
        recent = f"{the_date.year}-06-30"
    elif the_date.month >= 4:
        recent = f"{the_date.year}-03-31"
    else:
        recent = f"{the_date.year - 1}-12-31"

    if step == 0:
        return recent
    # each extra step walks back one more report period
    return get_recent_report_date(recent, step - 1)
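# Illustrative walk-back (comments only): each step moves one report period
# earlier.
#   get_recent_report_date('2021-05-10')          # '2021-03-31'
#   get_recent_report_date('2021-05-10', step=1)  # '2020-12-31'
#   get_recent_report_date('2021-11-01', step=2)  # '2021-03-31'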
async def get_portfolio_stocks(region: Region,
                               provider: Provider,
                               timestamp,
                               portfolio_entity=Fund,
                               code=None,
                               codes=None,
                               ids=None):
    portfolio_stock = f'{portfolio_entity.__name__}Stock'
    data_schema: PortfolioStockHistory = get_schema_by_name(portfolio_stock)
    db_session = get_db_session(region, provider, data_schema)

    latests, column_names = data_schema.query_data(
        region=region,
        provider=provider,
        db_session=db_session,
        code=code,
        end_timestamp=timestamp,
        order=data_schema.timestamp.desc(),
        limit=1)

    if latests and len(latests) > 0:
        latest_record = latests[0]

        # fetch the latest report
        data, column_names = data_schema.query_data(
            region=region,
            provider=provider,
            db_session=db_session,
            code=code,
            codes=codes,
            ids=ids,
            end_timestamp=timestamp,
            filters=[data_schema.report_date == latest_record.report_date])

        if data and len(data) > 0:
            df = pd.DataFrame([s.__dict__ for s in data], columns=column_names)

            # the latest report is an annual or semi-annual report: use it as-is
            if latest_record.report_period == ReportPeriod.year or \
                    latest_record.report_period == ReportPeriod.half_year:
                return df
            # a quarterly report: combine it with the most recent annual or
            # semi-annual report to reconstruct the holdings
            else:
                step = 0
                while step <= 20:
                    report_date = get_recent_report_date(
                        latest_record.report_date, step=step)

                    data, column_names = data_schema.query_data(
                        region=region,
                        provider=provider,
                        db_session=db_session,
                        code=code,
                        codes=codes,
                        ids=ids,
                        end_timestamp=timestamp,
                        filters=[data_schema.report_date == to_pd_timestamp(report_date)])

                    if data and len(data) > 0:
                        pre_df = pd.DataFrame.from_records(
                            [s.__dict__ for s in data], columns=column_names)
                        # DataFrame.append was removed in pandas 2.0; use concat
                        df = pd.concat([df, pre_df])

                        # stop once an annual or semi-annual report is included
                        if (ReportPeriod.half_year.value in pre_df['report_period'].tolist()) or \
                                (ReportPeriod.year.value in pre_df['report_period'].tolist()):
                            # keep only the latest position for each stock
                            df = df.drop_duplicates(subset=['stock_code'], keep='first')
                            return df

                    step = step + 1
    # falls through and returns None if no anchor report is found
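# Illustrative call (Provider.EastMoney is an assumption; any configured
# provider works): fetch the reconstructed holdings of one fund as of a date.
#
#   df = await get_portfolio_stocks(Region.CHN, Provider.EastMoney,
#                                   timestamp='2021-06-30',
#                                   portfolio_entity=Fund,
#                                   code='000001')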
def __init__(self,
             region: Region,
             data_schema: Type[Mixin],
             entity_schema: Type[EntityMixin],
             provider: Provider = None,
             entity_ids: List[str] = None,
             exchanges: List[str] = None,
             codes: List[str] = None,
             the_timestamp: Union[str, pd.Timestamp] = None,
             start_timestamp: Union[str, pd.Timestamp] = None,
             end_timestamp: Union[str, pd.Timestamp] = None,
             columns: List = None,
             filters: List = None,
             order: object = None,
             limit: int = None,
             level: IntervalLevel = None,
             category_field: str = 'entity_id',
             time_field: str = 'timestamp',
             computing_window: int = None) -> None:
    self.logger = logging.getLogger(self.__class__.__name__)

    self.data_schema = data_schema
    self.entity_schema = entity_schema
    self.region = region
    self.provider = provider

    if end_timestamp is None:
        end_timestamp = now_pd_timestamp(self.region)

    self.the_timestamp = the_timestamp
    if the_timestamp:
        self.start_timestamp = the_timestamp
        self.end_timestamp = the_timestamp
    else:
        self.start_timestamp = start_timestamp
        self.end_timestamp = end_timestamp

    self.start_timestamp = to_pd_timestamp(self.start_timestamp)
    self.end_timestamp = to_pd_timestamp(self.end_timestamp)

    self.exchanges = exchanges

    if codes:
        # accept a list, a comma-separated string, or a JSON array string
        if isinstance(codes, str):
            codes = codes.replace(' ', '')
            if codes.startswith('[') and codes.endswith(']'):
                codes = json.loads(codes)
            else:
                codes = codes.split(',')
    self.codes = codes

    self.entity_ids = entity_ids
    self.filters = filters
    self.order = order
    self.limit = limit

    if level:
        self.level = IntervalLevel(level)
    else:
        self.level = level

    self.category_field = category_field
    self.time_field = time_field
    self.computing_window = computing_window

    # getattr is safer and clearer than eval for column lookup
    self.category_col = getattr(self.data_schema, self.category_field)
    self.time_col = getattr(self.data_schema, self.time_field)

    self.columns = columns

    # we store the data in a DataFrame with a (category_column, timestamp)
    # multi-index
    if self.columns:
        # columns may be passed as strings
        if isinstance(columns[0], str):
            self.columns = [getattr(data_schema, col) for col in columns]

        # always add the category column and time field for normalization
        self.columns = list(set(self.columns) | {self.category_col, self.time_col})

    self.data_listeners: List[DataListener] = []

    self.data_df: pd.DataFrame = None
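# Illustrative construction (the reader class name and the Stock1dKdata/Stock
# schema pair are assumptions standing in for any registered schemas):
#
#   reader = DataReader(region=Region.CHN,
#                       data_schema=Stock1dKdata,
#                       entity_schema=Stock,
#                       codes='["000001", "600000"]',  # JSON array string form
#                       start_timestamp='2021-01-01',
#                       columns=['close', 'volume'])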