def on_finish_entity(self, entity):
    super().on_finish_entity(entity)

    if not self.fetch_jq_timestamp:
        return

    # fill the timestamp for report published date
    the_data_list = get_data(
        data_schema=self.data_schema,
        provider=self.provider,
        entity_id=entity.id,
        order=self.data_schema.timestamp.asc(),
        return_type='domain',
        session=self.session,
        filters=[
            self.data_schema.timestamp == self.data_schema.report_date,
            self.data_schema.timestamp >= to_pd_timestamp('2005-01-01')
        ])
    if the_data_list:
        if self.data_schema == FinanceFactor:
            for the_data in the_data_list:
                self.fill_timestamp_with_jq(entity, the_data)
        else:
            df = FinanceFactor.query_data(
                entity_id=entity.id,
                columns=[
                    FinanceFactor.timestamp,
                    FinanceFactor.report_date,
                    FinanceFactor.id
                ],
                filters=[
                    FinanceFactor.timestamp != FinanceFactor.report_date,
                    FinanceFactor.timestamp >= to_pd_timestamp('2005-01-01'),
                    FinanceFactor.report_date >= the_data_list[0].report_date,
                    FinanceFactor.report_date <= the_data_list[-1].report_date,
                ])

            if pd_is_not_null(df):
                index_df(df, index='report_date', time_field='report_date')

            for the_data in the_data_list:
                if (df is not None) and (not df.empty) and the_data.report_date in df.index:
                    the_data.timestamp = df.at[the_data.report_date, 'timestamp']
                    self.logger.info(
                        'db fill {} {} timestamp:{} for report_date:{}'.format(
                            self.data_schema, entity.id,
                            the_data.timestamp, the_data.report_date))
                    self.session.commit()
                else:
                    # self.logger.info(
                    #     'waiting jq fill {} {} timestamp:{} for report_date:{}'.format(
                    #         self.data_schema, security_item.id,
                    #         the_data.timestamp, the_data.report_date))
                    self.fill_timestamp_with_jq(entity, the_data)
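
# Illustration (not part of the recorder, hedged reading of the code above):
# when a financial report is first recorded, timestamp is stored equal to
# report_date as a placeholder, which is exactly what the
# `timestamp == report_date` filter selects; fill_timestamp_with_jq then
# replaces it with the actual publish date. Before/after shape of one row:
#
#   before:  timestamp=2020-03-31, report_date=2020-03-31   # placeholder
#   after:   timestamp=2020-04-28, report_date=2020-03-31   # publish date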
def get_referenced_saved_record(self, entity):
    return get_data(region=self.region,
                    filters=[HkHolder.holder_code == entity.code],
                    provider=self.provider,
                    data_schema=self.data_schema,
                    columns=['id', self.get_evaluated_time_field()],
                    return_type='df')
def load_factor(self):
    # read state
    states: List[FactorState] = FactorState.query_data(
        filters=[FactorState.factor_name == self.factor_name],
        entity_ids=self.entity_ids,
        return_type='domain')
    if states:
        for state in states:
            self.states[state.entity_id] = self.decode_state(state.state)

    if self.dry_run:
        # if only computing the factor, we just need the acc_window rows of factor_df
        if self.accumulator is not None:
            self.factor_df = self.load_window_df(
                provider='zvt',
                data_schema=self.factor_schema,
                window=self.accumulator.acc_window)
    else:
        self.factor_df = get_data(
            provider='zvt',
            data_schema=self.factor_schema,
            start_timestamp=self.start_timestamp,
            entity_ids=self.entity_ids,
            end_timestamp=self.end_timestamp,
            index=[self.category_field, self.time_field])

    col_map_object_hook = self.factor_col_map_object_hook()
    if pd_is_not_null(self.factor_df) and col_map_object_hook:
        for col in col_map_object_hook:
            if col in self.factor_df.columns:
                self.factor_df[col] = self.factor_df[col].apply(
                    lambda x: json.loads(x, object_hook=col_map_object_hook.get(col))
                    if x else None)
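
# A minimal sketch (hypothetical names) of what factor_col_map_object_hook
# enables above: a persisted JSON column is decoded back into objects via
# json.loads(x, object_hook=...). Assuming a factor persisted a 'zen_state'
# column and a ZenState class that wraps the decoded dict:
import json


class ZenState:
    # hypothetical state object reconstructed from a JSON dict
    def __init__(self, d):
        self.__dict__.update(d)


col_map_object_hook = {'zen_state': lambda d: ZenState(d)}

raw = '{"direction": "up", "duan_count": 3}'
state = json.loads(raw, object_hook=col_map_object_hook['zen_state'])
assert state.direction == 'up'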
def get_latest_saved_record(self, entity):
    # step = time.time()

    order = eval('self.data_schema.{}.desc()'.format(self.get_evaluated_time_field()))

    # self.logger.info("get order: {}".format(time.time()-step))

    # for kline-like data the latest record may be unfinished, so fetch two
    # and always delete the newest one so that it gets re-recorded
    # self.logger.info("record info: {}, {}, {}".format(entity.id, order, self.level))

    records = get_data(region=self.region,
                       entity_id=entity.id,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       order=order,
                       limit=2,
                       return_type='domain',
                       session=self.session,
                       level=self.level)
    # self.logger.info("get record: {}".format(time.time()-step))
    if records:
        # delete unfinished kdata
        if len(records) == 2:
            if is_in_same_interval(t1=records[0].timestamp,
                                   t2=records[1].timestamp,
                                   level=self.level):
                self.session.delete(records[0])
                self.session.flush()
                return records[1]
        return records[0]
    return None
def get_position(region: Region,
                 trader_name=None,
                 return_type='df',
                 start_timestamp=None,
                 end_timestamp=None,
                 filters=None,
                 session=None,
                 order=None,
                 limit=None):
    if trader_name:
        if filters:
            filters = filters + [Position.trader_name == trader_name]
        else:
            filters = [Position.trader_name == trader_name]

    return get_data(region=region,
                    data_schema=Position,
                    entity_id=None,
                    codes=None,
                    level=None,
                    provider=Provider.ZVT,
                    columns=None,
                    return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp,
                    filters=filters,
                    session=session,
                    order=order,
                    limit=limit)
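
# A minimal usage sketch (assumptions: this fork defines Region.CHN and a
# trader named 'mytrader' has persisted positions):
positions_df = get_position(region=Region.CHN,
                            trader_name='mytrader',
                            start_timestamp='2021-01-01')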
def get_trader_info(trader_name=None,
                    return_type='df',
                    start_timestamp=None,
                    end_timestamp=None,
                    filters=None,
                    session=None,
                    order=None,
                    limit=None) -> List[trader_info.TraderInfo]:
    if trader_name:
        if filters:
            filters = filters + [trader_info.TraderInfo.trader_name == trader_name]
        else:
            filters = [trader_info.TraderInfo.trader_name == trader_name]

    return get_data(data_schema=trader_info.TraderInfo,
                    entity_id=None,
                    codes=None,
                    level=None,
                    provider='zvt',
                    columns=None,
                    return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp,
                    filters=filters,
                    session=session,
                    order=order,
                    limit=limit)
def load_factor(self):
    if self.dry_run:
        #: if only computing the factor, we just need the acc_window rows of factor_df
        if self.accumulator is not None:
            self.factor_df = self.load_window_df(
                provider="zvt",
                data_schema=self.factor_schema,
                window=self.accumulator.acc_window)
    else:
        self.factor_df = get_data(
            provider="zvt",
            data_schema=self.factor_schema,
            start_timestamp=self.start_timestamp,
            entity_ids=self.entity_ids,
            end_timestamp=self.end_timestamp,
            index=[self.category_field, self.time_field],
        )

    col_map_object_hook = self.factor_col_map_object_hook()
    if pd_is_not_null(self.factor_df) and col_map_object_hook:
        for col in col_map_object_hook:
            if col in self.factor_df.columns:
                self.factor_df[col] = self.factor_df[col].apply(
                    lambda x: json.loads(x, object_hook=col_map_object_hook.get(col))
                    if x else None)
def record(self, entity, start, end, size, timestamps):
    try:
        industry_stocks = get_industry_stocks(entity.code, date=now_pd_timestamp())
    except Exception:
        industry_stocks = get_concept_stocks(entity.code, date=now_pd_timestamp())

    if len(industry_stocks) == 0:
        return None

    df = pd.DataFrame({"stock": industry_stocks})
    df["stock_id"] = df.stock.apply(lambda x: to_entity_id(x, "stock"))
    df["stock_code"] = df.stock_id.str.split("_", expand=True)[2]
    df["stock_name"] = df.stock_id.apply(
        lambda x: get_data(data_schema=Stock, entity_id=x, provider='joinquant').name)
    df["block_type"] = entity.block_type
    df["code"] = entity.code
    df["name"] = entity.name
    df["exchange"] = entity.exchange
    df["timestamp"] = now_pd_timestamp()
    df["entity_id"] = entity.id
    df["entity_type"] = "block"
    df["id"] = df.apply(lambda x: x.entity_id + "_" + x.stock_id, axis=1)
    if df.empty:
        return None
    df_to_db(data_schema=self.data_schema,
             df=df,
             provider=self.provider,
             force_update=True)

    self.logger.info('finish recording BlockStock:{},{}'.format(entity.category, entity.name))
def get_referenced_saved_record(self, entity):
    return get_data(region=self.region,
                    entity_id=entity.id,
                    provider=self.provider,
                    data_schema=self.data_schema,
                    columns=['id', self.get_evaluated_time_field()],
                    return_type='df')
def get_latest_saved_record(self, entity): order = eval("self.data_schema.{}.desc()".format( self.get_evaluated_time_field())) #: 对于k线这种数据,最后一个记录有可能是没完成的,所以取两个 #: 同一周期内只保留最新的一个数据 records = get_data( entity_id=entity.id, provider=self.provider, data_schema=self.data_schema, order=order, limit=2, return_type="domain", session=self.session, level=self.level, ) if records: #: delete unfinished kdata if len(records) == 2: if is_in_same_interval(t1=records[0].timestamp, t2=records[1].timestamp, level=self.level): self.session.delete(records[1]) self.session.flush() return records[0] return None
def generate_domain(self, entity, original_data):
    """
    generate the data_schema instance from entity and original_data;
    original_data comes from the record result

    :param entity:
    :param original_data:
    """
    got_new_data = False

    #: if the domain is directly generated in record method, we just return it
    if isinstance(original_data, self.data_schema):
        got_new_data = True
        return got_new_data, original_data

    the_id = self.generate_domain_id(entity, original_data)

    #: optional way
    #: item = self.session.query(self.data_schema).get(the_id)
    items = get_data(
        data_schema=self.data_schema,
        session=self.session,
        provider=self.provider,
        entity_id=entity.id,
        filters=[self.data_schema.id == the_id],
        return_type="domain",
    )

    if items and not self.force_update:
        self.logger.info("ignore the data {}:{} saved before".format(self.data_schema, the_id))
        return got_new_data, None

    if not items:
        timestamp_str = original_data[self.get_original_time_field()]
        timestamp = None
        try:
            timestamp = to_pd_timestamp(timestamp_str)
        except Exception as e:
            self.logger.exception(e)

        if "name" in get_schema_columns(self.data_schema):
            domain_item = self.data_schema(id=the_id,
                                           code=entity.code,
                                           name=entity.name,
                                           entity_id=entity.id,
                                           timestamp=timestamp)
        else:
            domain_item = self.data_schema(id=the_id,
                                           code=entity.code,
                                           entity_id=entity.id,
                                           timestamp=timestamp)
        got_new_data = True
    else:
        domain_item = items[0]

    fill_domain_from_dict(domain_item, original_data, self.get_data_map())
    return got_new_data, domain_item
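
# Illustration (hedged): the four possible outcomes of generate_domain above.
#   (True, original_data)  - record() already returned a schema instance
#   (False, None)          - the id was saved before and force_update is off
#   (True, domain_item)    - a brand-new instance, filled from original_data
#   (False, domain_item)   - an existing instance refreshed under force_update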
def get_account_stats(trader_name=None,
                      return_type='df',
                      start_timestamp=None,
                      end_timestamp=None,
                      filters=None,
                      session=None,
                      order=None,
                      limit=None):
    if trader_name:
        if filters:
            filters = filters + [AccountStats.trader_name == trader_name]
        else:
            filters = [AccountStats.trader_name == trader_name]

    return get_data(data_schema=AccountStats,
                    entity_id=None,
                    codes=None,
                    level=None,
                    provider='zvt',
                    columns=None,
                    return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp,
                    filters=filters,
                    session=session,
                    order=order,
                    limit=limit)
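
# A minimal usage sketch (assumption: a trader named 'mytrader' has persisted
# account stats): fetch its most recent snapshot as domain objects.
latest_stats = get_account_stats(trader_name='mytrader',
                                 order=AccountStats.timestamp.desc(),
                                 limit=1,
                                 return_type='domain')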
def get_latest_saved_record(self, entity):
    order = eval('self.data_schema.{}.desc()'.format(self.get_evaluated_time_field()))

    records = get_data(entity_id=entity.id,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       order=order,
                       limit=1,
                       return_type='domain',
                       session=self.session)
    if records:
        return records[0]
    return None
def get_latest_saved_record(self, entity):
    order = eval('self.data_schema.{}.desc()'.format(self.get_evaluated_time_field()))

    records = get_data(region=self.region,
                       filters=[HkHolder.holder_code == entity.code],
                       provider=self.provider,
                       data_schema=self.data_schema,
                       order=order,
                       limit=1,
                       return_type='domain',
                       session=self.session)
    if records:
        return records[0]
    return None
def evaluate_start_end_size_timestamps(self, entity):
    # get the latest saved record
    latest_record = get_data(entity_id=entity.id,
                             provider=self.provider,
                             data_schema=self.data_schema,
                             order=self.data_schema.timestamp.desc(),
                             limit=1,
                             return_type='domain',
                             session=self.session)

    if latest_record:
        remote_record = self.get_remote_latest_record(entity)
        if not remote_record or (latest_record[0].id == remote_record.id):
            return None, None, 0, None
        else:
            return None, None, 10, None
    return None, None, 1000, None
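
# Note (hedged reading of the code above): the recorder consumes the tuple
# (start, end, size, timestamps). size=0 means the local latest record already
# matches the remote one, so nothing is fetched; size=10 requests a small
# incremental catch-up; size=1000 requests the initial full history.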
def __init__(self,
             data_schema: Type[Mixin],
             entity_schema: Type[TradableEntity] = None,
             provider: str = None,
             entity_provider: str = None,
             entity_ids: List[str] = None,
             exchanges: List[str] = None,
             codes: List[str] = None,
             the_timestamp: Union[str, pd.Timestamp] = None,
             start_timestamp: Union[str, pd.Timestamp] = None,
             end_timestamp: Union[str, pd.Timestamp] = None,
             columns: List = None,
             filters: List = None,
             order: object = None,
             limit: int = None,
             level: Union[str, IntervalLevel] = None,
             category_field: str = 'entity_id',
             time_field: str = 'timestamp',
             computing_window: int = None,
             # child added arguments
             keep_all_timestamp: bool = False,
             fill_method: str = 'ffill',
             effective_number: int = None,
             transformer: Transformer = None,
             accumulator: Accumulator = None,
             need_persist: bool = False,
             dry_run: bool = False,
             factor_name: str = None,
             clear_state: bool = False,
             not_load_data: bool = False) -> None:
    """
    :param computing_window: the window size for computing the factor
    :param keep_all_timestamp: whether to fill all timestamp gaps, default False
    :param fill_method:
    :param effective_number:
    :param transformer:
    :param accumulator:
    :param need_persist: whether to persist the factor
    :param dry_run: True for just computing the factor, False for backtesting
    """
    self.not_load_data = not_load_data
    super().__init__(data_schema, entity_schema, provider, entity_provider,
                     entity_ids, exchanges, codes, the_timestamp,
                     start_timestamp, end_timestamp, columns, filters, order,
                     limit, level, category_field, time_field, computing_window)

    # define a unique name for your factor if you want to keep factor state;
    # the factor state is identified by factor_name and entity_id
    if not factor_name:
        self.factor_name = type(self).__name__.lower()
    else:
        self.factor_name = factor_name

    self.clear_state = clear_state
    self.keep_all_timestamp = keep_all_timestamp
    self.fill_method = fill_method
    self.effective_number = effective_number

    if transformer:
        self.transformer = transformer
    else:
        self.transformer = self.__class__.transformer

    if accumulator:
        self.accumulator = accumulator
    else:
        self.accumulator = self.__class__.accumulator

    self.need_persist = need_persist
    self.dry_run = dry_run

    # intermediate result, not persisted
    # data_df->pipe_df
    self.pipe_df: pd.DataFrame = None

    # the computed factor, persistable, derived from pipe_df
    # pipe_df->factor_df
    self.factor_df: pd.DataFrame = None

    # result_df is the standard df used for stock picking, derived from factor_df
    # factor_df->result_df
    self.result_df: pd.DataFrame = None

    # entity_id:state
    self.states: dict = {}

    if self.clear_state:
        self.clear_state_data()
    elif self.need_persist:
        self.load_factor()

        # trim data_df according to the already computed factor_df and computing_window;
        # data_df is only read to compute factor_df, and stock picking and backtesting
        # depend only on factor_df, so when factor_df is persisted we only need to keep
        # the slice of data_df required for computing
        if pd_is_not_null(self.data_df) and self.computing_window:
            dfs = []
            for entity_id, df in self.data_df.groupby(level=0):
                latest_saved = get_data(provider='zvt',
                                        data_schema=self.factor_schema,
                                        entity_id=entity_id,
                                        order=self.factor_schema.timestamp.desc(),
                                        limit=1,
                                        index=[self.category_field, self.time_field],
                                        return_type='domain')
                if latest_saved:
                    df1 = df[df.timestamp < latest_saved[0].timestamp].iloc[-self.computing_window:]
                    if pd_is_not_null(df1):
                        df = df[df.timestamp >= df1.iloc[0].timestamp]
                dfs.append(df)

            self.data_df = pd.concat(dfs)

    self.register_data_listener(self)

    # the compute logic is not triggered by loading data
    # for the case: 1) load factor from db 2) compute the result
    if self.not_load_data:
        self.compute()
def __init__(
    self,
    data_schema: Type[Mixin],
    entity_schema: Type[TradableEntity] = None,
    provider: str = None,
    entity_provider: str = None,
    entity_ids: List[str] = None,
    exchanges: List[str] = None,
    codes: List[str] = None,
    start_timestamp: Union[str, pd.Timestamp] = None,
    end_timestamp: Union[str, pd.Timestamp] = None,
    columns: List = None,
    filters: List = None,
    order: object = None,
    limit: int = None,
    level: Union[str, IntervalLevel] = IntervalLevel.LEVEL_1DAY,
    category_field: str = "entity_id",
    time_field: str = "timestamp",
    computing_window: int = None,
    keep_all_timestamp: bool = False,
    fill_method: str = "ffill",
    effective_number: int = None,
    transformer: Transformer = None,
    accumulator: Accumulator = None,
    need_persist: bool = False,
    only_compute_factor: bool = False,
    factor_name: str = None,
    clear_state: bool = False,
    only_load_factor: bool = False,
) -> None:
    """
    :param keep_all_timestamp:
    :param fill_method:
    :param effective_number:
    :param transformer:
    :param accumulator:
    :param need_persist: whether to persist the factor
    :param only_compute_factor: only compute the factor, not the result
    :param factor_name:
    :param clear_state:
    :param only_load_factor: only load the factor and compute the result
    """
    self.only_load_factor = only_load_factor

    #: define a unique name for your factor if you want to keep factor state;
    #: the factor state is identified by factor_name and entity_id
    if not factor_name:
        self.name = to_snake_str(type(self).__name__)
    else:
        self.name = factor_name

    DataReader.__init__(
        self,
        data_schema,
        entity_schema,
        provider,
        entity_provider,
        entity_ids,
        exchanges,
        codes,
        start_timestamp,
        end_timestamp,
        columns,
        filters,
        order,
        limit,
        level,
        category_field,
        time_field,
        computing_window,
    )

    EntityStateService.__init__(self, entity_ids=entity_ids)

    self.clear_state = clear_state

    self.keep_all_timestamp = keep_all_timestamp
    self.fill_method = fill_method
    self.effective_number = effective_number

    if transformer:
        self.transformer = transformer
    else:
        self.transformer = self.__class__.transformer

    if accumulator:
        self.accumulator = accumulator
    else:
        self.accumulator = self.__class__.accumulator

    self.need_persist = need_persist
    self.dry_run = only_compute_factor

    #: intermediate result, not persisted
    #: data_df->pipe_df
    self.pipe_df: pd.DataFrame = None

    #: the computed factor, persistable, derived from pipe_df
    #: pipe_df->factor_df
    self.factor_df: pd.DataFrame = None

    #: result_df is the standard df used for stock picking, derived from factor_df
    #: factor_df->result_df
    self.result_df: pd.DataFrame = None

    if self.clear_state:
        self.clear_state_data()
    elif self.need_persist or self.only_load_factor:
        self.load_factor()

        #: trim data_df according to the already computed factor_df and computing_window;
        #: data_df is only read to compute factor_df, and stock picking and backtesting
        #: depend only on factor_df, so when factor_df is persisted we only need to keep
        #: the slice of data_df required for computing
        if pd_is_not_null(self.data_df) and self.computing_window:
            dfs = []
            for entity_id, df in self.data_df.groupby(level=0):
                latest_saved = get_data(
                    provider="zvt",
                    data_schema=self.factor_schema,
                    entity_id=entity_id,
                    order=self.factor_schema.timestamp.desc(),
                    limit=1,
                    index=[self.category_field, self.time_field],
                    return_type="domain",
                )
                if latest_saved:
                    df1 = df[df.timestamp < latest_saved[0].timestamp].iloc[-self.computing_window:]
                    if pd_is_not_null(df1):
                        df = df[df.timestamp >= df1.iloc[0].timestamp]
                dfs.append(df)

            self.data_df = pd.concat(dfs)

    self.register_data_listener(self)

    #: the compute logic is not triggered by loading data
    #: for the case: 1) load factor from db 2) compute the result
    if self.only_load_factor:
        self.compute()
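
# A minimal usage sketch (assumptions: MyFactor is a concrete subclass of this
# class with a transformer/accumulator defined, and Stock1dKdata/Stock are the
# usual kdata and entity schemas):
factor = MyFactor(
    data_schema=Stock1dKdata,
    entity_schema=Stock,
    provider="joinquant",
    codes=["000338"],
    start_timestamp="2020-01-01",
    need_persist=True,  # keep factor_df and state in db
)
print(factor.result_df)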
def record2(self, entity, start, end, size, timestamps):
    # imported at the top of the function so both 30-day windows below can use it
    from datetime import timedelta

    if not end:
        end = to_time_str(now_pd_timestamp())

    if (pd.to_datetime(end) - start).days >= 30:
        end = to_time_str(start + timedelta(days=30))
    start = to_time_str(start)
    if start == end:
        return None

    # HK stocks are not handled for now
    if 'hk' in entity.id:
        return None
    exchange = 'SH' if 'sh' in entity.id else 'SZ'
    em_code = entity.code + '.' + exchange
    columns_list = {
        'TOTALSHARE': 'capitalization',       # total share capital
        'LIQSHARE': 'circulating_cap',        # circulating share capital
        'MV': 'market_cap',                   # total market cap
        'LIQMV': 'circulating_market_cap',    # circulating market cap
        'TURN': 'turnover_ratio',             # turnover ratio
        'PELYR': 'pe',                        # static pe
        'PETTM': 'pe_ttm',                    # dynamic pe
        'PBLYR': 'pb',                        # PB (latest annual report)
        'PBMRQ': 'pb_mrq',                    # PB (MRQ)
        'PSTTM': 'ps_ttm',                    # PS (TTM)
        'PCFTTM': 'pcf_ttm',                  # PCF (latest annual report, operating cash flow)
    }
    # df = c.csd(em_code, [i for i in columns_list.keys()], start, end, "ispandas=1,DelType=2")
    df = get_data(data_schema=StockValuation,
                  entity_id=entity.id,
                  provider='joinquant',
                  start_timestamp=start,
                  end_timestamp=end)
    if df.empty:
        df = get_data(data_schema=StockValuation,
                      entity_id=entity.id,
                      provider='joinquant',
                      limit=1)
        start = df.timestamp[0]
        end = to_time_str(start + timedelta(days=30))
        df = get_data(data_schema=StockValuation,
                      entity_id=entity.id,
                      provider='joinquant',
                      start_timestamp=start,
                      end_timestamp=end)
        if df.empty:
            return None

    df.rename(columns={
        "ps": "ps_ttm",
        "pcf": "pcf_ttm",
    }, inplace=True)

    trade_day = StockTradeDay.query_data(order=StockTradeDay.timestamp.desc(),
                                         start_timestamp=start,
                                         end_timestamp=end)
    df_capital_all = pd.DataFrame()
    for tradeday in trade_day.timestamp:
        df_capital = c.css(em_code, "WACC,DIVIDENDYIELDNEW",
                           f"TradeDate={to_time_str(tradeday)},FrIndex=1,MrIndex=1,ispandas=1")
        try:
            df_capital['DATES'] = tradeday
        except Exception:
            continue
        df_capital_all = pd.concat([df_capital_all, df_capital])
    # 'DIVIDENDYIELDNEW': 'div_yield'  # dividend yield

    try:
        if df.empty:
            return None
    except Exception:
        self.logger.info(f'stock valuation data from the choice source is not ready yet, fetch failed. '
                         f'code:{em_code}-start:{start}-end:{end}')
        return None

    df['CODES'] = df_capital_all.index[0]
    df['DATES'] = df['timestamp']
    df_capital_all['DATES'] = pd.to_datetime(df_capital_all['DATES'])
    df_capital_all.rename(columns={"DIVIDENDYIELDNEW": "div_yield", "WACC": "wacc"}, inplace=True)
    df = pd.merge(df, df_capital_all, on=['CODES', 'DATES'], how='outer')
    df.dropna(subset=['id'], inplace=True)
    df_to_db(df=df,
             data_schema=self.data_schema,
             provider=self.provider,
             force_update=self.force_update)
    return None
def __init__(self,
             data_schema: Type[Mixin],
             entity_schema: Type[EntityMixin] = None,
             provider: str = None,
             entity_provider: str = None,
             entity_ids: List[str] = None,
             exchanges: List[str] = None,
             codes: List[str] = None,
             the_timestamp: Union[str, pd.Timestamp] = None,
             start_timestamp: Union[str, pd.Timestamp] = None,
             end_timestamp: Union[str, pd.Timestamp] = None,
             columns: List = None,
             filters: List = None,
             order: object = None,
             limit: int = None,
             level: Union[str, IntervalLevel] = None,
             category_field: str = 'entity_id',
             time_field: str = 'timestamp',
             computing_window: int = None,
             # child added arguments
             keep_all_timestamp: bool = False,
             fill_method: str = 'ffill',
             effective_number: int = None,
             transformer: Transformer = None,
             accumulator: Accumulator = None,
             need_persist: bool = False,
             dry_run: bool = False) -> None:
    """
    :param computing_window: the window size for computing the factor
    :param keep_all_timestamp: whether to fill all timestamp gaps, default False
    :param fill_method:
    :param effective_number:
    :param transformer:
    :param accumulator:
    :param need_persist: whether to persist the factor
    :param dry_run: True for just computing the factor, False for backtesting
    """
    super().__init__(data_schema, entity_schema, provider, entity_provider,
                     entity_ids, exchanges, codes, the_timestamp,
                     start_timestamp, end_timestamp, columns, filters, order,
                     limit, level, category_field, time_field, computing_window)

    self.factor_name = type(self).__name__.lower()

    self.keep_all_timestamp = keep_all_timestamp
    self.fill_method = fill_method
    self.effective_number = effective_number
    self.transformer = transformer
    self.accumulator = accumulator

    self.need_persist = need_persist
    self.dry_run = dry_run

    # intermediate result, not persisted
    # data_df->pipe_df
    self.pipe_df: pd.DataFrame = None

    # the computed factor, persistable, derived from pipe_df
    # pipe_df->factor_df
    self.factor_df: pd.DataFrame = None

    # result_df is the standard df used for stock picking, derived from factor_df
    # factor_df->result_df
    self.result_df: pd.DataFrame = None

    # the feature of persisting factors is not mature yet and may change later
    if self.need_persist:
        if self.dry_run:
            # if only computing the factor, we just need the acc_window rows of factor_df
            if self.accumulator is not None:
                self.factor_df = self.load_window_df(provider='zvt',
                                                     data_schema=self.factor_schema,
                                                     window=accumulator.acc_window)
        else:
            self.factor_df = get_data(provider='zvt',
                                      data_schema=self.factor_schema,
                                      start_timestamp=self.start_timestamp,
                                      end_timestamp=self.end_timestamp,
                                      index=[self.category_field, self.time_field])

        # trim data_df according to the already computed factor_df and computing_window;
        # data_df is only read to compute factor_df, and stock picking and backtesting
        # depend only on factor_df, so when factor_df is persisted we only need to keep
        # the slice of data_df required for computing
        if pd_is_not_null(self.data_df) and self.computing_window:
            dfs = []
            for entity_id, df in self.data_df.groupby(level=0):
                latest_saved = get_data(provider='zvt',
                                        data_schema=self.factor_schema,
                                        entity_id=entity_id,
                                        order=self.factor_schema.timestamp.desc(),
                                        limit=1,
                                        index=[self.category_field, self.time_field],
                                        return_type='domain')
                if latest_saved:
                    df1 = df[df.timestamp < latest_saved[0].timestamp].iloc[-self.computing_window:]
                    if pd_is_not_null(df1):
                        df = df[df.timestamp >= df1.iloc[0].timestamp]
                dfs.append(df)

            self.data_df = pd.concat(dfs)

    self.register_data_listener(self)