def on_finish_entity(self, entity):
    """Post-process one entity after recording finishes.

    Back-fills ``timestamp`` (the report *published* date) for finance
    records whose timestamp is still equal to ``report_date``, either from
    already-stored FinanceFactor rows or via joinquant
    (``fill_timestamp_with_jq``).  No-op unless ``self.fetch_jq_timestamp``
    is set.
    """
    super().on_finish_entity(entity)
    if not self.fetch_jq_timestamp:
        return

    # Candidates to fix: rows where timestamp == report_date, i.e. the
    # published date has not been filled yet.  Data before 2005 is ignored.
    the_data_list = get_data(
        data_schema=self.data_schema,
        provider=self.provider,
        entity_id=entity.id,
        order=self.data_schema.timestamp.asc(),
        return_type='domain',
        session=self.session,
        filters=[
            self.data_schema.timestamp == self.data_schema.report_date,
            self.data_schema.timestamp >= to_pd_timestamp('2005-01-01')
        ])
    if the_data_list:
        if self.data_schema == FinanceFactor:
            # FinanceFactor itself has nothing local to copy from; go
            # straight to the remote (jq) source for every row.
            for the_data in the_data_list:
                self.fill_timestamp_with_jq(entity, the_data)
        else:
            # Other schemas: reuse published dates already resolved on
            # FinanceFactor rows covering the same report_date range.
            df = FinanceFactor.query_data(
                entity_id=entity.id,
                columns=[
                    FinanceFactor.timestamp, FinanceFactor.report_date,
                    FinanceFactor.id
                ],
                filters=[
                    FinanceFactor.timestamp != FinanceFactor.report_date,
                    FinanceFactor.timestamp >= to_pd_timestamp('2005-01-01'),
                    FinanceFactor.report_date >= the_data_list[0].report_date,
                    FinanceFactor.report_date <= the_data_list[-1].report_date,
                ])
            if pd_is_not_null(df):
                # NOTE(review): the return value of index_df is discarded
                # here — verify that index_df mutates df in place, otherwise
                # the report_date index used below is never set.
                index_df(df, index='report_date', time_field='report_date')

            for the_data in the_data_list:
                if (df is not None) and (
                        not df.empty) and the_data.report_date in df.index:
                    # Copy the published date from the matching
                    # FinanceFactor row and persist immediately.
                    the_data.timestamp = df.at[the_data.report_date,
                                               'timestamp']
                    self.logger.info(
                        'db fill {} {} timestamp:{} for report_date:{}'.
                        format(self.data_schema, entity.id,
                               the_data.timestamp, the_data.report_date))
                    self.session.commit()
                else:
                    # Not resolvable locally — fall back to the jq source.
                    self.fill_timestamp_with_jq(entity, the_data)
def run(self):
    """Combine factor results into the selector's final ``result_df``.

    Filter factors are AND-ed together into ``self.must_result`` (boolean
    'score' column); score factors are averaged per factor, then summed
    into ``self.score_result``.  The final selection keeps rows that pass
    the must filter and/or reach ``self.threshold`` on the score.

    NOTE(review): if neither ``filter_factors`` nor ``score_factors`` is
    set, the final ``else`` branch reads ``self.must_result`` — presumably
    initialized to None elsewhere on the class; confirm that case cannot
    occur or is handled by the caller.
    """
    if self.filter_factors:
        musts = []
        for factor in self.filter_factors:
            df = factor.get_result_df()
            if len(df.columns) > 1:
                # Multi-column boolean result: AND across columns into a
                # single 'score' column.
                s = df.agg("and", axis="columns")
                s.name = 'score'
                musts.append(s.to_frame(name='score'))
            else:
                df.columns = ['score']
                musts.append(df)
        # AND all filter-factor frames together; only the final
        # accumulated frame matters.
        self.must_result = list(accumulate(musts, func=operator.__and__))[-1]

    if self.score_factors:
        scores = []
        for factor in self.score_factors:
            df = factor.get_result_df()
            if len(df.columns) > 1:
                # Multi-column numeric result: average across columns.
                s = df.agg("mean", axis="columns")
                s.name = 'score'
                scores.append(s.to_frame(name='score'))
            else:
                df.columns = ['score']
                scores.append(df)
        # Sum all score-factor frames; keep the final accumulated frame.
        self.score_result = list(accumulate(scores, func=operator.__add__))[-1]

    if df_is_not_null(self.must_result) and df_is_not_null(
            self.score_result):
        # Both present: rows must pass the filter AND reach the threshold.
        result1 = self.must_result[self.must_result.score]
        result2 = self.score_result[
            self.score_result.score >= self.threshold]
        result = result2.loc[result1.index, :]
    elif df_is_not_null(self.score_result):
        result = self.score_result[
            self.score_result.score >= self.threshold]
    else:
        result = self.must_result[self.must_result.score]

    self.result_df = result.reset_index()
    self.result_df = index_df(self.result_df)
# Select coins with a recent TSI long signal and report them.
factor = TSIFactor(entity_schema=Coin,
                   entity_ids=entity_ids,
                   provider='ccxt',
                   level=IntervalLevel.LEVEL_1DAY,
                   start_timestamp=start_date,
                   need_persist=False)
df = factor.result_df

# Collapse a multi-column boolean result into a single 'score' column.
musts = []
if len(df.columns) > 1:
    s = df.agg("and", axis="columns")
    s.name = 'score'
    musts.append(s.to_frame(name='score'))
else:
    df.columns = ['score']
    musts.append(df)

signal_in_last_n_day_num = 14
# NOTE(review): filter_result is computed but never used below — the code
# filters `df` directly instead; confirm whether this is dead code.
filter_result = list(accumulate(musts, func=operator.__and__))[-1]

# NOTE(review): `df.score` assumes the single-column (renamed) case; in the
# multi-column branch above `df` keeps its original columns — verify.
long_result = df[df.score == True]
long_result = long_result.reset_index()
long_result = index_df(long_result)
long_result = long_result.sort_values(by=['score', 'entity_id'])
# Keep only signals fired within the last N days before target_date.
long_result = long_result[long_result.timestamp > target_date - timedelta(signal_in_last_n_day_num)]

longdf = factor.factor_df[factor.factor_df['entity_id'].isin(long_result['entity_id'].tolist())]
good_coins = set(long_result['entity_id'].tolist())

coins = get_entities(provider='ccxt',
                     entity_schema=Coin,
                     entity_ids=good_coins,
                     return_type='domain')
codeList = []
for coin in coins:
    codeList.append(to_tradingview_code(coin.code, coin.exchange))

info = [f'{coin}' for coin in codeList]
msg = '选币:' + ' '.join(info) + '\n'
logger.info(msg)
# add_list_to_group(codeList, group_id=19580865, entity_type='coin')
def get_data(data_schema, security_list=None, security_id=None, codes=None,
             level=None, provider='eastmoney', columns=None, return_type='df',
             start_timestamp=None, end_timestamp=None, filters=None,
             session=None, order=None, limit=None, index='timestamp',
             index_is_time=True):
    """Query ``data_schema`` records with optional security/level filtering.

    :param data_schema: ORM schema class to query
    :param security_list: restrict to these security_ids
    :param security_id: restrict to a single security_id
    :param codes: restrict to these codes
    :param level: trading level filter; ignored if the schema has no
        ``level`` column
    :param provider: data provider used to locate the db session
    :param columns: optional column list; the schema's timestamp column is
        always included so the result can be indexed by time
    :param return_type: 'df', 'domain' or 'dict'
    :param start_timestamp: passed to common_filter
    :param end_timestamp: passed to common_filter
    :param filters: extra sqlalchemy filter expressions
    :param session: db session; if omitted one is created (and closed) here
    :param order: passed to common_filter
    :param limit: passed to common_filter
    :param index: index column for the 'df' return type
    :param index_is_time: whether the index column is a time column
    :return: DataFrame (or None when the query is empty), domain objects,
        or list of dicts, depending on ``return_type``
    """
    local_session = False
    if not session:
        store_category = get_store_category(data_schema)
        session = get_db_session(provider=provider,
                                 store_category=store_category)
        local_session = True
    # try/finally (rather than try/except-raise/finally): exceptions
    # propagate unchanged, a locally-created session is always closed.
    try:
        if columns:
            # Copy before appending so the caller's list is never mutated.
            columns = list(columns)
            # make sure we always fetch the timestamp column
            if data_schema.timestamp not in columns:
                columns.append(data_schema.timestamp)
            query = session.query(*columns)
        else:
            query = session.query(data_schema)

        if security_id:
            query = query.filter(data_schema.security_id == security_id)
        if codes:
            query = query.filter(data_schema.code.in_(codes))
        if security_list:
            query = query.filter(data_schema.security_id.in_(security_list))

        # we always store different level in different schema,
        # the level param is not useful now
        if level:
            try:
                # some schema has no level, just ignore it
                data_schema.level
                if type(level) == TradingLevel:
                    level = level.value
                query = query.filter(data_schema.level == level)
            except Exception:
                pass

        query = common_filter(query, data_schema=data_schema,
                              start_timestamp=start_timestamp,
                              end_timestamp=end_timestamp, filters=filters,
                              order=order, limit=limit)

        if return_type == 'df':
            df = pd.read_sql(query.statement, query.session.bind)
            if df_is_not_null(df):
                return index_df(df, drop=False, index=index,
                                index_is_time=index_is_time)
            # empty result -> None (preserved behavior; callers check
            # df_is_not_null on the return value)
        elif return_type == 'domain':
            return query.all()
        elif return_type == 'dict':
            return [item.__dict__ for item in query.all()]
    finally:
        if local_session:
            session.close()
def normalize_result_df(self, df):
    """Re-index the result frame and order it by (score, entity_id).

    A null/empty frame is returned untouched.
    """
    if not pd_is_not_null(df):
        return df
    normalized = index_df(df.reset_index())
    return normalized.sort_values(by=['score', 'entity_id'])
def get_data(data_schema,
             ids: List[str] = None,
             entity_ids: List[str] = None,
             entity_id: str = None,
             codes: List[str] = None,
             code: str = None,
             level: Union[IntervalLevel, str] = None,
             provider: str = None,
             columns: List = None,
             col_label: dict = None,
             return_type: str = 'df',
             start_timestamp: Union[pd.Timestamp, str] = None,
             end_timestamp: Union[pd.Timestamp, str] = None,
             filters: List = None,
             session: Session = None,
             order=None,
             limit: int = None,
             index: Union[str, list] = None,
             time_field: str = 'timestamp'):
    """Query ``data_schema`` records for the given provider.

    :param columns: column list; strings are resolved to schema attributes,
        and the ``time_field`` column is always appended so results can be
        time-indexed
    :param col_label: dict mapping column name -> label for the query
    :param return_type: 'df', 'domain' or 'dict'
    :param index: index field name(s) applied when returning a DataFrame
    :param time_field: name of the schema's time column
    """
    assert data_schema is not None
    assert provider is not None
    assert provider in zvt_context.providers

    if not session:
        session = get_db_session(provider=provider, data_schema=data_schema)

    # Resolve the time column from its name.
    # NOTE(review): eval on a format string — equivalent to
    # getattr(data_schema, time_field); getattr would be safer.
    time_col = eval('data_schema.{}'.format(time_field))

    if columns:
        # support str columns
        if type(columns[0]) == str:
            columns_ = []
            for col in columns:
                assert isinstance(col, str)
                columns_.append(eval('data_schema.{}'.format(col)))
            columns = columns_

        # make sure we always fetch the timestamp column
        # NOTE(review): this append mutates the caller's list when columns
        # were passed as schema attributes (no copy was made above).
        if time_col not in columns:
            columns.append(time_col)

        if col_label:
            # Apply labels where requested, keep other columns as-is.
            columns_ = []
            for col in columns:
                if col.name in col_label:
                    columns_.append(col.label(col_label.get(col.name)))
                else:
                    columns_.append(col)
            columns = columns_

        query = session.query(*columns)
    else:
        query = session.query(data_schema)

    if entity_id:
        query = query.filter(data_schema.entity_id == entity_id)
    if entity_ids:
        query = query.filter(data_schema.entity_id.in_(entity_ids))
    if code:
        query = query.filter(data_schema.code == code)
    if codes:
        query = query.filter(data_schema.code.in_(codes))
    if ids:
        query = query.filter(data_schema.id.in_(ids))

    # we always store different level in different schema,
    # the level param is not useful now
    if level:
        try:
            # some schema has no level, just ignore it
            data_schema.level
            if type(level) == IntervalLevel:
                level = level.value
            query = query.filter(data_schema.level == level)
        except Exception as e:
            pass

    query = common_filter(query, data_schema=data_schema,
                          start_timestamp=start_timestamp,
                          end_timestamp=end_timestamp, filters=filters,
                          order=order, limit=limit, time_field=time_field)

    if return_type == 'df':
        df = pd.read_sql(query.statement, query.session.bind)
        if pd_is_not_null(df):
            if index:
                df = index_df(df, index=index, time_field=time_field)
        return df
    elif return_type == 'domain':
        return query.all()
    elif return_type == 'dict':
        return [item.__dict__ for item in query.all()]
def get_data(region: Region,
             data_schema,
             ids: List[str] = None,
             entity_ids: List[str] = None,
             entity_id: str = None,
             codes: List[str] = None,
             code: str = None,
             level: Union[IntervalLevel, str] = None,
             provider: Provider = Provider.Default,
             columns: List = None,
             col_label: dict = None,
             return_type: str = 'df',
             start_timestamp: Union[pd.Timestamp, str] = None,
             end_timestamp: Union[pd.Timestamp, str] = None,
             filters: List = None,
             session: Session = None,
             order=None,
             limit: int = None,
             index: Union[str, list] = None,
             time_field: str = 'timestamp',
             fun=None):
    """Query ``data_schema`` records for a region/provider.

    :param fun: optional sqlalchemy scalar expression; when given, the
        query selects it instead of columns (pair with return_type='func')
    :param return_type: 'func' (scalar), 'df', 'domain' or 'dict'

    When ``zvt_config['debug'] == 2`` each query stage is timed and logged
    (and domain/dict queries run under the ``profiled`` context manager).
    """
    assert data_schema is not None
    assert provider.value is not None
    assert provider in zvt_context.providers[region]

    # Wall-clock reference for the debug timing logs below.
    step1 = time.time()
    # Builds the literal format spec '{:>8,.4f}' used for cost logging.
    precision_str = '{' + ':>{},.{}f'.format(8, 4) + '}'

    if not session:
        session = get_db_session(region=region, provider=provider,
                                 data_schema=data_schema)

    # NOTE(review): eval on a format string — equivalent to
    # getattr(data_schema, time_field); getattr would be safer.
    time_col = eval('data_schema.{}'.format(time_field))

    if fun is not None:
        query = session.query(fun)
    elif columns:
        # support str columns
        if type(columns[0]) == str:
            columns_ = []
            for col in columns:
                assert isinstance(col, str)
                columns_.append(eval('data_schema.{}'.format(col)))
            columns = columns_

        # make sure we always fetch the timestamp column
        if time_col not in columns:
            columns.append(time_col)

        if col_label:
            # Apply labels where requested, keep other columns as-is.
            columns_ = []
            for col in columns:
                if col.name in col_label:
                    columns_.append(col.label(col_label.get(col.name)))
                else:
                    columns_.append(col)
            columns = columns_

        query = session.query(*columns)
    else:
        query = session.query(data_schema)

    if zvt_config['debug'] == 2:
        cost = precision_str.format(time.time() - step1)
        logger.debug("get_data query column: {}".format(cost))

    if entity_id is not None:
        query = query.filter(data_schema.entity_id == entity_id)
    if entity_ids is not None:
        query = query.filter(data_schema.entity_id.in_(entity_ids))
    if code is not None:
        query = query.filter(data_schema.code == code)
    if codes is not None:
        query = query.filter(data_schema.code.in_(codes))
    if ids is not None:
        query = query.filter(data_schema.id.in_(ids))

    # we always store different level in different schema,
    # the level param is not useful now
    # (the old level-filter branch was already commented out in source)
    # if level:
    #     try:
    #         # some schema has no level,just ignore it
    #         data_schema.level
    #         if type(level) == IntervalLevel:
    #             level = level.value
    #         query = query.filter(data_schema.level == level)
    #     except Exception as _:
    #         pass

    query = common_filter(query, data_schema=data_schema,
                          start_timestamp=start_timestamp,
                          end_timestamp=end_timestamp, filters=filters,
                          order=order, limit=limit, time_field=time_field)

    if zvt_config['debug'] == 2:
        cost = precision_str.format(time.time() - step1)
        logger.debug("get_data query common: {}".format(cost))

    if return_type == 'func':
        result = query.scalar()
        return result
    elif return_type == 'df':
        # 'id' becomes the DataFrame index directly from SQL.
        df = pd.read_sql(query.statement, query.session.bind,
                         index_col=['id'])
        if pd_is_not_null(df):
            if index:
                df = index_df(df, index=index, time_field=time_field)
        if zvt_config['debug'] == 2:
            cost = precision_str.format(time.time() - step1)
            logger.debug("get_data do query cost: {} type: {} size: {}".format(
                cost, return_type, len(df)))
        return df
    elif return_type == 'domain':
        # (windowed/yield_per variants were commented out in source)
        if zvt_config['debug'] == 2:
            with profiled():
                result = query.all()
        else:
            result = query.all()
        if zvt_config['debug'] == 2:
            cost = precision_str.format(time.time() - step1)
            res_cnt = len(result) if result else 0
            logger.debug(
                "get_data do query cost: {} type: {} limit: {} size: {}".
                format(cost, return_type, limit, res_cnt))
        return result
    elif return_type == 'dict':
        # (windowed/yield_per variants were commented out in source)
        if zvt_config['debug'] == 2:
            with profiled():
                result = [item.__dict__ for item in query.all()]
        else:
            result = [item.__dict__ for item in query.all()]
        if zvt_config['debug'] == 2:
            cost = precision_str.format(time.time() - step1)
            res_cnt = len(result) if result else 0
            logger.debug(
                "get_data do query cost: {} type: {} limit: {} size: {}".
                format(cost, return_type, limit, res_cnt))
        return result
def normalize_result_df(self, df):
    """Re-index the result frame and order it by (score, security_id)."""
    return index_df(df.reset_index()).sort_values(by=['score', 'security_id'])
def get_data(
    data_schema: Type[Mixin],
    ids: List[str] = None,
    entity_ids: List[str] = None,
    entity_id: str = None,
    codes: List[str] = None,
    code: str = None,
    level: Union[IntervalLevel, str] = None,
    provider: str = None,
    columns: List = None,
    col_label: dict = None,
    return_type: str = "df",
    start_timestamp: Union[pd.Timestamp, str] = None,
    end_timestamp: Union[pd.Timestamp, str] = None,
    filters: List = None,
    session: Session = None,
    order=None,
    limit: int = None,
    index: Union[str, list] = None,
    drop_index_col=False,
    time_field: str = "timestamp",
):
    """
    query data by the arguments

    :param data_schema:
    :param ids:
    :param entity_ids:
    :param entity_id:
    :param codes:
    :param code:
    :param level:
    :param provider:
    :param columns:
    :param col_label: dict with key(column), value(label)
    :param return_type: df, domain or dict. default is df
    :param start_timestamp:
    :param end_timestamp:
    :param filters:
    :param session:
    :param order:
    :param limit:
    :param index: index field name, str for single index, str list for multiple index
    :param drop_index_col: whether drop the col if it's in index, default False
    :param time_field:
    :return: results basing on return_type.
    """
    if "providers" not in data_schema.__dict__:
        # Fix: logging uses %-style lazy interpolation; the original "{}"
        # placeholder was emitted literally and data_schema never shown.
        logger.error("no provider registered for: %s", data_schema)
    if not provider:
        provider = data_schema.providers[0]

    if not session:
        session = get_db_session(provider=provider, data_schema=data_schema)

    # getattr is the safe, equivalent form of the former
    # eval("data_schema.{}".format(time_field)).
    time_col = getattr(data_schema, time_field)

    if columns:
        # Build a new list (resolving str names to schema attributes) so the
        # caller's list is never mutated by the timestamp append below.
        columns = [
            getattr(data_schema, col) if isinstance(col, str) else col
            for col in columns
        ]

        # make sure we always fetch the timestamp column
        if time_col not in columns:
            columns.append(time_col)

        if col_label:
            # Apply labels where requested, keep other columns as-is.
            columns = [
                col.label(col_label[col.name]) if col.name in col_label else col
                for col in columns
            ]

        query = session.query(*columns)
    else:
        query = session.query(data_schema)

    if entity_id:
        query = query.filter(data_schema.entity_id == entity_id)
    if entity_ids:
        query = query.filter(data_schema.entity_id.in_(entity_ids))
    if code:
        query = query.filter(data_schema.code == code)
    if codes:
        query = query.filter(data_schema.code.in_(codes))
    if ids:
        query = query.filter(data_schema.id.in_(ids))

    # we always store different level in different schema,the level param is not useful now
    if level:
        try:
            #: some schema has no level,just ignore it
            data_schema.level
            if type(level) == IntervalLevel:
                level = level.value
            query = query.filter(data_schema.level == level)
        except Exception:
            pass

    query = common_filter(
        query,
        data_schema=data_schema,
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
        filters=filters,
        order=order,
        limit=limit,
        time_field=time_field,
    )

    if return_type == "df":
        df = pd.read_sql(query.statement, query.session.bind)
        if pd_is_not_null(df):
            if index:
                df = index_df(df, index=index, drop=drop_index_col, time_field=time_field)
        return df
    elif return_type == "domain":
        return query.all()
    elif return_type == "dict":
        return [item.__dict__ for item in query.all()]