def generate_targets(self):
    """
    Derive the long/short target frames from the filter and score results.

    With both results present, the score rows passing the long/short
    threshold are looked up by the index of the filter rows whose flag is
    True/False. With only a score result the thresholds alone decide;
    otherwise the boolean filter result decides. When a portfolio selector
    is set and has targets of its own, the candidates are masked with
    ``self.in_block``. The outcome is normalized and stored in
    ``self.open_long_df`` and ``self.open_short_df``.
    """
    has_filter = pd_is_not_null(self.filter_result)
    has_score = pd_is_not_null(self.score_result)

    if has_filter and has_score:
        # long side: filter flag is True and score reaches the long threshold
        accepted = self.filter_result[self.filter_result.score]
        high_scores = self.score_result[
            self.score_result.score >= self.long_threshold]
        long_result = high_scores.loc[accepted.index, :]

        # short side: filter flag is False and score is under the short threshold
        rejected = self.filter_result[~self.filter_result.score]
        low_scores = self.score_result[
            self.score_result.score <= self.short_threshold]
        short_result = low_scores.loc[rejected.index, :]
    elif has_score:
        scores = self.score_result
        long_result = scores[scores.score >= self.long_threshold]
        short_result = scores[scores.score <= self.short_threshold]
    else:
        flags = self.filter_result
        long_result = flags[flags.score == True]  # noqa: E712
        short_result = flags[flags.score == False]  # noqa: E712

    # keep only the candidates that are inside the selected blocks
    selector = self.portfolio_selector
    if selector:
        if pd_is_not_null(selector.open_long_df):
            long_mask = self.in_block(long_result,
                                      target_type=TargetType.open_long)
            long_result = long_result[long_mask]
        if pd_is_not_null(selector.open_short_df):
            short_mask = self.in_block(short_result,
                                       target_type=TargetType.open_short)
            short_result = short_result[short_mask]

    self.open_long_df = self.normalize_result_df(long_result)
    self.open_short_df = self.normalize_result_df(short_result)
def get_trading_signals_figure(order_reader: OrderReader,
                               entity_id: str,
                               provider: str,
                               level):
    """
    Build a candlestick figure of one entity annotated with its orders.

    :param order_reader: reader whose ``data_df`` holds the orders; it is
        refreshed with ``move_on(timeout=0)`` before being used
    :param entity_id: id of the entity to draw
    :param provider: data provider passed to the technical factor
    :param level: kdata level passed to the technical factor
    :return: a ``go.Figure`` built from the factor's data and layout
    """
    entity_type, _, _ = decode_entity_id(entity_id)

    security_factor = TechnicalFactor(entity_type=entity_type,
                                      entity_ids=[entity_id],
                                      level=level,
                                      provider=provider)

    # generate the annotation df
    order_reader.move_on(timeout=0)
    df = order_reader.data_df
    # the reader may have no orders at all, so copy only when there is
    # something to copy instead of calling .copy() on None
    if df is not None:
        df = df.copy()
    if pd_is_not_null(df):
        df['value'] = df['order_price']
        df['flag'] = df['order_type'].apply(order_type_flag)
        df['color'] = df['order_type'].apply(order_type_color)

    data, layout = security_factor.draw(render=None,
                                        figures=go.Candlestick,
                                        annotation_df=df)

    return go.Figure(data=data, layout=layout)
def do_compute(self):
    """
    Run the transform step and then the accumulate step.

    ``pipe_df`` is replaced by the transformer output when there is data
    and a transformer. ``factor_df`` becomes the accumulator output when
    there is a pipe result and an accumulator; otherwise it is set to
    ``pipe_df``.
    """
    # stateless transformation
    source_ready = pd_is_not_null(self.data_df)
    if source_ready and self.transformer:
        self.pipe_df = self.transformer.transform(self.data_df)

    # stateful accumulation on top of the previously computed factor_df
    can_accumulate = pd_is_not_null(self.pipe_df) and self.accumulator
    self.factor_df = (
        self.accumulator.acc(self.pipe_df, self.factor_df)
        if can_accumulate
        else self.pipe_df
    )
def record(self, entity, start, end, size, timestamps):
    """
    Fetch bars for ``entity`` through ``get_bars`` and persist them.

    The bars are requested with ``fq_ref_date`` set to the current time.
    The first fetched bar is compared with the stored bar of the same
    timestamp; when the closes differ at two decimals, ``self.factor`` and
    ``self.last_timestamp`` are set so the stored history can be
    re-adjusted. Always returns None.
    """
    # only forward-adjusted data is wanted
    bar_kwargs = {
        'count': size,
        'unit': self.jq_trading_level,
        'fields': ['date', 'open', 'close', 'low', 'high', 'volume', 'money'],
    }
    if self.end_timestamp:
        bar_kwargs['end_dt'] = to_time_str(self.end_timestamp)
        bar_kwargs['include_now'] = False
    else:
        bar_kwargs['include_now'] = True

    df = get_bars(to_jq_entity_id(entity),
                  fq_ref_date=to_time_str(now_pd_timestamp()),
                  **bar_kwargs)

    if not pd_is_not_null(df):
        return None

    df['name'] = entity.name
    df.rename(columns={'money': 'turnover', 'date': 'timestamp'}, inplace=True)

    df['entity_id'] = entity.id
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['provider'] = 'joinquant'
    df['level'] = self.level.value
    df['code'] = entity.code

    # decide whether the previously saved forward-adjusted data must be recomputed
    first_bar = df.head(1)
    first_timestamp = first_bar['timestamp'][0]
    stored_df = get_kdata(entity_id=entity.id,
                          provider=self.provider,
                          start_timestamp=first_timestamp,
                          end_timestamp=first_timestamp,
                          limit=1,
                          level=self.level)
    if pd_is_not_null(stored_df):
        stored_close = stored_df.iloc[0, :]['close']
        fetched_close = first_bar['close'][0]
        # a different close at the same timestamp means the adjustment changed
        if round(stored_close, 2) != round(fetched_close, 2):
            self.factor = fetched_close / stored_close
            self.last_timestamp = pd.Timestamp(first_timestamp)

    # daily and longer levels use a day-formatted id, shorter ones ISO8601
    if self.level >= IntervalLevel.LEVEL_1DAY:
        id_time_fmt = TIME_FORMAT_DAY
    else:
        id_time_fmt = TIME_FORMAT_ISO8601

    df['id'] = df[['entity_id', 'timestamp']].apply(
        lambda row: "{}_{}".format(row['entity_id'],
                                   to_time_str(row['timestamp'], fmt=id_time_fmt)),
        axis=1)

    df_to_db(df=df,
             data_schema=self.data_schema,
             provider=self.provider,
             force_update=self.force_update)

    return None
def register_data_listener(self, listener):
    """
    Add ``listener`` to ``self.data_listeners`` unless it is already there.

    Whenever ``self.data_df`` holds data, the listener is notified through
    ``on_data_loaded`` right away.
    """
    already_registered = listener in self.data_listeners
    if not already_registered:
        self.data_listeners.append(listener)

    # notify it once after registered
    if not pd_is_not_null(self.data_df):
        return
    listener.on_data_loaded(self.data_df)
def on_finish_entity(self, entity):
    """
    After an entity is finished, fix rows whose timestamp is only a placeholder.

    Rows of ``self.data_schema`` whose ``timestamp`` equals ``report_date``
    (from 2005-01-01 on) are selected. For ``FinanceFactor`` every such row
    is handed to ``self.fill_timestamp_with_jq``. For other schemas the
    timestamp is first looked up in ``FinanceFactor`` rows of the same
    entity whose timestamp differs from the report date; rows without a
    match fall back to ``self.fill_timestamp_with_jq``.

    Does nothing beyond the parent hook when ``self.fetch_jq_timestamp`` is
    falsy.
    """
    super().on_finish_entity(entity)

    if not self.fetch_jq_timestamp:
        return

    # fill the timestamp for report published date
    the_data_list = get_data(
        data_schema=self.data_schema,
        provider=self.provider,
        entity_id=entity.id,
        order=self.data_schema.timestamp.asc(),
        return_type='domain',
        session=self.session,
        filters=[
            self.data_schema.timestamp == self.data_schema.report_date,
            self.data_schema.timestamp >= to_pd_timestamp('2005-01-01')
        ])
    if the_data_list:
        if self.data_schema == FinanceFactor:
            for the_data in the_data_list:
                self.fill_timestamp_with_jq(entity, the_data)
        else:
            # candidate timestamps: FinanceFactor rows already carrying a
            # timestamp different from the report date, limited to the
            # report_date range of the rows to fix (the list is ordered by
            # timestamp ascending, and here timestamp == report_date)
            df = get_finance_factor(
                entity_id=entity.id,
                columns=[
                    FinanceFactor.timestamp, FinanceFactor.report_date,
                    FinanceFactor.id
                ],
                filters=[
                    FinanceFactor.timestamp != FinanceFactor.report_date,
                    FinanceFactor.timestamp >= to_pd_timestamp('2005-01-01'),
                    FinanceFactor.report_date >= the_data_list[0].report_date,
                    FinanceFactor.report_date <= the_data_list[-1].report_date,
                ])

            if pd_is_not_null(df):
                # NOTE(review): the return value is ignored, so this relies
                # on index_df re-indexing df in place - confirm
                index_df(df, index='report_date', time_field='report_date')

            for the_data in the_data_list:
                if (df is not None) and (
                        not df.empty) and the_data.report_date in df.index:
                    the_data.timestamp = df.at[the_data.report_date,
                                               'timestamp']
                    self.logger.info(
                        'db fill {} {} timestamp:{} for report_date:{}'.
                        format(self.data_schema, entity.id,
                               the_data.timestamp, the_data.report_date))
                    # committed per row
                    self.session.commit()
                else:
                    # no local match for this report_date: fall back to the
                    # jq based fill (a 'waiting jq fill ...' log line was
                    # disabled here)
                    self.fill_timestamp_with_jq(entity, the_data)
def record(self, entity, start, end, size, timestamps):
    """
    Query the fund's stock portfolio published since ``start`` and persist it.

    The result of ``finance.FUND_PORTFOLIO_STOCK`` is expected to carry,
    among others, the columns ``id``, ``code``, ``period_end``,
    ``pub_date``, ``report_type``, ``symbol``, ``name`` and ``proportion``
    (e.g. ``symbol=601318, name=中国平安, proportion=7.09``). Always
    returns None.
    """
    portfolio = finance.FUND_PORTFOLIO_STOCK
    q = query(portfolio).filter(portfolio.pub_date >= start).filter(
        portfolio.code == entity.code)
    df = finance.run_query(q)

    if not pd_is_not_null(df):
        return None

    df['timestamp'] = pd.to_datetime(df['pub_date'])

    df.rename(columns={'symbol': 'stock_code', 'name': 'stock_name'},
              inplace=True)
    # proportion is scaled by 0.01 (7.09 becomes 0.0709)
    df['proportion'] = df['proportion'] * 0.01

    df = portfolio_relate_stock(df, entity)

    df['stock_id'] = df['stock_code'].apply(china_stock_code_to_id)
    id_parts = df[['entity_id', 'stock_id', 'pub_date', 'id']]
    df['id'] = id_parts.apply(lambda row: '_'.join(row.astype(str)), axis=1)
    df['report_date'] = pd.to_datetime(df['period_end'])
    df['report_period'] = df['report_type'].apply(jq_to_report_period)

    df_to_db(df=df,
             data_schema=self.data_schema,
             provider=self.provider,
             force_update=self.force_update)

    self.logger.info(f"persist etf {entity.code} portfolio success")

    return None
def draw(self,
         render='html',
         file_name=None,
         width=None,
         height=None,
         title=None,
         keep_ui_state=True,
         annotation_df=None,
         target_type: TargetType = TargetType.open_long):
    """
    Draw the targets of ``target_type`` as a table.

    Nothing is drawn when the selected target frame is missing or empty,
    or when ``target_type`` is neither ``open_long`` nor ``open_short``.

    :param render: passed through to ``Drawer.draw_table``
    :param file_name: passed through to ``Drawer.draw_table``
    :param width: passed through to ``Drawer.draw_table``
    :param height: passed through to ``Drawer.draw_table``
    :param title: passed through to ``Drawer.draw_table``
    :param keep_ui_state: passed through to ``Drawer.draw_table``
    :param annotation_df: accepted for interface compatibility; not used
    :param target_type: which target frame to draw
    """
    if target_type == TargetType.open_long:
        source_df = self.open_long_df
    elif target_type == TargetType.open_short:
        source_df = self.open_short_df
    else:
        source_df = None

    # check before copying: the target frames are None until targets have
    # been generated, and .copy() on None would raise
    if not pd_is_not_null(source_df):
        return

    df = source_df.copy()
    df['target_type'] = target_type.value

    drawer = Drawer(NormalData(df=df))
    drawer.draw_table(render=render,
                      file_name=file_name,
                      width=width,
                      height=height,
                      title=title,
                      keep_ui_state=keep_ui_state)
def on_finish(self):
    """
    Fill the missing ``rights_raising_fund`` of dividend-financing rows.

    For every row of the recorded entities whose ``rights_raising_fund``
    is NULL, the rights issue details from the row's timestamp to the end
    of that year are summed and written back, committing after each
    update. The parent hook runs last.
    """
    last_year = str(now_pd_timestamp().year)
    entity_codes = [entity.code for entity in self.entities]

    pending = get_dividend_financing(
        provider=self.provider,
        codes=entity_codes,
        return_type='domain',
        session=self.session,
        filters=[DividendFinancing.rights_raising_fund.is_(None)],
        end_timestamp=last_year)

    for record in pending:
        year_end = "{}-12-31".format(record.timestamp.year)
        detail_df = get_rights_issue_detail(
            provider=self.provider,
            entity_id=record.entity_id,
            columns=[
                RightsIssueDetail.timestamp,
                RightsIssueDetail.rights_raising_fund
            ],
            start_timestamp=record.timestamp,
            end_timestamp=year_end)
        if not pd_is_not_null(detail_df):
            continue
        record.rights_raising_fund = detail_df['rights_raising_fund'].sum()
        self.session.commit()

    super().on_finish()
def get_traders() -> List[str]:
    """
    Return the trader names found by grouping ``SimAccount`` on
    ``trader_name`` with provider 'zvt'; an empty list when nothing is found.
    """
    trader_df = get_group(provider='zvt',
                          data_schema=SimAccount,
                          column=SimAccount.trader_name,
                          group_func=None)
    if not pd_is_not_null(trader_df):
        return []
    return trader_df['trader_name'].tolist()
def to_annotations(annotation_df: pd.DataFrame):
    """
    Convert an annotation frame into a list of annotation dicts.

    annotation_df format:
                                    value    flag    color
    entity_id    timestamp

    :param annotation_df: frame indexed by (entity_id, timestamp) with the
        columns ``value`` and ``flag`` and optionally ``color``
    :return: one dict per row; empty list when the frame is null
    :rtype: list
    """
    annotations = []
    if not pd_is_not_null(annotation_df):
        return annotations

    for _trace_name, trace_df in annotation_df.groupby(level=0):
        if not pd_is_not_null(trace_df):
            continue

        for (_, timestamp), row in trace_df.iterrows():
            # rows without a color column get the default background
            color = row['color'] if 'color' in row else '#ec0000'
            value = round(row['value'], 2)

            annotations.append({
                'x': timestamp,
                'y': value,
                'xref': 'x',
                'yref': 'y',
                'text': row['flag'],
                'showarrow': True,
                'align': 'center',
                'arrowhead': 2,
                'arrowsize': 1,
                'arrowwidth': 2,
                'ax': -10,
                'ay': -30,
                'bordercolor': '#c7c7c7',
                'borderwidth': 1,
                'bgcolor': color,
                'opacity': 0.8,
            })
    return annotations
def acc(self, input_df, acc_df) -> pd.DataFrame:
    """
    Accumulate the state columns for the rows not yet present in ``acc_df``.

    Per entity, rows are walked in order; ``get_current_state`` yields the
    state of each row from the row, its predecessor and the previous
    state. The state is stored in ``tmp_bi_state``. When the state flips
    between two non-zero values, the previous row is flagged: ``tmp_di``
    for a switch to 1, ``tmp_ding`` for a switch to -1.

    :param input_df: new data indexed by (entity, time) with an ``id`` column
    :param acc_df: previously accumulated result, may be null
    :return: ``acc_df`` extended with the newly computed rows (restricted
        to the columns both frames share) and sorted by index, or the
        computed ``input_df`` when ``acc_df`` is null
    """
    if pd_is_not_null(acc_df):
        # skip the rows which have been computed already
        input_df = input_df[~input_df['id'].isin(acc_df['id'])]

    # work on a copy so the caller's frame is not modified
    input_df = input_df.copy()

    for entity_id, df in input_df.groupby(level=0):
        pre_index = None
        pre_item = None
        current_state = 0
        pre_state = 0

        for index, item in df.iterrows():
            if pre_item is not None:
                current_state = get_current_state(item, pre_item,
                                                  current_state)

            input_df.loc[index, 'tmp_bi_state'] = current_state

            if (current_state != 0
                    and pre_state != 0) and current_state != pre_state:
                # -1 -> 1
                if current_state == 1:
                    input_df.loc[pre_index, 'tmp_di'] = True
                # 1 -> -1
                if current_state == -1:
                    input_df.loc[pre_index, 'tmp_ding'] = True

            pre_index = index
            pre_item = item
            pre_state = current_state

        self.logger.info('finish calculating :{}'.format(entity_id))

    if pd_is_not_null(acc_df):
        if pd_is_not_null(input_df):
            # select the shared columns with an ordered list: indexing a
            # frame with a set is order-dependent and rejected by newer pandas
            shared_columns = [
                col for col in acc_df.columns if col in input_df.columns
            ]
            df = input_df[shared_columns]

            # pd.concat works on every pandas version, DataFrame.append does not
            acc_df = pd.concat([acc_df, df])
            acc_df = acc_df.sort_index(level=[0, 1])
    else:
        acc_df = input_df

    return acc_df
def run(self):
    """
    Combine the factor results and generate the targets.

    Filter factors are reduced to one boolean ``score`` column each
    (row-wise AND over a multi-column result) and then AND-ed together
    into ``self.filter_result``. Score factors are reduced to one
    ``score`` column each (row-wise mean) and then summed into
    ``self.score_result``. Finally ``self.generate_targets()`` is called.

    :raises Exception: when a factor has no result data
    """
    if self.filter_factors:
        musts = []
        for factor in self.filter_factors:
            df = factor.get_result_df()

            if not pd_is_not_null(df):
                raise Exception('no data for factor:{},{}'.format(
                    factor.factor_name, factor))

            if len(df.columns) > 1:
                # every column must be True; "and" is not an aggregation
                # name pandas can resolve, all() is the row-wise logical AND
                s = df.all(axis="columns")
                s.name = 'score'
                musts.append(s.to_frame(name='score'))
            else:
                # NOTE: this renames the column on the frame returned by
                # the factor itself
                df.columns = ['score']
                musts.append(df)

        self.filter_result = list(accumulate(musts,
                                             func=operator.__and__))[-1]

    if self.score_factors:
        scores = []
        for factor in self.score_factors:
            df = factor.get_result_df()

            if not pd_is_not_null(df):
                # the placeholder used to be '{]', which made format() fail
                # with ValueError instead of producing this message
                raise Exception('no data for factor:{},{}'.format(
                    factor.factor_name, factor))

            if len(df.columns) > 1:
                s = df.agg("mean", axis="columns")
                s.name = 'score'
                scores.append(s.to_frame(name='score'))
            else:
                df.columns = ['score']
                scores.append(df)

        self.score_result = list(accumulate(scores,
                                            func=operator.__add__))[-1]

    self.generate_targets()
def get_entity_ids(entity_type='stock', exchanges=['sz', 'sh'], codes=None, provider='eastmoney'):
    """
    Return the ``entity_id`` values of the entities matching the arguments.

    :return: list of entity ids, or None when no entity is found
    """
    entity_df = get_entities(entity_type=entity_type,
                             exchanges=exchanges,
                             codes=codes,
                             provider=provider)
    if not pd_is_not_null(entity_df):
        return None
    return entity_df['entity_id'].to_list()
def get_targets(self, timestamp, target_type: TargetType = TargetType.open_long) -> list:
    """
    Return the entity ids selected at ``timestamp`` for ``target_type``.

    :param timestamp: the time to look up; converted with ``to_pd_timestamp``
    :param target_type: ``open_long`` or ``open_short``
    :return: list of ``entity_id`` values; empty when there is no target
        frame, the timestamp is not in its index, or the target type is
        not one of the two supported ones
    """
    if target_type == TargetType.open_long:
        df = self.open_long_df
    elif target_type == TargetType.open_short:
        df = self.open_short_df
    else:
        df = None

    if not pd_is_not_null(df):
        return []

    # test membership with the same value that is used for the lookup, so
    # a match on the raw argument can no longer be followed by a KeyError
    the_timestamp = to_pd_timestamp(timestamp)
    if the_timestamp not in df.index:
        return []

    target_df = df.loc[[the_timestamp], :]
    return target_df['entity_id'].tolist()
def get_entity_ids(entity_type='stock', entity_schema: EntityMixin = None, exchanges=None, codes=None, provider=None):
    """
    Return the ``entity_id`` values of the entities matching the arguments.

    :return: list of entity ids, or None when no entity is found
    """
    entity_df = get_entities(entity_type=entity_type,
                             entity_schema=entity_schema,
                             exchanges=exchanges,
                             codes=codes,
                             provider=provider)
    if not pd_is_not_null(entity_df):
        return None
    return entity_df['entity_id'].to_list()
def load_window_df(self, provider, data_schema, window):
    """
    Load the latest ``window`` rows of ``data_schema`` for every entity.

    Each entity in ``self.entity_ids`` is queried newest-first with
    ``limit=window``; the non-null results are concatenated and sorted by
    the (category, time) index.

    :return: the combined frame, or None when no entity returned data
    """
    queried = (
        data_schema.query_data(provider=provider,
                               index=[self.category_field, self.time_field],
                               order=data_schema.timestamp.desc(),
                               entity_id=entity_id,
                               limit=window)
        for entity_id in self.entity_ids
    )
    frames = [entity_df for entity_df in queried if pd_is_not_null(entity_df)]

    if not frames:
        return None
    return pd.concat(frames).sort_index(level=[0, 1])
def init_entities(df, entity_type='stock', provider='exchange'):
    """
    Append the entities in ``df`` that are not stored yet.

    Duplicated ids inside ``df`` are dropped, ids already returned by
    ``get_entities`` are filtered out, and the remaining rows are appended
    to the entity table with ``to_sql``.
    """
    new_rows = df.drop_duplicates(subset=['id'])

    db_name = get_db_name(data_schema=get_entity_schema(entity_type))
    engine = get_db_engine(provider, db_name=db_name)

    entity_schema = get_entity_schema(entity_type)
    existing = get_entities(entity_type=entity_type,
                            columns=[entity_schema.id, entity_schema.code],
                            provider=provider)

    if pd_is_not_null(existing):
        already_stored = new_rows['id'].isin(existing['id'])
        new_rows = new_rows[~already_stored]

    new_rows.to_sql(entity_schema.__tablename__,
                    engine,
                    index=False,
                    if_exists='append')
def normalize(self):
    """
    Normalize data_df to

                                col1    col2    col3
    entity_id    index_field

    and split it per entity into ``self.df_list`` and
    ``self.entity_map_df``. With more than one entity frame and
    ``self.fill_index`` set, the frames are passed through
    ``fill_with_same_index``. Does nothing when ``data_df`` is null.
    """
    if not pd_is_not_null(self.data_df):
        return

    if not is_normal_df(self.data_df):
        self.data_df = normal_index_df(self.data_df)

    self.entity_ids = self.data_df.index.levels[0].to_list()

    for entity_id in self.entity_ids:
        entity_df = self.data_df.loc[(entity_id, )]
        self.df_list.append(entity_df)
        self.entity_map_df[entity_id] = entity_df

    needs_alignment = len(self.df_list) > 1 and self.fill_index
    if needs_alignment:
        self.df_list = fill_with_same_index(df_list=self.df_list)
def load_window_df(self, provider, data_schema):
    """
    Load up to ``self.computing_window`` latest rows per entity.

    When ``self.entity_ids`` is empty it is first resolved through
    ``get_entity_ids`` (provider 'eastmoney') from the entity type,
    exchanges and codes. Each entity is then queried newest-first from
    ``self.start_timestamp``; the non-null results are concatenated and
    sorted by the (category, time) index.

    :return: the combined frame, or None when no entity returned data
    """
    if not self.entity_ids:
        self.entity_ids = get_entity_ids(provider='eastmoney',
                                         entity_type=self.entity_type,
                                         exchanges=self.exchanges,
                                         codes=self.codes)

    queried = (
        get_data(provider=provider,
                 data_schema=data_schema,
                 start_timestamp=self.start_timestamp,
                 index=[self.category_field, self.time_field],
                 order=data_schema.timestamp.desc(),
                 entity_id=entity_id,
                 limit=self.computing_window)
        for entity_id in self.entity_ids
    )
    frames = [entity_df for entity_df in queried if pd_is_not_null(entity_df)]

    if not frames:
        return None
    return pd.concat(frames).sort_index(level=[0, 1])
def risky_company(the_date=None, income_yoy=-0.1, profit_yoy=-0.1, entity_ids=None):
    """
    Return the codes of companies showing at least one risk signal.

    Reports from the 130 days before ``the_date`` are checked for:

    - falling income or profit, or a low current/quick ratio
    - receivables + inventories + goodwill above half of total equity
    - accounts receivable above half of net profit (latest report per code)

    :param the_date: reference date; defaults to the current time,
        evaluated on every call
    :param income_yoy: operating income growth below this is risky
    :param profit_yoy: net profit growth at or below this is risky
    :param entity_ids: restrict the check to these entities
    :return: list of distinct codes, in no particular order
    """
    # resolve "now" at call time; a default computed in the signature
    # would be frozen at import time
    if the_date is None:
        the_date = now_time_str()

    codes = set()
    start_timestamp = to_pd_timestamp(the_date) - datetime.timedelta(130)

    # income falls, profit falls, low current ratio, low quick ratio
    finance_filter = or_(FinanceFactor.op_income_growth_yoy < income_yoy,
                         FinanceFactor.net_profit_growth_yoy <= profit_yoy,
                         FinanceFactor.current_ratio < 0.7,
                         FinanceFactor.quick_ratio < 0.5)
    df = FinanceFactor.query_data(entity_ids=entity_ids,
                                  start_timestamp=start_timestamp,
                                  filters=[finance_filter],
                                  columns=['code'])
    if pd_is_not_null(df):
        codes.update(df.code.tolist())

    # high receivables, high inventories, high goodwill
    balance_filter = (BalanceSheet.accounts_receivable + BalanceSheet.inventories + BalanceSheet.goodwill) \
        > BalanceSheet.total_equity / 2
    df = BalanceSheet.query_data(entity_ids=entity_ids,
                                 start_timestamp=start_timestamp,
                                 filters=[balance_filter],
                                 columns=['code'])
    if pd_is_not_null(df):
        codes.update(df.code.tolist())

    # receivables > profit * 1/2
    df1 = BalanceSheet.query_data(
        entity_ids=entity_ids,
        start_timestamp=start_timestamp,
        columns=[BalanceSheet.code, BalanceSheet.accounts_receivable])
    if pd_is_not_null(df1):
        df1 = df1.drop_duplicates(subset='code', keep='last')
        df1 = df1.set_index('code', drop=True).sort_index()

    df2 = IncomeStatement.query_data(
        entity_ids=entity_ids,
        start_timestamp=start_timestamp,
        columns=[IncomeStatement.code, IncomeStatement.net_profit])
    if pd_is_not_null(df2):
        df2 = df2.drop_duplicates(subset='code', keep='last')
        df2 = df2.set_index('code', drop=True).sort_index()

    if pd_is_not_null(df1) and pd_is_not_null(df2):
        # compare only codes present in both frames: pandas refuses to
        # compare Series whose labels are not identical
        shared_codes = df1.index.intersection(df2.index)
        receivable = df1.loc[shared_codes, 'accounts_receivable']
        net_profit = df2.loc[shared_codes, 'net_profit']
        codes.update(receivable[receivable > net_profit / 2].index.tolist())

    return list(codes)
def __init__(
        self,
        data_schema: object,
        entity_ids: List[str] = None,
        entity_type: str = 'stock',
        exchanges: List[str] = ['sh', 'sz'],
        codes: List[str] = None,
        the_timestamp: Union[str, pd.Timestamp] = None,
        start_timestamp: Union[str, pd.Timestamp] = None,
        end_timestamp: Union[str, pd.Timestamp] = None,
        columns: List = None,
        filters: List = None,
        order: object = None,
        limit: int = None,
        provider: str = 'eastmoney',
        level: Union[str, IntervalLevel] = IntervalLevel.LEVEL_1DAY,
        category_field: str = 'entity_id',
        time_field: str = 'timestamp',
        computing_window: int = 250,
        # child added arguments
        keep_all_timestamp: bool = False,
        fill_method: str = 'ffill',
        effective_number: int = 10,
        transformer: Transformer = None,
        accumulator: Accumulator = None,
        need_persist: bool = True,
        dry_run: bool = False) -> None:
    """
    Set up the factor on top of the parent's data loading.

    The arguments up to ``computing_window`` are forwarded positionally to
    the parent constructor. The remaining ones are stored on the instance.
    With ``need_persist`` set, the previously stored ``factor_df`` is
    loaded (only a window of it when ``dry_run`` is set) and ``data_df``
    is trimmed per entity to start at that entity's first stored factor
    timestamp. Finally the instance registers itself as its own data
    listener.

    NOTE(review): ``exchanges`` uses a mutable list as default; it is only
    passed on here, not modified.
    """
    super().__init__(data_schema, entity_ids, entity_type, exchanges, codes,
                     the_timestamp, start_timestamp, end_timestamp, columns,
                     filters, order, limit, provider, level, category_field,
                     time_field, computing_window)

    # the factor name is the lower-cased class name
    self.factor_name = type(self).__name__.lower()

    self.keep_all_timestamp = keep_all_timestamp
    self.fill_method = fill_method
    self.effective_number = effective_number
    self.transformer = transformer
    self.accumulator = accumulator

    self.need_persist = need_persist
    self.dry_run = dry_run

    # the computed factor result; this is what can be persisted
    self.factor_df: pd.DataFrame = None
    # intermediate result, not persisted
    self.pipe_df: pd.DataFrame = None
    # result_df is the standard df used for selecting targets
    self.result_df: pd.DataFrame = None

    # accumulate-style computing needs the previous factor_df, e.g. some
    # statistics over the whole market
    if self.need_persist:
        # when only computing the factor, reading a window of the stored
        # factor_df is enough
        if self.dry_run:
            self.factor_df = self.load_window_df(
                provider='zvt', data_schema=self.factor_schema)
        else:
            self.factor_df = get_data(
                provider='zvt',
                data_schema=self.factor_schema,
                start_timestamp=self.start_timestamp,
                index=[self.category_field, self.time_field])

        if pd_is_not_null(self.factor_df):
            dfs = []
            for entity_id, df in self.data_df.groupby(level=0):
                # entities with stored factors keep only the rows from the
                # first stored factor timestamp on; others stay untouched
                if entity_id in self.factor_df.index.levels[0]:
                    df = df[df.timestamp >= self.factor_df.loc[(
                        entity_id, )].index[0]]
                dfs.append(df)

            self.data_df = pd.concat(dfs)

    self.register_data_listener(self)
def do_compute(self):
    """
    Run the parent computation, then score.

    When ``pipe_df`` holds data and a scorer is set, ``result_df`` becomes
    the scorer's output for ``data_df``.
    """
    super().do_compute()

    if not pd_is_not_null(self.pipe_df):
        return
    if not self.scorer:
        return
    self.result_df = self.scorer.score(self.data_df)
def pre_compute(self):
    """Use ``data_df`` as ``pipe_df`` when no pipe result exists yet."""
    if pd_is_not_null(self.pipe_df):
        return
    self.pipe_df = self.data_df
def acc(self, input_df, acc_df) -> pd.DataFrame:
    """
    Accumulate how long the short MA has stayed above/below the long MA.

    For every row the state is 'up' when the short MA is above the long
    MA, 'down' when both MAs exist and it is not, and the row is skipped
    when an MA is missing. While the state holds, a signed counter
    (positive for 'up', negative for 'down') is stored in
    ``self.current_col`` and the compounded ``change_pct`` in
    ``current_pct``. When the state switches, the final count of the
    finished run is written to ``self.total_col`` on its last row.

    :param input_df: data indexed by (entity, time) holding the MA
        columns, ``timestamp`` and ``change_pct``; a ``score`` column is
        added to it in place
    :param acc_df: previously accumulated result, may be null; used to
        skip computed rows and to continue the counter of each entity
    :return: ``acc_df`` extended with the new rows and sorted by index, or
        the computed ``input_df`` when ``acc_df`` is null
    """
    short_ma_col = 'ma{}'.format(self.short_window)
    long_ma_col = 'ma{}'.format(self.long_window)

    input_df['score'] = input_df[short_ma_col] > input_df[long_ma_col]

    # drop the timestamps which have been computed already
    if pd_is_not_null(acc_df):
        dfs = []
        for entity_id, df in input_df.groupby(level=0):
            if entity_id in acc_df.index.levels[0]:
                df = df[df.timestamp > acc_df.loc[(entity_id, )].index[-1]]
            dfs.append(df)

        input_df = pd.concat(dfs, sort=False)

    for entity_id, df in input_df.groupby(level=0):
        count = 0
        pct = 1
        current_state = None
        pre_index = None
        # whether the stored counter of this entity has been merged in yet
        check_acc = False
        for index, item in df['score'].iteritems():
            # short MA is above the long MA
            if item:
                state = 'up'
            # short MA is not above the long MA and both MAs are available
            elif not pd.isna(df[short_ma_col][index]) and not pd.isna(
                    df[long_ma_col][index]):
                state = 'down'
            else:
                continue

            # count how many rows the state ('up', 'down') has lasted
            if current_state == state:
                if count > 0:
                    count = count + 1
                else:
                    count = count - 1
                if pct == 0:
                    pct = df['change_pct'][index]
                else:
                    pct = (1 + pct) * (1 + df['change_pct'][index]) - 1

            else:
                # the state switched: store the total of the previous state
                if count != 0:
                    input_df.loc[pre_index, self.total_col] = count

                current_state = state

                if current_state == 'up':
                    count = 1
                else:
                    count = -1
                pct = 0

                # incremental computing: continue from the stored result
                if pd_is_not_null(acc_df) and not check_acc:
                    if entity_id in acc_df.index.levels[0]:
                        acc_col_current = acc_df.loc[(
                            entity_id, )].iloc[-1][self.current_col]
                        if not pd.isna(acc_col_current):
                            # up
                            if acc_col_current > 0 and (current_state
                                                        == 'up'):
                                count = acc_col_current + 1
                            # down
                            elif acc_col_current < 0 and (current_state
                                                          == 'down'):
                                count = acc_col_current - 1
                            # state has changed: close the stored run
                            else:
                                pre_timestamp = acc_df.loc[(entity_id, ),
                                                           'timestamp'][-1]
                                acc_df.loc[(entity_id, pre_timestamp),
                                           self.total_col] = acc_col_current
                    check_acc = True

            # store the current state
            input_df.loc[index, self.current_col] = count
            input_df.loc[index, 'current_pct'] = pct

            pre_index = index

        self.logger.info('finish calculating :{}'.format(entity_id))

    if pd_is_not_null(acc_df):
        if pd_is_not_null(input_df):
            # NOTE(review): indexing with a set and DataFrame.append are
            # not supported by recent pandas releases - confirm the pinned
            # pandas version
            df = input_df[set(acc_df.columns) & set(input_df.columns)]
            acc_df = acc_df.append(df, sort=False)
            acc_df = acc_df.sort_index(level=[0, 1])
    else:
        acc_df = input_df

    return acc_df
def empty(self):
    """Return True when ``data_df`` holds no data."""
    has_data = pd_is_not_null(self.data_df)
    return not has_data
def record(self, entity, start, end, size, timestamps):
    """
    Compute and persist daily ETF valuations from the valuations of its stocks.

    For every day from ``start`` to ``end`` (now when ``end`` is falsy)
    the ETF's holdings and the stock valuations of that day are loaded.
    Per valuation column two figures are stored: ``<col>1``, weighted by
    the holding proportion, and ``<col>``, the simple arithmetic variant.
    Always returns None.
    """
    if not end:
        end = now_pd_timestamp()

    date_range = pd.date_range(start=start, end=end, freq='1D').tolist()
    for date in date_range:
        # the stocks held by the ETF and their proportions
        etf_stock_df = get_etf_stocks(code=entity.code,
                                      timestamp=date,
                                      provider=self.provider)
        if pd_is_not_null(etf_stock_df):
            all_pct = etf_stock_df['proportion'].sum()

            if all_pct >= 1.2 or all_pct <= 0.8:
                self.logger.error(
                    f'ignore etf:{entity.id} date:{date} proportion sum:{all_pct}'
                )
                # NOTE(review): break stops the whole date loop, not just
                # this day - confirm that is intended
                break

            etf_stock_df.set_index('stock_id', inplace=True)

            # valuation data of the individual stocks
            stock_valuation_df = StockValuation.query_data(
                entity_ids=etf_stock_df.index.to_list(),
                filters=[StockValuation.timestamp == date],
                index='entity_id')

            if pd_is_not_null(stock_valuation_df):
                stock_count = len(etf_stock_df)
                valuation_count = len(stock_valuation_df)

                self.logger.info(
                    f'etf:{entity.id} date:{date} stock count: {stock_count},'
                    f'valuation count:{valuation_count}')

                # share of held stocks without a valuation for this day
                pct = abs(stock_count - valuation_count) / stock_count

                if pct >= 0.2:
                    self.logger.error(
                        f'ignore etf:{entity.id} date:{date} pct:{pct}')
                    # NOTE(review): ends the whole date loop as well
                    break

                se = pd.Series({
                    'id': "{}_{}".format(entity.id, date),
                    'entity_id': entity.id,
                    'timestamp': date,
                    'code': entity.code,
                    'name': entity.name
                })
                for col in ['pe', 'pe_ttm', 'pb', 'ps', 'pcf']:
                    # PE=P/E
                    # the approach: treat each stock's ratio as its price,
                    # so its earning is 1 (-1 when the ratio is negative);
                    # the result is total price / total earning
                    value = 0
                    price = 0

                    # valuation weighted by the holding proportion
                    positive_df = stock_valuation_df[[
                        col
                    ]][stock_valuation_df[col] > 0]
                    positive_df['count'] = 1
                    positive_df = positive_df.multiply(
                        etf_stock_df["proportion"], axis="index")
                    if pd_is_not_null(positive_df):
                        value = positive_df['count'].sum()
                        price = positive_df[col].sum()

                    negative_df = stock_valuation_df[[
                        col
                    ]][stock_valuation_df[col] < 0]
                    if pd_is_not_null(negative_df):
                        negative_df['count'] = 1
                        negative_df = negative_df.multiply(
                            etf_stock_df["proportion"], axis="index")
                        value = value - negative_df['count'].sum()
                        price = price + negative_df[col].sum()

                    # NOTE(review): value can be 0 here (e.g. no stock has
                    # a non-zero ratio) - confirm how that should be stored
                    se[f'{col}1'] = price / value

                    # simple arithmetic valuation
                    positive_df = stock_valuation_df[col][
                        stock_valuation_df[col] > 0]
                    positive_count = len(positive_df)

                    negative_df = stock_valuation_df[col][
                        stock_valuation_df[col] < 0]
                    negative_count = len(negative_df)

                    value = positive_count - negative_count
                    price = positive_df.sum() + abs(negative_df.sum())

                    se[col] = price / value
                df = se.to_frame().T

                self.logger.info(df)

                df_to_db(df=df,
                         data_schema=self.data_schema,
                         provider=self.provider,
                         force_update=self.force_update)

    return None
def __init__(self,
             data_schema: Mixin,
             entity_schema: EntityMixin,
             provider: str = None,
             entity_provider: str = None,
             entity_ids: List[str] = None,
             exchanges: List[str] = None,
             codes: List[str] = None,
             the_timestamp: Union[str, pd.Timestamp] = None,
             start_timestamp: Union[str, pd.Timestamp] = None,
             end_timestamp: Union[str, pd.Timestamp] = now_pd_timestamp(),
             columns: List = None,
             filters: List = None,
             order: object = None,
             limit: int = None,
             level: IntervalLevel = IntervalLevel.LEVEL_1DAY,
             category_field: str = 'entity_id',
             time_field: str = 'timestamp',
             computing_window: int = None) -> None:
    """
    Store the query settings, resolve entities and columns, and load the data.

    :param data_schema: schema to read the data from
    :param entity_schema: schema used to resolve entity ids when
        ``entity_ids`` is not given
    :param provider: provider of the data
    :param entity_provider: provider used for resolving the entities
    :param entity_ids: explicit entity ids; resolved from ``exchanges``
        and ``codes`` when empty
    :param exchanges: exchanges used for resolving the entities
    :param codes: a list of codes, a JSON list string such as
        '["000001","000002"]', or a comma separated string
    :param the_timestamp: when set, used as both start and end timestamp
    :param start_timestamp: start of the time range
    :param end_timestamp: end of the time range
    :param columns: columns to read, as schema attributes or their names;
        the category and time columns are always added
    :param filters: extra query filters
    :param order: query order
    :param limit: query limit
    :param level: converted to ``IntervalLevel`` when truthy
    :param category_field: name of the category attribute of ``data_schema``
    :param time_field: name of the time attribute of ``data_schema``
    :param computing_window: stored on the instance
    """
    # NOTE(review): the default of end_timestamp is evaluated once, when
    # the function is defined, not on every call. It is left unchanged
    # because an explicit None must keep its current meaning - confirm
    # whether callers rely on either behaviour before changing it.
    self.logger = logging.getLogger(self.__class__.__name__)

    self.data_schema = data_schema
    self.entity_schema = entity_schema

    self.provider = provider
    self.entity_provider = entity_provider

    self.the_timestamp = the_timestamp
    if the_timestamp:
        self.start_timestamp = the_timestamp
        self.end_timestamp = the_timestamp
    else:
        self.start_timestamp = start_timestamp
        self.end_timestamp = end_timestamp

    self.start_timestamp = to_pd_timestamp(self.start_timestamp)
    self.end_timestamp = to_pd_timestamp(self.end_timestamp)

    self.exchanges = exchanges

    # codes may come in as a string: a JSON list or comma separated values
    if codes and isinstance(codes, str):
        codes = codes.replace(' ', '')
        if codes.startswith('[') and codes.endswith(']'):
            codes = json.loads(codes)
        else:
            codes = codes.split(',')

    self.codes = codes
    self.entity_ids = entity_ids

    # convert to standard entity ids
    if entity_schema and not self.entity_ids:
        df = get_entities(entity_schema=entity_schema,
                          provider=self.entity_provider,
                          exchanges=self.exchanges,
                          codes=self.codes)
        if pd_is_not_null(df):
            self.entity_ids = df['entity_id'].to_list()

    self.filters = filters
    self.order = order
    self.limit = limit

    if level:
        self.level = IntervalLevel(level)
    else:
        self.level = level

    self.category_field = category_field
    self.time_field = time_field
    self.computing_window = computing_window

    # plain attribute lookup: the field names are caller supplied, so they
    # must not be evaluated as code
    self.category_col = getattr(self.data_schema, self.category_field)
    self.time_col = getattr(self.data_schema, self.time_field)

    self.columns = columns

    # we store the data in a multiple index(category_column,timestamp) Dataframe
    if self.columns:
        # support column names given as str
        if isinstance(columns[0], str):
            self.columns = [getattr(data_schema, col) for col in columns]

        # always add category_column and time_field for normalizing
        self.columns = list(
            set(self.columns) | {self.category_col, self.time_col})

    self.data_listeners: List[DataListener] = []

    self.data_df: pd.DataFrame = None

    self.load_data()
def move_on(self,
            to_timestamp: Union[str, pd.Timestamp] = None,
            timeout: int = 20) -> object:
    """
    Fetch the data added since the last load, for continual realtime use.

    1) get the data that happened before ``to_timestamp``; if it is not
       set, get all the data, which means up to now
    2) if ``computing_window`` is set, the kept history of every entity
       is cut to that many rows for saving memory

    When there is no data yet, a plain ``load_data()`` is done instead.
    Otherwise every entity is polled until it returns new rows or
    ``timeout`` seconds have passed since the call started. Listeners get
    ``on_entity_data_changed`` per entity with new rows and
    ``on_data_changed`` once at the end if anything changed.

    :param to_timestamp: upper bound of the data to fetch
    :type to_timestamp: Union[str, pd.Timestamp]
    :param timeout: seconds to keep polling entities without new data
    :type timeout: int
    :return: None
    """
    if not pd_is_not_null(self.data_df):
        self.load_data()
        return

    start_time = time.time()

    # FIXME:we suppose history data should be there at first
    has_got = []
    dfs = []
    changed = False
    while True:
        for entity_id, df in self.data_df.groupby(level=0):
            if entity_id in has_got:
                continue

            recorded_timestamp = df['timestamp'].max()

            # calling move_on means the previous data has been handled,
            # only computing_window rows need to be kept
            if self.computing_window:
                df = df.iloc[-self.computing_window:]

            added_filter = [
                self.category_col == entity_id,
                self.time_col > recorded_timestamp
            ]
            if self.filters:
                filters = self.filters + added_filter
            else:
                filters = added_filter

            added_df = self.data_schema.query_data(
                provider=self.provider,
                columns=self.columns,
                end_timestamp=to_timestamp,
                filters=filters,
                level=self.level,
                index=[self.category_field, self.time_field])

            if pd_is_not_null(added_df):
                self.logger.info('entity_id:{},added:\n{}'.format(
                    entity_id, added_df))

                for listener in self.data_listeners:
                    listener.on_entity_data_changed(entity=entity_id,
                                                    added_data=added_df)
                # if got data,just move to another entity_id
                changed = True
                has_got.append(entity_id)
                # pd.concat works on every pandas version,
                # DataFrame.append does not
                df = pd.concat([df, added_df], sort=False)
                dfs.append(df)
            else:
                cost_time = time.time() - start_time
                if cost_time > timeout:
                    # if timeout,just add the old data
                    has_got.append(entity_id)
                    dfs.append(df)
                    self.logger.warning(
                        'category:{} level:{} getting data timeout,to_timestamp:{},now:{}'
                        .format(entity_id, self.level, to_timestamp,
                                now_pd_timestamp()))
                    continue

        if len(has_got) == len(self.data_df.index.levels[0]):
            break

    if dfs:
        self.data_df = pd.concat(dfs, sort=False)
        # sort_index returns a new frame; without the assignment the
        # merged data stayed unsorted
        self.data_df = self.data_df.sort_index(level=[0, 1])

        if changed:
            for listener in self.data_listeners:
                listener.on_data_changed(self.data_df)
def record(self, entity, start, end, size, timestamps):
    """
    Compute and persist daily ETF valuations from the valuations of its stocks.

    For every day from ``start`` to ``end`` (now when ``end`` is falsy)
    the ETF's holdings and the stock valuations of that day are loaded.
    Days without holdings or without valuations are skipped. Only the
    simple, unweighted valuation is supported: market caps differ so much
    that a cap-weighted figure would hardly reflect the whole. Always
    returns None.
    """
    if not end:
        end = now_pd_timestamp()

    date_range = pd.date_range(start=start, end=end, freq='1D').tolist()
    for date in date_range:
        # the stocks held by the ETF and their proportions
        etf_stock_df = get_etf_stocks(code=entity.code,
                                      timestamp=date,
                                      provider=self.provider)
        # check before touching any column: there may be no holdings at
        # all for this day
        if not pd_is_not_null(etf_stock_df):
            continue

        all_pct = etf_stock_df['proportion'].sum()
        if all_pct >= 1.1 or all_pct <= 0.9:
            self.logger.info(
                f'etf:{entity.id} date:{date} proportion sum:{all_pct}')

        etf_stock_df.set_index('stock_id', inplace=True)

        # valuation data of the individual stocks
        stock_valuation_df = StockValuation.query_data(
            entity_ids=etf_stock_df.index.to_list(),
            filters=[StockValuation.timestamp == date],
            index='entity_id')

        if not pd_is_not_null(stock_valuation_df):
            continue

        self.logger.info(
            f'etf:{entity.id} date:{date} stock count: {len(etf_stock_df)},valuation count:{len(stock_valuation_df)}'
        )

        se = pd.Series({
            'id': "{}_{}".format(entity.id, date),
            'entity_id': entity.id,
            'timestamp': date,
            'code': entity.code,
            'name': entity.name
        })
        # pe: static pe, pe_ttm: trailing pe, pb: price/book,
        # ps: price/sales, pcf: price/cash flow
        for col in ['pe', 'pe_ttm', 'pb', 'ps', 'pcf']:
            # PE=P/E
            # the approach: set every price to 1, sum up the implied
            # earnings, then divide
            positive_df = stock_valuation_df[col][
                stock_valuation_df[col] > 0]
            positive_count = len(positive_df)

            negative_df = stock_valuation_df[col][
                stock_valuation_df[col] < 0]
            negative_count = len(negative_df)

            # an empty side contributes no earning; dividing its zero
            # count by the NaN mean of an empty selection would turn the
            # whole result into NaN
            total_earning = 0.0
            if positive_count:
                total_earning += positive_count / positive_df.mean()
            if negative_count:
                total_earning += negative_count / negative_df.mean()

            if total_earning:
                se[col] = (positive_count + negative_count) / total_earning
            else:
                # no usable ratio at all, or the earnings cancel out
                se[col] = float('nan')

        df = se.to_frame().T

        self.logger.info(df)

        df_to_db(df=df,
                 data_schema=self.data_schema,
                 provider=self.provider,
                 force_update=self.force_update)

    return None