def kw_search(rlt_path: str,
              batch_action_uuid: str,
              action_uuid: str,
              save_doc: bool = True) -> List[Article]:
    rlt_articles: List[Article] = list()
    rlt_files = glob.glob(os.path.join(rlt_path, "*_kw_search_result.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"process file : {f}")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            kw, df_rlt = SeekingAlphaDataProcess.proc_kw_search(df_from_csv)
            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                if not pd.isna(dict_row.get("title", np.NaN)):
                    dict_article["title"] = dict_row["title"]
                if not pd.isna(dict_row.get("news_url", np.NaN)):
                    article_url = furl(dict_row["news_url"])
                    abs_url_str = f"{article_url.origin}{article_url.path}"
                    # Only strip query parameters for links inside Seeking Alpha; for other
                    # sites the query string may be part of what makes the url unique.
                    if abs_url_str.find("seekingalpha") > 0:
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    else:
                        dict_article["full_text_url"] = article_url.url
                        dict_article["uuid"] = md5_str(article_url.url)
                if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                    dict_article["publish_time"] = dict_row["publish_time"].to_pydatetime()
                if not pd.isna(dict_row.get("symbols", np.NaN)):
                    symbols = [x.strip() for x in dict_row.get("symbols").split(",")]
                    ls_related_symbols: List[FinancialInstrumentSymbol] = [
                        FinancialInstrumentSymbol(symbol=x) for x in symbols
                    ]
                    if save_doc:
                        # upsert each related symbol explicitly (a bare map() is lazy and never runs)
                        for related_symbol in ls_related_symbols:
                            upsert_document(related_symbol)
                    dict_article["related_symbols"] = ls_related_symbols
                dict_article["from_searching_phase"] = SearchingPhrase(searching_phrase=kw)
                dict_article["engine_site"] = "SeekingAlpha"
                dict_article["channel_in_site"] = "Search"
                dict_article["batch_action_uuid"] = batch_action_uuid
                article = Article(**dict_article)
                rlt_articles.append(article)
                if save_doc:
                    upsert_document(article, True)
    return rlt_articles
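# Hedged sketch (hypothetical helper, not part of the original module): the url-canonicalisation
# rule used in kw_search above, pulled out for clarity. Query parameters are dropped only for
# seekingalpha links; other sites keep the full url, since their query string may be part of what
# makes the page unique. Assumes `furl` and `md5_str` behave exactly as used above.
def _canonical_url_and_uuid(raw_url: str) -> Tuple[str, str]:
    u = furl(raw_url)
    abs_url_str = f"{u.origin}{u.path}"
    if "seekingalpha" in abs_url_str:
        return abs_url_str, md5_str(abs_url_str)
    return u.url, md5_str(u.url)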
def __init__(self, experiment_name: str, template_workflow_yml_file: str,
             trial_params: Mapping, trial_uuid: str, experiment_uuid: str):
    super().__init__()
    self.workflow_cfg, self.workflow_context = load_mapping_from_file(
        template_workflow_yml_file)
    self.trial_params = trial_params
    for k, v in trial_params.items():
        changed_items = upsert_step_cfg_para(self.workflow_cfg,
                                             self.workflow_context, k, v)
        if changed_items == 0:
            logger.error(
                f"'{k}' is not available in the config settings, check the search space file"
            )
    self.experiment_name = experiment_name
    self.trial_uuid = trial_uuid
    self.experiment_uuid = experiment_uuid
    self.is_trial_finished = False
    self.cfg_hash = md5_str(
        HashCalculation.value_to_hash_str(self.workflow_cfg))
    self._trial_finished_future = asyncio.get_event_loop().create_future()
    # ---- [laigen 2020.02.29] colab training no longer goes through kafka; removed for now. ----
    # colab_side_env = TrialColabSideEnv(self.cfg_hash, self.trial_uuid)
    # self._colab_side_env_pk = colab_side_env.pk
    # ---- end ----
    self.metrics_reporter = TrailMetricsArcticReporter(
        self.experiment_name, self.experiment_uuid, self.trial_uuid)
    self.latest_epoch = None
    self.final_val = None
def upsert_yahoo_recommend(symbol: str):
    yahoo_ts = yFinanceData()
    try:
        df = yahoo_ts.recommendations(symbol_to_yahoo_symbol(symbol))
        logger.info(f"upsert_yahoo_recommend : {symbol}-{df.shape}")
        for t, row in df.iterrows():
            firm = GlobalEntity(entity_id=row["Firm"])
            dict_info = {
                "t": t,
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "firm": firm,
                "uid": md5_str(f"{t.isoformat()}-{symbol}-{row['Firm']}"),
                "to_grade": None if row["To Grade"] == "" else row["To Grade"],
                "from_grade": None if row["From Grade"] == "" else row["From Grade"],
                "action": None if row["Action"] == "" else row["Action"]
            }
            upsert_document(firm)
            upsert_document(FinancialInstrumentRecommendInYahoo(**dict_info), False)
    except Exception as ex:
        logger.error(ex)
def upsert_splits(symbol: str, start_t: datetime):
    yahoo_ts = yFinanceData()
    try:
        df = yahoo_ts.splits(symbol_to_yahoo_symbol(symbol), start=start_t)
        logger.info(f"upsert_splits : {symbol}-{df.shape}")
        for t, row in df.iterrows():
            dict_info = {
                "t": t,
                "split": row["Stock Splits"],
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "uid": md5_str(f"{t.isoformat()}-{symbol}")
            }
            upsert_document(FinancialInstrumentDividends(**dict_info))
    except Exception as ex:
        logger.error(ex)
def kw_news_search(rlt_path: str,
                   batch_action_uuid: str,
                   action_uuid: str,
                   save_doc: bool = True) -> List[Article]:
    rlt_articles: List[Article] = list()
    rlt_files = glob.glob(os.path.join(rlt_path, "*_index_items.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            try:
                df_from_csv = pd.read_csv(f, header=0, parse_dates=["publish_time"])
            except Exception:
                continue
            kw, df_rlt = GoogleNewsSearchProcess.proc_news_search_data(df_from_csv)
            if df_rlt is None:
                continue
            logger.info(f"process file : {f} - {df_rlt.shape}")
            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                if not pd.isna(dict_row.get("news_title", np.NaN)):
                    dict_article["title"] = dict_row["news_title"]
                if not pd.isna(dict_row.get("url", np.NaN)):
                    dict_article["full_text_url"] = dict_row["url"]
                if not pd.isna(dict_row.get("news_abstract", np.NaN)):
                    dict_article["abstract"] = dict_row["news_abstract"]
                if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                    dict_article["publish_time"] = dict_row["publish_time"].to_pydatetime()
                search_phrase_in_db = SearchingPhrase.objects(searching_phrase=kw).first()
                if search_phrase_in_db is not None:
                    dict_article["from_searching_phase"] = search_phrase_in_db
                    if search_phrase_in_db.related_symbols is not None:
                        dict_article["related_symbols"] = search_phrase_in_db.related_symbols
                else:
                    dict_article["from_searching_phase"] = SearchingPhrase(searching_phrase=kw)
                dict_article["engine_site"] = "google_news"
                if not pd.isna(dict_row.get("publisher", np.NaN)):
                    dict_article["channel_in_site"] = dict_row["publisher"]
                    dict_article["uuid"] = md5_str(
                        f"{dict_article['channel_in_site']}-{dict_article['title']}")
                else:
                    # For now, news items without a publisher are not persisted.
                    continue
                dict_article["batch_action_uuid"] = batch_action_uuid
                article = Article(**dict_article)
                rlt_articles.append(article)
                if save_doc:
                    upsert_document(article, True)
    return rlt_articles
def __init__(self, model_name: str, model_inst_gid: str,
             template_workflow_yml_file: str, changed_params: Mapping,
             pred_name: str):
    super().__init__()
    self._model_name = model_name
    self._model_inst_gid = model_inst_gid
    self._pred_name = pred_name
    self.workflow_cfg, self.workflow_context = load_mapping_from_file(
        template_workflow_yml_file)
    for k, v in changed_params.items():
        changed_items = upsert_step_cfg_para(self.workflow_cfg,
                                             self.workflow_context, k, v)
        if changed_items == 0:
            logger.error(
                f"'{k}' is not available in the config settings, check the changed_params file"
            )
    self.cfg_hash = md5_str(
        HashCalculation.value_to_hash_str(self.workflow_cfg))
def upsert_yahoo_symbol_holder(symbol: str):
    import yfinance as yf

    symbol_data = yf.Ticker(symbol_to_yahoo_symbol(symbol))
    try:
        df = symbol_data.institutional_holders
        logger.info(f"upsert_yahoo_symbol_holder : {symbol} {df.shape}")
        for idx, row in df.iterrows():
            t = row["Date Reported"]
            dict_info = {
                "t": t,
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "holder": GlobalEntity(entity_id=row["Holder"]),
                "shares": row["Shares"],
                "value": row["Value"],
                "percentage_out": row["% Out"],
                "uid": md5_str(f"{t.isoformat()}-{symbol}-{row['Holder']}")
            }
            upsert_document(FinancialInstrumentHolders(**dict_info), True)
    except Exception as ex:
        logger.error(f"API exception when getting data: {ex}")
def upsert_daily_market_data(symbol: str, start_t: datetime):
    yahoo_ts = yFinanceData()
    df = yahoo_ts.history(symbol_to_yahoo_symbol(symbol), start=start_t)
    logger.info(f"upsert_daily_market_data : {symbol}-{df.shape}")
    # 52-week high/low approximated with a rolling window of 244 trading days (~one year).
    df["fifty_two_week_high"] = df["Close"].rolling(window=244).max()
    df["fifty_two_week_low"] = df["Close"].rolling(window=244).min()
    for t, row in df.iterrows():
        dict_info = {
            "t": t,
            "open": row["Open"],
            "high": row["High"],
            "low": row["Low"],
            "close": row["Close"],
            "volume": row["Volume"],
            "dividends": row["Dividends"],
            "splits": row["Stock Splits"],
            "fifty_two_week_low": row["fifty_two_week_low"],
            "fifty_two_week_high": row["fifty_two_week_high"],
            "symbol": FinancialInstrumentSymbol(symbol=symbol),
            "uid": md5_str(f"{t.isoformat()}-{symbol}")
        }
        upsert_document(FinancialInstrumentDailyMarketData(**dict_info))
def upsert_yahoo_earning_analysis(symbol: str):
    """See https://finance.yahoo.com/quote/MSFT/analysis?p=MSFT"""
    import yfinance as yf

    symbol_data = yf.Ticker(symbol_to_yahoo_symbol(symbol))
    try:
        df = symbol_data.calendar.T
        logger.info(f"upsert_yahoo_earning_analysis : {symbol} {df.shape}")
        for idx, row in df.iterrows():
            t = row["Earnings Date"]
            dict_info = {
                "t": t,
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "earnings_average": _convert_non_t_v(row["Earnings Average"]),
                "earnings_low": _convert_non_t_v(row["Earnings Low"]),
                "earnings_high": _convert_non_t_v(row["Earnings High"]),
                "revenue_average": _convert_non_t_v(row["Revenue Average"]),
                "revenue_low": _convert_non_t_v(row["Revenue Low"]),
                "revenue_high": _convert_non_t_v(row["Revenue High"]),
                "uid": md5_str(f"{t.isoformat()}-{symbol}")
            }
            upsert_document(FinancialInstrumentCalendarFromYahoo(**dict_info), True)
    except Exception as ex:
        logger.error(f"API exception when getting data: {ex}")
def kw_search(
    rlt_path: str,
    batch_action_uuid: str,
    action_uuid: str,
    save_doc: bool = True
) -> Tuple[List[Article], List[UserInTwitter]]:
    rlt_articles: List[Article] = list()
    rlt_posters: List[UserInTwitter] = list()

    rlt_files = glob.glob(os.path.join(rlt_path, "*_posts.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"process file : {f}")
            df_from_csv = pd.read_csv(f, header=0,
                                      parse_dates=["post_time", "extract_t"])
            kw, df_rlt = TwitterDataProcess.proc_posts(df_from_csv)
            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                if not pd.isna(dict_row.get("post_content", np.NaN)):
                    dict_article["title"] = dict_row["post_content"]
                if not pd.isna(dict_row.get("post_content_detail", np.NaN)):
                    dict_article["abstract"] = dict_row["post_content_detail"]
                if not pd.isna(dict_row.get("post_additional_url", np.NaN)):
                    dict_article["full_text_url"] = dict_row["post_additional_url"]
                if not pd.isna(dict_row.get("post_image_url", np.NaN)):
                    dict_article["related_image_url"] = dict_row["post_image_url"]
                if dict_row.get("post_time", pd.NaT) is not pd.NaT:
                    dict_article["publish_time"] = dict_row["post_time"].to_pydatetime()
                if not pd.isna(dict_row.get("poster_name", np.NaN)):
                    poster = UserInTwitter(user_id=dict_row["poster_id"],
                                           name=dict_row["poster_name"])
                else:
                    poster = UserInTwitter(user_id=dict_row["poster_id"])
                dict_article["twitter_poster"] = poster
                # The uuid is derived from poster_id + post_time.
                dict_article["uuid"] = md5_str(
                    f"{poster.user_id}|{dict_article['publish_time'].isoformat()}")
                if not pd.isna(dict_row.get("comments", np.NaN)) \
                        or not pd.isna(dict_row.get("retweet", np.NaN)) \
                        or not pd.isna(dict_row.get("like", np.NaN)):
                    extra_data = TweetExtra()
                    if not pd.isna(dict_row.get("comments", np.NaN)):
                        extra_data.comments = int(dict_row["comments"])
                    if not pd.isna(dict_row.get("retweet", np.NaN)):
                        extra_data.retweet = int(dict_row["retweet"])
                    if not pd.isna(dict_row.get("like", np.NaN)):
                        extra_data.like = int(dict_row["like"])
                    dict_article["tweet_extra"] = extra_data
                search_phrase_in_db = SearchingPhrase.objects(searching_phrase=kw).first()
                if search_phrase_in_db is not None:
                    dict_article["from_searching_phase"] = search_phrase_in_db
                    if search_phrase_in_db.related_symbols is not None:
                        dict_article["related_symbols"] = search_phrase_in_db.related_symbols
                else:
                    dict_article["from_searching_phase"] = SearchingPhrase(searching_phrase=kw)
                dict_article["engine_site"] = "Twitter"
                dict_article["batch_action_uuid"] = batch_action_uuid
                article = Article(**dict_article)
                rlt_articles.append(article)
                if save_doc:
                    upsert_document(article, True)

    rlt_files = glob.glob(os.path.join(rlt_path, "*_follower_following.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"process file : {f}")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            t, kw, df_rlt = TwitterDataProcess.proc_follower_following_info(
                df_from_csv, "search_phase")
            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                twitter_user = UserInTwitter(user_id=dict_row["poster_id"])
                if not pd.isna(dict_row.get("following", np.NaN)):
                    twitter_user.following = int(dict_row["following"])
                if not pd.isna(dict_row.get("follower", np.NaN)):
                    twitter_user.follower = int(dict_row["follower"])
                twitter_user.mtime = t
                twitter_user.batch_action_uuid = batch_action_uuid
                rlt_posters.append(twitter_user)
                if save_doc:
                    upsert_document(twitter_user, True)
    return rlt_articles, rlt_posters
def symbol_analysis_report(rlt_path: str,
                           batch_action_uuid: str,
                           action_uuid: str,
                           save_doc: bool = True) -> List[Article]:
    rlt_articles: List[Article] = list()
    rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_analysis.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"process file : {f}")
            df_from_csv = pd.read_csv(f, header=0)
            df_rlt = EastMoneyDataProcess.proc_stock_analysis(df_from_csv)
            if df_rlt is not None:
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("article", np.NaN)):
                        dict_article["title"] = dict_row["article"]
                    if not pd.isna(dict_row.get("article_url", np.NaN)):
                        dict_article["full_text_url"] = dict_row["article_url"]
                        dict_article["uuid"] = md5_str(dict_row["article_url"])
                    if not pd.isna(dict_row.get("org_url", np.NaN)):
                        # For now the research firm is stored in the Seeking Alpha author
                        # field, which makes charting easier.
                        author_url: str = dict_row["org_url"]
                        author_id = dict_row["org"]
                        # NO author_name extracted!
                        author = AuthorInSeekingAlpha(author_id=author_id, url=author_url)
                        if not pd.isna(dict_row.get("reports_within_one_month", np.NaN)):
                            author.articles = dict_row["reports_within_one_month"]
                        dict_article["seeking_alpha_author"] = author
                        if save_doc:
                            upsert_document(author, True)
                    if not pd.isna(dict_row.get("rating", np.NaN)):
                        dict_article["rating"] = dict_row["rating"]
                    if not pd.isna(dict_row.get("rating_chg", np.NaN)):
                        dict_article["rating_change"] = dict_row["rating_chg"]
                    if not pd.isna(dict_row.get("pred_2020_ret", np.NaN)):
                        dict_article["pred_ret_this_yr"] = dict_row["pred_2020_ret"]
                    if not pd.isna(dict_row.get("pred_2020_pe", np.NaN)):
                        dict_article["pred_pe_this_yr"] = dict_row["pred_2020_pe"]
                    if not pd.isna(dict_row.get("pred_2021_pe", np.NaN)):
                        dict_article["pred_pe_next_yr"] = dict_row["pred_2021_pe"]
                    if not pd.isna(dict_row.get("pred_2021_ret", np.NaN)):
                        dict_article["pred_ret_next_yr"] = dict_row["pred_2021_ret"]
                    if dict_row.get("report_date", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row["report_date"].to_pydatetime()
                    symbol = dict_row["symbol"]
                    ls_related_symbols: List[FinancialInstrumentSymbol] = [
                        FinancialInstrumentSymbol(symbol=symbol)
                    ]
                    if save_doc:
                        ls_related_symbols[0].full_name = dict_row["symbol_name"]
                        upsert_document(ls_related_symbols[0])
                    dict_article["related_symbols"] = ls_related_symbols
                    dict_article["engine_site"] = "EastMoney"
                    dict_article["channel_in_site"] = "StockAnalysis"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)
    return rlt_articles
def get_hash_str(self) -> str:
    hp_dict = self.get_init_value_dict(True)
    return md5_str(HashCalculation.value_to_hash_str(hp_dict))
def author_detail(
    rlt_path: str,
    batch_action_uuid: str,
    action_uuid: str,
    save_doc: bool = True
) -> Tuple[List[AuthorInSeekingAlpha], List[Article]]:
    rlt_articles: List[Article] = list()
    rlt_authors: List[AuthorInSeekingAlpha] = list()

    # region author articles
    rlt_files = glob.glob(os.path.join(rlt_path, "*_author_articles.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"process file : {f}")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            author_id, df_rlt = SeekingAlphaDataProcess.proc_author_articles(df_from_csv)
            if df_rlt is not None:
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("title", np.NaN)):
                        dict_article["title"] = dict_row["title"]
                    if not pd.isna(dict_row.get("news_url", np.NaN)):
                        article_url = furl(dict_row["news_url"])
                        abs_url_str = f"{article_url.origin}{article_url.path}"
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row["publish_time"].to_pydatetime()
                    if not pd.isna(dict_row.get("comments", np.NaN)):
                        dict_article["seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                            comments=dict_row["comments"])
                    if not pd.isna(dict_row.get("symbols", np.NaN)):
                        ls_related_symbols: List[FinancialInstrumentSymbol] = [
                            FinancialInstrumentSymbol(symbol=s.strip())
                            for s in dict_row["symbols"].split(",")
                        ]
                        if ls_related_symbols:
                            if save_doc:
                                for symbol in ls_related_symbols:
                                    upsert_document(symbol, True)
                            dict_article["related_symbols"] = ls_related_symbols
                    dict_article["engine_site"] = "SeekingAlpha"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)
    # endregion

    # region author info
    rlt_files = glob.glob(os.path.join(rlt_path, "*_author_info.csv"))
    if rlt_files:
        for f in rlt_files:
            logger.info(f"process file : {f}")
            df_from_csv = pd.read_csv(f, header=0)
            dict_author_info = SeekingAlphaDataProcess.proc_author_info(df_from_csv)
            if not dict_author_info:
                continue
            if "author" not in dict_author_info:
                continue
            author = AuthorInSeekingAlpha(
                author_id=dict_author_info.get("author"),
                intro=dict_author_info.get("author_intro", ""),
                articles=dict_author_info.get("articles", None),
                picks=dict_author_info.get("authors_picks", None),
                blog_posts=dict_author_info.get("instablogs", None),
                comments=dict_author_info.get("comments", None),
                stock_talks=dict_author_info.get("stocktalks", None),
                likes=dict_author_info.get("likes", None),
                followers=dict_author_info.get("followers", None),
                following=dict_author_info.get("following", None),
                mtime=datetime.now(),
                batch_action_uuid=batch_action_uuid)
            if save_doc:
                upsert_document(author, True)
            rlt_authors.append(author)
    # endregion
    return rlt_authors, rlt_articles
def symbol_summary(
    rlt_path: str,
    batch_action_uuid: str,
    action_uuid: str,
    save_doc: bool = True
) -> Tuple[List[FinancialInstrumentSymbol], List[Article]]:
    rlt_articles: List[Article] = list()
    rlt_symbols: List[FinancialInstrumentSymbol] = list()

    # region symbol analysis
    rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_analysis.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"process file : {f}")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            symbol, df_rlt = SeekingAlphaDataProcess.proc_symbol_analysis(df_from_csv)
            if df_rlt is not None:
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("title", np.NaN)):
                        dict_article["title"] = dict_row["title"]
                    if not pd.isna(dict_row.get("article_url", np.NaN)):
                        article_url = furl(dict_row["article_url"])
                        abs_url_str = f"{article_url.origin}{article_url.path}"
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    if not pd.isna(dict_row.get("author_url", np.NaN)):
                        author_url: str = dict_row["author_url"]
                        author_id = dict_row["author_id"]
                        # NO author_name extracted!
                        author = AuthorInSeekingAlpha(author_id=author_id, url=author_url)
                        dict_article["seeking_alpha_author"] = author
                        if save_doc:
                            upsert_document(author, True)
                    if not pd.isna(dict_row.get("rating", np.NaN)):
                        dict_article["rating"] = dict_row["rating"]
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row["publish_time"].to_pydatetime()
                    if not pd.isna(dict_row.get("comments", np.NaN)):
                        dict_article["seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                            comments=dict_row["comments"])
                    ls_related_symbols: List[FinancialInstrumentSymbol] = [
                        FinancialInstrumentSymbol(symbol=symbol)
                    ]
                    if save_doc:
                        upsert_document(ls_related_symbols[0])
                    dict_article["related_symbols"] = ls_related_symbols
                    dict_article["engine_site"] = "SeekingAlpha"
                    dict_article["channel_in_site"] = "analysis"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)
    # endregion

    # region symbol news
    rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_news.csv"))
    for i, f in enumerate(rlt_files):
        logger.info(f"process file : {f}")
        df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
        symbol, df_rlt = SeekingAlphaDataProcess.proc_symbol_news(df_from_csv)
        for idx, row in df_rlt.iterrows():
            dict_row = row.to_dict()
            dict_article = dict()
            if not pd.isna(dict_row.get("title", np.NaN)):
                dict_article["title"] = dict_row["title"]
            if not pd.isna(dict_row.get("news_url", np.NaN)):
                article_url = furl(dict_row["news_url"])
                abs_url_str = f"{article_url.origin}{article_url.path}"
                # Only strip query parameters for links inside Seeking Alpha; for other
                # sites the query string may be part of what makes the url unique.
                if abs_url_str.find("seekingalpha") > 0:
                    dict_article["full_text_url"] = abs_url_str
                    dict_article["uuid"] = md5_str(abs_url_str)
                else:
                    dict_article["full_text_url"] = article_url.url
                    dict_article["uuid"] = md5_str(article_url.url)
            if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                dict_article["publish_time"] = dict_row["publish_time"].to_pydatetime()
            if not pd.isna(dict_row.get("comments", np.NaN)):
                dict_article["seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                    comments=dict_row["comments"])
            ls_related_symbols: List[FinancialInstrumentSymbol] = [
                FinancialInstrumentSymbol(symbol=symbol)
            ]
            if save_doc:
                upsert_document(ls_related_symbols[0])
            dict_article["related_symbols"] = ls_related_symbols
            dict_article["engine_site"] = "SeekingAlpha"
            dict_article["batch_action_uuid"] = batch_action_uuid
            if not pd.isna(dict_row.get("orig_source", np.NaN)):
                dict_article["channel_in_site"] = dict_row["orig_source"]
            article = Article(**dict_article)
            rlt_articles.append(article)
            if save_doc:
                upsert_document(article, True)
    # endregion

    # region symbol info
    rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_indicators.csv"))
    if rlt_files:
        for f in rlt_files:
            logger.info(f"process file : {f}")
            df_from_csv = pd.read_csv(f, header=0)
            dict_symbol_info = SeekingAlphaDataProcess.proc_symbol_indicator(df_from_csv)
            if not dict_symbol_info:
                continue
            if "symbol" not in dict_symbol_info:
                continue
            symbol = FinancialInstrumentSymbol(
                symbol=dict_symbol_info.get("symbol"),
                info_from_seeking_alpha=SymbolInfoBySeekingAlpha(
                    followers=dict_symbol_info.get("followers", None),
                    high_52wk=dict_symbol_info.get("52wk high", None),
                    low_52wk=dict_symbol_info.get("52wk low", None),
                    eps_fwd=dict_symbol_info.get("EPS (FWD)", None),
                    pe_fwd=dict_symbol_info.get("PE (FWD)", None),
                    yield_fwd=dict_symbol_info.get("Yield (FWD)", None),
                    div_rate_fwd=dict_symbol_info.get("Div Rate (FWD)", None),
                    mkt_cap=dict_symbol_info.get("Market Cap", None),
                    volume=dict_symbol_info.get("Volume", None),
                    mtime=datetime.now()))
            if save_doc:
                upsert_document(symbol, True)
            rlt_symbols.append(symbol)
    # endregion
    return rlt_symbols, rlt_articles
def create_equity_workflow(req: WorkflowRequest):
    assert req.workflow_name in GSPredefinedWorkflow._value2member_map_
    equity_entity = find_equity(req.entity_str)
    if equity_entity is None:
        wf_batch_uuid = md5_str(
            f"{req.request_from_account}-{req.ctime.isoformat()}-{req.entity_str}")
        doc_wf = TriggeredWebPagesCrawlWorkflow(
            uuid=wf_batch_uuid,
            workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
            para_begin=req.para_begin,
            para_end=req.para_end,
            submit_account=req.request_from_account,
            submit_type=WorkflowSubmitType.HotKey.value,
            submit_time=req.ctime,
            finish_or_error_flag=WorkflowStatusFlag.WithError.value,
            error_msg=f"Can't find equity symbol or name by '{req.entity_str}'")
        upsert_document(doc_wf, False)
        return

    # The entity was found; build the workflow content.
    wf_batch_uuid = md5_str(
        f"{equity_entity.symbol}-{req.workflow_name}-{req.para_begin}-{req.para_end}-"
        f"{req.request_from_account}-{req.ctime.isoformat()}")

    # Look up the workflow's configured refresh frequency.
    # wf_freq = "D"
    wf_freq = "1s"
    workflow_def = PredefinedWorkflow.objects(workflow_name=req.workflow_name).first()
    if workflow_def is not None:
        wf_freq = workflow_def.refresh_freq

    # Find the latest run of this workflow for the symbol (assumed per symbol + per account).
    latest_workflow_inst = TriggeredWebPagesCrawlWorkflow.objects(
        fin_instrument=equity_entity.symbol,
        workflow=req.workflow_name,
        submit_account=req.request_from_account,
        finish_or_error_flag__in=[
            WorkflowStatusFlag.WaitToRun.value,
            WorkflowStatusFlag.SuccessFinished.value
        ]).order_by("-submit_time").first()

    # If a run already exists in the same period, only record an error entry.
    if latest_workflow_inst is not None and is_same_period(
            latest_workflow_inst.submit_time, req.ctime, wf_freq):
        logger.error(
            f"Workflow(uuid={latest_workflow_inst.uuid},"
            f"ctime='{latest_workflow_inst.submit_time}') in the same period already exists.")
        doc_wf = TriggeredWebPagesCrawlWorkflow(
            uuid=wf_batch_uuid,
            main_entity_type=EntityType.Equity.value,
            fin_instrument=FinancialInstrumentSymbol(symbol=equity_entity.symbol),
            workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
            para_begin=req.para_begin,
            para_end=req.para_end,
            submit_account=req.request_from_account,
            submit_type=WorkflowSubmitType.HotKey.value,
            submit_time=req.ctime,
            finish_or_error_flag=WorkflowStatusFlag.WithError.value,
            error_msg=(
                f"workflow '{req.workflow_name}'({equity_entity.symbol}) was executed at "
                f"{latest_workflow_inst.submit_time}. No need to rerun now."))
        upsert_document(doc_wf, False)
        return

    # Create the workflow.
    doc_wf = TriggeredWebPagesCrawlWorkflow(
        uuid=wf_batch_uuid,
        main_entity_type=EntityType.Equity.value,
        fin_instrument=FinancialInstrumentSymbol(symbol=equity_entity.symbol),
        workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
        para_begin=req.para_begin,
        para_end=req.para_end,
        submit_account=req.request_from_account,
        submit_type=WorkflowSubmitType.HotKey.value,
        submit_time=req.ctime,
        finish_or_error_flag=WorkflowStatusFlag.WaitToRun.value)
    upsert_document(doc_wf, False)

    # Create the batch action.
    doc_batch_action = RPABatchAction(
        batch_id=wf_batch_uuid,
        is_dynamic_batch=True,
        from_function=cls_to_str(create_equity_workflow),
        ctime=req.ctime,
        status=ActionStatusFlag.WaitingForRun.value)
    upsert_document(doc_batch_action, False)

    # Call the action generator functions in turn.
    # NOTE: this reads the dictionary directly; switching to a function call later
    # would allow generators to be registered dynamically.
    for func in WORKFLOW_NAME_TO_ACTION_GENERATORS.get(req.workflow_name, []):
        func(equity_entity, wf_batch_uuid)
    logger.info(f"Batch action '{wf_batch_uuid}' is created.")
def upsert_tag_combination(tags: List[str]):
    kw_comb = KWCombinationPattern(
        tags_comb_hash=md5_str("".join(sorted(tags))),
        tags_value="+".join(tags),
        tags=[Tag(name=n) for n in tags])
    kw_comb.save()
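# Hedged usage note (illustrative only): because the tags are sorted before hashing,
# permutations of the same tag set collapse onto one tags_comb_hash, e.g.
#   upsert_tag_combination(["Tesla", "battery"])
#   upsert_tag_combination(["battery", "Tesla"])   # same tags_comb_hash as above
# tags_value, in contrast, preserves the caller's ordering ("Tesla+battery" vs "battery+Tesla").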
def process_action_result(rlt_path: str,
                          batch_action_uuid: str,
                          action_uuid: str,
                          save_doc: bool = True):
    dict_extract_info: Dict[str, str] = {}
    extract_info_files = glob.glob(os.path.join(rlt_path, "*_extract_info.js"))
    if extract_info_files:
        with open(extract_info_files[0], "r") as f:
            dict_extract_info = json.load(f)

    article_files = glob.glob(os.path.join(rlt_path, "*_articles.csv"))
    ls_all_article_uuid: List[str] = list()
    """All uuids, used to query Mongo for the ones already inserted."""
    ls_all_article_doc_dict: List[Dict] = list()
    """Collected into a list first, then upserted as documents in one pass."""
    if article_files:
        df_articles = pd.read_csv(article_files[0], header=0)
        for idx, row in df_articles.iterrows():
            dict_row = row.to_dict()
            dict_article = dict()
            # "title" => Article.title, filled in as-is when present
            if not pd.isna(dict_row.get("title", np.NaN)):
                dict_article["title"] = dict_row["title"]
            # "abstract" => Article.abstract, no special handling
            if not pd.isna(dict_row.get("abstract", np.NaN)):
                dict_article["abstract"] = dict_row["abstract"]
            # "channel_in_site" => Article.channel_in_site, the site-internal sub-channel
            if not pd.isna(dict_row.get("channel_in_site", np.NaN)):
                dict_article["channel_in_site"] = dict_row["channel_in_site"]
            # "full_text_url" => Article.full_text_url, may need to be made absolute
            if not pd.isna(dict_row.get("full_text_url", np.NaN)):
                full_text_url = dict_row["full_text_url"]
                # try to prepend the site origin
                page_url = dict_extract_info.get("real_url", "")
                if page_url:
                    full_text_url = append_site_to_url(full_text_url, furl(page_url).origin)
                dict_article["full_text_url"] = full_text_url
            # "publish_time" => Article.publish_time, may need preprocessing
            if not pd.isna(dict_row.get("publish_time", np.NaN)):
                s_time = dict_row["publish_time"]
                s_time_preprocess = dict_extract_info.get("publish_time_preprocess", "")
                if s_time_preprocess:
                    assert s_time_preprocess in TIME_STR_PREPROCESS
                    s_time = TIME_STR_PREPROCESS[s_time_preprocess](s_time)
                dt_time = _parse_date(s_time)
                if dt_time:
                    dict_article["publish_time"] = dt_time
            if "article_site" in dict_extract_info:
                dict_article["engine_site"] = dict_extract_info["article_site"]
            # build the uuid
            if "full_text_url" in dict_article:
                # prefer the url as the uuid
                dict_article["uuid"] = md5_str(dict_article["full_text_url"])
            elif "title" in dict_article and "engine_site" in dict_article:
                # fall back to site + title
                # note: publish_time must not go into the MD5, because fuzzy strings such as
                # "5h" parse to different values at different times
                dict_article["uuid"] = md5_str(
                    f"{dict_article['engine_site']}-{dict_article['title']}")
            else:
                continue
            dict_article["action_uuid"] = action_uuid
            dict_article["batch_action_uuid"] = batch_action_uuid
            ls_all_article_uuid.append(dict_article["uuid"])
            ls_all_article_doc_dict.append(dict_article)

    # Query which of these uuids already exist in the database, to mark new / existed.
    all_exist_docs = Article.objects(uuid__in=ls_all_article_uuid).only("uuid")
    set_existed_uuid = set([doc.uuid for doc in all_exist_docs])

    # Update the db.
    related_articles: List[Article] = list()
    new_found_articles: List[Article] = list()
    for article_doc_dict in ls_all_article_doc_dict:
        related_articles.append(Article(uuid=article_doc_dict["uuid"]))
        if article_doc_dict["uuid"] in set_existed_uuid:
            logger.info(f"Doc '{article_doc_dict['uuid']}' already exists.")
            continue
        new_found_articles.append(Article(uuid=article_doc_dict["uuid"]))
        if save_doc:
            upsert_document(Article(**article_doc_dict), True)
    logger.info(
        f"Action '{action_uuid}' found {len(new_found_articles)} new articles")

    # Persist the pdf and png page snapshots.
    pdf_screen_files = glob.glob(os.path.join(rlt_path, "*_webpage_content.pdf"))
    orig_pdf_bin_uuid: str = None
    if pdf_screen_files:
        with open(pdf_screen_files[0], "rb") as f:
            bin_pdf = f.read()
        bin_pdf_gz = gzip.compress(bin_pdf)
        orig_pdf_bin_uuid = md5_binary(bin_pdf)
        if save_doc:
            pdf_attach = BinaryAttachment(uuid=orig_pdf_bin_uuid,
                                          bin_data=bin_pdf_gz,
                                          file_ext="pdf",
                                          is_gzip=True,
                                          bin_length=len(bin_pdf_gz),
                                          ctime=datetime.now(),
                                          action_uuid=action_uuid)
            upsert_document(pdf_attach, False)

    png_screenshot_files = glob.glob(os.path.join(rlt_path, "*_browser_sceenshot.png"))
    orig_png_bin_uuid: str = None
    if png_screenshot_files:
        with open(png_screenshot_files[0], "rb") as f:
            bin_png = f.read()
        bin_png_gz = gzip.compress(bin_png)
        orig_png_bin_uuid = md5_binary(bin_png)
        if save_doc:
            png_attach = BinaryAttachment(uuid=orig_png_bin_uuid,
                                          bin_data=bin_png_gz,
                                          file_ext="png",
                                          is_gzip=True,
                                          bin_length=len(bin_png_gz),
                                          ctime=datetime.now(),
                                          action_uuid=action_uuid)
            upsert_document(png_attach, False)

    # Save the GeneralBrowserActionInstance.
    if save_doc:
        general_browser_action_doc = GeneralBrowserActionInstance(
            uuid=action_uuid,
            related_articles=related_articles,
            new_found_articles=new_found_articles,
            pdf_page_snapshot=BinaryAttachment(uuid=orig_pdf_bin_uuid),
            img_page_snapshot=BinaryAttachment(uuid=orig_png_bin_uuid))
        upsert_document(general_browser_action_doc, False)
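# Hedged sketch (hypothetical helper, not used above): the Article uuid rule applied in
# process_action_result, shown in isolation. The full-text url is preferred; otherwise fall
# back to engine_site + title. publish_time is deliberately excluded because fuzzy strings
# such as "5h" parse to different values at different times. Assumes `md5_str` hashes the
# given string exactly as used above; returns None when no stable uuid can be built.
def _article_uuid(full_text_url: str = None, engine_site: str = None, title: str = None):
    if full_text_url:
        return md5_str(full_text_url)
    if engine_site and title:
        return md5_str(f"{engine_site}-{title}")
    return None  # callers skip rows without a stable uuid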
def column_articles(rlt_path: str,
                    batch_action_uuid: str,
                    action_uuid: str,
                    save_doc: bool = True) -> List[Article]:
    rlt_articles = list()

    # region articles
    rlt_files = glob.glob(os.path.join(rlt_path, "*_articles.csv"))
    if rlt_files:
        for f in rlt_files:
            logger.info(f"process file : {f}")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            kw, df_rlt = SeekingAlphaDataProcess.proc_article_data(df_from_csv)
            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                # for Article
                if not pd.isna(dict_row.get("title", np.NaN)):
                    dict_article["title"] = dict_row["title"]
                if not pd.isna(dict_row.get("author_url", np.NaN)):
                    author_url: str = dict_row["author_url"]
                    author_id = author_url.split("/")[-1]
                    dict_article["seeking_alpha_author"] = AuthorInSeekingAlpha(
                        author_id=author_id, url=author_url)
                if not pd.isna(dict_row.get("article_url", np.NaN)):
                    article_url = furl(dict_row["article_url"])
                    abs_url_str = f"{article_url.origin}{article_url.path}"
                    dict_article["full_text_url"] = abs_url_str
                    dict_article["uuid"] = md5_str(abs_url_str)
                if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                    dict_article["publish_time"] = dict_row["publish_time"].to_pydatetime()
                dict_article["engine_site"] = "SeekingAlpha"
                ls_related_symbols: List[FinancialInstrumentSymbol] = list()
                for symbol_key_pair in [
                        ("related_symbol1", "related_symbol1_fullname"),
                        ("related_symbol2", "related_symbol2_fullname"),
                        ("related_symbol3", "related_symbol3_fullname")
                ]:
                    if not pd.isna(dict_row.get(symbol_key_pair[0], np.NaN)) \
                            and not pd.isna(dict_row.get(symbol_key_pair[1], np.NaN)):
                        fin_instrument_symbol = FinancialInstrumentSymbol(
                            symbol=dict_row[symbol_key_pair[0]],
                            full_name=dict_row[symbol_key_pair[1]],
                            batch_action_uuid=batch_action_uuid)
                        ls_related_symbols.append(fin_instrument_symbol)
                        # ListField(ReferenceField(FinancialInstrumentSymbol)) does not seem
                        # to cascade-save, so save the symbol explicitly on creation.
                        if save_doc:
                            upsert_document(fin_instrument_symbol, True)
                if ls_related_symbols:
                    dict_article["related_symbols"] = ls_related_symbols
                if not pd.isna(dict_row.get("comments", np.NaN)):
                    dict_article["seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                        comments=dict_row["comments"])
                dict_article["channel_in_site"] = kw
                dict_article["batch_action_uuid"] = batch_action_uuid
                article = Article(**dict_article)
                rlt_articles.append(article)
                if save_doc:
                    upsert_document(article, True)
    # endregion
    return rlt_articles