def kw_search(rlt_path: str, batch_action_uuid: str, action_uuid: str, save_doc: bool = True) -> List[Article]:
    """Parse SeekingAlpha keyword-search result csv files into Article documents.

    Reads every "*_kw_search_result.csv" under *rlt_path*, converts each row into
    an ``Article`` and (optionally) upserts it together with its referenced
    ``FinancialInstrumentSymbol`` documents.

    Args:
        rlt_path: directory that holds the crawler result files.
        batch_action_uuid: uuid of the batch crawl action, stamped on every article.
        action_uuid: uuid of this single action (unused here; kept for a uniform signature).
        save_doc: when True, created documents are upserted into the database.

    Returns:
        The list of ``Article`` objects built from the csv rows.
    """
    rlt_articles: List[Article] = list()
    rlt_files = glob.glob(os.path.join(rlt_path, "*_kw_search_result.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            kw, df_rlt = SeekingAlphaDataProcess.proc_kw_search(df_from_csv)
            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                if not pd.isna(dict_row.get("title", np.NaN)):
                    dict_article["title"] = dict_row["title"]
                if not pd.isna(dict_row.get("news_url", np.NaN)):
                    article_url = furl(dict_row["news_url"])
                    abs_url_str = f"{article_url.origin}{article_url.path}"
                    # Only strip query parameters for links inside seekingalpha itself;
                    # for other sites we cannot be sure the url params are not part of
                    # what makes the article unique.
                    if abs_url_str.find("seekingalpha") > 0:
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    else:
                        dict_article["full_text_url"] = article_url.url
                        dict_article["uuid"] = md5_str(article_url.url)
                if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                    dict_article["publish_time"] = dict_row[
                        "publish_time"].to_pydatetime()
                if not pd.isna(dict_row.get("symbols", np.NaN)):
                    symbols = [
                        x.strip() for x in dict_row.get("symbols").split(",")
                    ]
                    ls_related_symbols: List[FinancialInstrumentSymbol] = [
                        FinancialInstrumentSymbol(symbol=x) for x in symbols
                    ]
                    if save_doc:
                        # FIX: was `map(upsert_document, ls_related_symbols)` — map() is
                        # lazy and the result was never consumed, so the symbol documents
                        # were never actually saved. Use an explicit loop instead.
                        for symbol_doc in ls_related_symbols:
                            upsert_document(symbol_doc)
                    dict_article["related_symbols"] = ls_related_symbols
                dict_article["from_searching_phase"] = SearchingPhrase(
                    searching_phrase=kw)
                dict_article["engine_site"] = "SeekingAlpha"
                dict_article["channel_in_site"] = "Search"
                dict_article["batch_action_uuid"] = batch_action_uuid
                article = Article(**dict_article)
                rlt_articles.append(article)
                if save_doc:
                    upsert_document(article, True)
    return rlt_articles
def kw_news_search(rlt_path: str, batch_action_uuid: str, action_uuid: str, save_doc: bool = True) -> List[Article]:
    """Parse Google-News keyword-search result csv files into Article documents.

    Reads every "*_index_items.csv" under *rlt_path*; each row becomes an
    ``Article`` whose uuid is derived from publisher + title. Rows without a
    publisher (and now rows without a title) are skipped.

    Args:
        rlt_path: directory that holds the crawler result files.
        batch_action_uuid: uuid of the batch crawl action, stamped on every article.
        action_uuid: uuid of this single action (unused here; kept for a uniform signature).
        save_doc: when True, created documents are upserted into the database.

    Returns:
        The list of ``Article`` objects built from the csv rows.
    """
    rlt_articles: List[Article] = list()
    rlt_files = glob.glob(os.path.join(rlt_path, "*_index_items.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            try:
                df_from_csv = pd.read_csv(f, header=0, parse_dates=["publish_time"])
            except Exception:
                # FIX: was a bare `except:` which also swallowed SystemExit and
                # KeyboardInterrupt. Keep the best-effort "skip unreadable file"
                # semantics but narrow the catch and log what was skipped.
                logger.warning(f"failed to read csv file : {f}, skipped")
                continue
            kw, df_rlt = GoogleNewsSearchProcess.proc_news_search_data(df_from_csv)
            if df_rlt is None:
                continue
            logger.info(f"proecess file : {f} - {df_rlt.shape}")
            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                if not pd.isna(dict_row.get("news_title", np.NaN)):
                    dict_article["title"] = dict_row["news_title"]
                if not pd.isna(dict_row.get("url", np.NaN)):
                    dict_article["full_text_url"] = dict_row["url"]
                if not pd.isna(dict_row.get("news_abstract", np.NaN)):
                    dict_article["abstract"] = dict_row["news_abstract"]
                if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                    dict_article["publish_time"] = dict_row["publish_time"].to_pydatetime()
                search_phrase_in_db = SearchingPhrase.objects(searching_phrase=kw).first()
                if search_phrase_in_db is not None:
                    dict_article["from_searching_phase"] = search_phrase_in_db
                    if search_phrase_in_db.related_symbols is not None:
                        dict_article["related_symbols"] = search_phrase_in_db.related_symbols
                else:
                    dict_article["from_searching_phase"] = SearchingPhrase(searching_phrase=kw)
                dict_article["engine_site"] = "google_news"
                if not pd.isna(dict_row.get("publisher", np.NaN)):
                    # FIX: the uuid below reads dict_article["title"]; a row with a
                    # publisher but no title previously raised KeyError. Skip such
                    # rows instead, mirroring the no-publisher policy.
                    if "title" not in dict_article:
                        continue
                    dict_article["channel_in_site"] = dict_row["publisher"]
                    dict_article["uuid"] = md5_str(f"{dict_article['channel_in_site']}-{dict_article['title']}")
                else:
                    # News without a publisher are not stored for now.
                    continue
                dict_article["batch_action_uuid"] = batch_action_uuid
                article = Article(**dict_article)
                rlt_articles.append(article)
                if save_doc:
                    upsert_document(article, True)
    return rlt_articles
def process_action_result(rlt_path: str, batch_action_uuid: str, action_uuid: str, save_doc: bool = True):
    """Process the result directory of one generic browser action.

    Steps, all driven by files found under *rlt_path*:
      1. load "*_extract_info.js" (json) describing how to interpret the csv;
      2. turn "*_articles.csv" rows into Article dicts, compute their uuids,
         dedupe against existing documents in Mongo and upsert the new ones;
      3. gzip + store the page-pdf and browser-screenshot binaries;
      4. store a GeneralBrowserActionInstance linking everything together.

    Args:
        rlt_path: directory that holds the crawler result files.
        batch_action_uuid: uuid of the batch crawl action.
        action_uuid: uuid of this single action.
        save_doc: when True, documents are upserted into the database.
    """
    dict_extract_info: Dict[str, str] = {}
    extract_info_files = glob.glob(
        os.path.join(rlt_path, "*_extract_info.js"))
    if extract_info_files:
        with open(extract_info_files[0], "r") as f:
            dict_extract_info = json.load(f)
    article_files = glob.glob(os.path.join(rlt_path, "*_articles.csv"))
    # All uuids seen in this run, used to query Mongo for already-inserted docs.
    ls_all_article_uuid: List[str] = list()
    # Article field-dicts are collected first, then upserted in one pass below.
    ls_all_article_doc_dict: List[Dict] = list()
    if article_files:
        df_articles = pd.read_csv(article_files[0], header=0)
        for idx, row in df_articles.iterrows():
            dict_row = row.to_dict()
            dict_article = dict()
            # "title" => Article.title, copied verbatim when present
            if not pd.isna(dict_row.get("title", np.NaN)):
                dict_article["title"] = dict_row["title"]
            # "abstract" => Article.abstract, copied verbatim when present
            if not pd.isna(dict_row.get("abstract", np.NaN)):
                dict_article["abstract"] = dict_row["abstract"]
            # "channel_in_site" => Article.channel_in_site, the in-site sub-category
            if not pd.isna(dict_row.get("channel_in_site", np.NaN)):
                dict_article["channel_in_site"] = dict_row["channel_in_site"]
            # "full_text_url" => Article.full_text_url; relative urls get the
            # site origin prepended
            if not pd.isna(dict_row.get("full_text_url", np.NaN)):
                full_text_url = dict_row["full_text_url"]
                page_url = dict_extract_info.get("real_url", "")
                if page_url:
                    full_text_url = append_site_to_url(
                        full_text_url, furl(page_url).origin)
                dict_article["full_text_url"] = full_text_url
            # "publish_time" => Article.publish_time, possibly via a named
            # preprocessing function declared in the extract-info file
            if not pd.isna(dict_row.get("publish_time", np.NaN)):
                s_time = dict_row["publish_time"]
                s_time_preprocess = dict_extract_info.get(
                    "publish_time_preprocess", "")
                if s_time_preprocess:
                    assert s_time_preprocess in TIME_STR_PREPROCESS
                    s_time = TIME_STR_PREPROCESS[s_time_preprocess](s_time)
                dt_time = _parse_date(s_time)
                if dt_time:
                    dict_article["publish_time"] = dt_time
            if "article_site" in dict_extract_info:
                dict_article["engine_site"] = dict_extract_info[
                    "article_site"]
            # uuid generation: prefer the url; fall back to site+title.
            # NOTE: publish_time must NOT enter the hash — fuzzy strings such
            # as "5h" parse differently at different times.
            if "full_text_url" in dict_article:
                dict_article["uuid"] = md5_str(
                    dict_article["full_text_url"])
            elif "title" in dict_article and "engine_site" in dict_article:
                dict_article["uuid"] = md5_str(
                    f"{dict_article['engine_site']}-{dict_article['title']}"
                )
            else:
                continue
            dict_article["action_uuid"] = action_uuid
            dict_article["batch_action_uuid"] = batch_action_uuid
            ls_all_article_uuid.append(dict_article["uuid"])
            ls_all_article_doc_dict.append(dict_article)
    # Query which uuids already exist, to mark docs as new vs. existing.
    all_exist_docs = Article.objects(
        uuid__in=ls_all_article_uuid).only("uuid")
    set_existed_uuid = set([doc.uuid for doc in all_exist_docs])
    # Update the db: upsert only the not-yet-existing articles.
    related_articles: List[Article] = list()
    new_found_articles: List[Article] = list()
    for article_doc_dict in ls_all_article_doc_dict:
        related_articles.append(Article(uuid=article_doc_dict["uuid"]))
        if article_doc_dict["uuid"] in set_existed_uuid:
            logger.info(
                f"Doc '{article_doc_dict['uuid']}' is existed already.")
            continue
        new_found_articles.append(Article(uuid=article_doc_dict["uuid"]))
        if save_doc:
            upsert_document(Article(**article_doc_dict), True)
    logger.info(
        f"Action '{action_uuid}' found {len(new_found_articles)} new articles"
    )
    # Store the page pdf snapshot (gzipped) as a BinaryAttachment.
    pdf_screen_files = glob.glob(
        os.path.join(rlt_path, "*_webpage_content.pdf"))
    orig_pdf_bin_uuid: str = None
    if pdf_screen_files:
        with open(pdf_screen_files[0], "rb") as f:
            bin_pdf = f.read()
            bin_pdf_gz = gzip.compress(bin_pdf)
            orig_pdf_bin_uuid = md5_binary(bin_pdf)
            if save_doc:
                pdf_attach = BinaryAttachment(uuid=orig_pdf_bin_uuid,
                                              bin_data=bin_pdf_gz,
                                              file_ext="pdf",
                                              is_gzip=True,
                                              bin_length=len(bin_pdf_gz),
                                              ctime=datetime.now(),
                                              action_uuid=action_uuid)
                upsert_document(pdf_attach, False)
    # Store the browser screenshot (gzipped) as a BinaryAttachment.
    png_screenshot_files = glob.glob(
        os.path.join(rlt_path, "*_browser_sceenshot.png"))
    orig_png_bin_uuid: str = None
    # FIX: this guard previously tested `pdf_screen_files`, so a result dir with
    # a pdf but no screenshot crashed with IndexError (and one with a screenshot
    # but no pdf silently skipped the screenshot).
    if png_screenshot_files:
        with open(png_screenshot_files[0], "rb") as f:
            bin_png = f.read()
            bin_png_gz = gzip.compress(bin_png)
            orig_png_bin_uuid = md5_binary(bin_png)
            if save_doc:
                png_attach = BinaryAttachment(uuid=orig_png_bin_uuid,
                                              bin_data=bin_png_gz,
                                              file_ext="png",
                                              is_gzip=True,
                                              bin_length=len(bin_png_gz),
                                              ctime=datetime.now(),
                                              action_uuid=action_uuid)
                upsert_document(png_attach, False)
    # Store the GeneralBrowserActionInstance tying articles and snapshots together.
    if save_doc:
        general_browser_action_doc = GeneralBrowserActionInstance(
            uuid=action_uuid,
            related_articles=related_articles,
            new_found_articles=new_found_articles,
            pdf_page_snapshot=BinaryAttachment(uuid=orig_pdf_bin_uuid),
            img_page_snapshot=BinaryAttachment(uuid=orig_png_bin_uuid))
        upsert_document(general_browser_action_doc, False)
def kw_search(
    rlt_path: str, batch_action_uuid: str, action_uuid: str, save_doc: bool = True
) -> Tuple[List[Article], List[UserInTwitter]]:
    """Parse Twitter keyword-search results into Article and UserInTwitter docs.

    Two file families are processed under *rlt_path*:
      - "*_posts.csv": tweets, turned into ``Article`` documents (uuid is
        poster_id + post_time);
      - "*_follower_following.csv": follower/following counts, turned into
        ``UserInTwitter`` documents.

    Args:
        rlt_path: directory that holds the crawler result files.
        batch_action_uuid: uuid of the batch crawl action, stamped on every doc.
        action_uuid: uuid of this single action (unused here; kept for a uniform signature).
        save_doc: when True, created documents are upserted into the database.

    Returns:
        (articles, posters) built from the csv rows.
    """
    rlt_articles: List[Article] = list()
    rlt_posters: List[UserInTwitter] = list()
    rlt_files = glob.glob(os.path.join(rlt_path, "*_posts.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(
                f, header=0, parse_dates=["post_time", "extract_t"])
            kw, df_rlt = TwitterDataProcess.proc_posts(df_from_csv)
            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                if not pd.isna(dict_row.get("post_content", np.NaN)):
                    dict_article["title"] = dict_row["post_content"]
                if not pd.isna(dict_row.get("post_content_detail", np.NaN)):
                    dict_article["abstract"] = dict_row[
                        "post_content_detail"]
                if not pd.isna(dict_row.get("post_additional_url", np.NaN)):
                    dict_article["full_text_url"] = dict_row[
                        "post_additional_url"]
                if not pd.isna(dict_row.get("post_image_url", np.NaN)):
                    dict_article["related_image_url"] = dict_row[
                        "post_image_url"]
                if dict_row.get("post_time", pd.NaT) is not pd.NaT:
                    dict_article["publish_time"] = dict_row[
                        "post_time"].to_pydatetime()
                if not pd.isna(dict_row.get("poster_name", np.NaN)):
                    poster = UserInTwitter(user_id=dict_row["poster_id"],
                                           name=dict_row["poster_name"])
                else:
                    poster = UserInTwitter(user_id=dict_row["poster_id"])
                dict_article["twitter_poster"] = poster
                # uuid rule: poster_id + post_time.
                # NOTE(review): this reads dict_article["publish_time"], which is
                # only set when post_time is valid — rows with NaT post_time would
                # raise KeyError here; presumably proc_posts guarantees post_time.
                dict_article["uuid"] = md5_str(
                    f"{poster.user_id}|{dict_article['publish_time'].isoformat()}"
                )
                # FIX: the third condition previously re-tested "retweet" instead
                # of "like", so a row carrying only a like count lost its
                # TweetExtra entirely.
                if not pd.isna(dict_row.get(
                        "comments", np.NaN)) or not pd.isna(
                            dict_row.get(
                                "retweet", np.NaN)) or not pd.isna(
                                    dict_row.get("like", np.NaN)):
                    extra_data = TweetExtra()
                    if not pd.isna(dict_row.get("comments", np.NaN)):
                        extra_data.comments = int(dict_row["comments"])
                    if not pd.isna(dict_row.get("retweet", np.NaN)):
                        extra_data.retweet = int(dict_row["retweet"])
                    if not pd.isna(dict_row.get("like", np.NaN)):
                        extra_data.like = int(dict_row["like"])
                    dict_article["tweet_extra"] = extra_data
                search_phrase_in_db = SearchingPhrase.objects(
                    searching_phrase=kw).first()
                if search_phrase_in_db is not None:
                    dict_article[
                        "from_searching_phase"] = search_phrase_in_db
                    if search_phrase_in_db.related_symbols is not None:
                        dict_article[
                            "related_symbols"] = search_phrase_in_db.related_symbols
                else:
                    dict_article["from_searching_phase"] = SearchingPhrase(
                        searching_phrase=kw)
                dict_article["engine_site"] = "Twitter"
                dict_article["batch_action_uuid"] = batch_action_uuid
                article = Article(**dict_article)
                rlt_articles.append(article)
                if save_doc:
                    upsert_document(article, True)
    rlt_files = glob.glob(
        os.path.join(rlt_path, "*_follower_following.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            t, kw, df_rlt = TwitterDataProcess.proc_follower_following_info(
                df_from_csv, "search_phase")
            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                twitter_user = UserInTwitter(user_id=dict_row["poster_id"])
                if not pd.isna(dict_row.get("following", np.NaN)):
                    twitter_user.following = int(dict_row["following"])
                if not pd.isna(dict_row.get("follower", np.NaN)):
                    twitter_user.follower = int(dict_row["follower"])
                twitter_user.mtime = t
                twitter_user.batch_action_uuid = batch_action_uuid
                rlt_posters.append(twitter_user)
                if save_doc:
                    upsert_document(twitter_user, True)
    return rlt_articles, rlt_posters
def symbol_analysis_report(rlt_path: str, batch_action_uuid: str, action_uuid: str, save_doc: bool = True) -> List[Article]:
    """Parse EastMoney stock-analysis report csv files into Article documents.

    Reads every "*_symbol_analysis.csv" under *rlt_path*; each row is one
    analyst report with ratings and forward predictions.

    Args:
        rlt_path: directory that holds the crawler result files.
        batch_action_uuid: uuid of the batch crawl action, stamped on every article.
        action_uuid: uuid of this single action (unused here; kept for a uniform signature).
        save_doc: when True, created documents are upserted into the database.

    Returns:
        The list of ``Article`` objects built from the csv rows.
    """
    rlt_articles: List[Article] = list()
    rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_analysis.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(f, header=0)
            df_rlt = EastMoneyDataProcess.proc_stock_analysis(df_from_csv)
            if df_rlt is not None:
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("article", np.NaN)):
                        dict_article["title"] = dict_row["article"]
                    if not pd.isna(dict_row.get("article_url", np.NaN)):
                        dict_article["full_text_url"] = dict_row[
                            "article_url"]
                        dict_article["uuid"] = md5_str(
                            dict_row["article_url"])
                    if not pd.isna(dict_row.get("org_url", np.NaN)):
                        # For now the research-org name is stored in the
                        # SeekingAlpha author field — it makes charting easier.
                        author_url: str = dict_row["org_url"]
                        author_id = dict_row["org"]
                        # NO author_name extracted!!
                        author = AuthorInSeekingAlpha(author_id=author_id,
                                                      url=author_url)
                        if not pd.isna(
                                dict_row.get("reports_within_one_month",
                                             np.NaN)):
                            author.articles = dict_row[
                                "reports_within_one_month"]
                        dict_article["seeking_alpha_author"] = author
                        if save_doc:
                            upsert_document(author, True)
                    if not pd.isna(dict_row.get("rating", np.NaN)):
                        dict_article["rating"] = dict_row["rating"]
                    if not pd.isna(dict_row.get("rating_chg", np.NaN)):
                        dict_article["rating_change"] = dict_row[
                            "rating_chg"]
                    if not pd.isna(dict_row.get("pred_2020_ret", np.NaN)):
                        dict_article["pred_ret_this_yr"] = dict_row[
                            "pred_2020_ret"]
                    if not pd.isna(dict_row.get("pred_2020_pe", np.NaN)):
                        dict_article["pred_pe_this_yr"] = dict_row[
                            "pred_2020_pe"]
                    if not pd.isna(dict_row.get("pred_2021_pe", np.NaN)):
                        dict_article["pred_pe_next_yr"] = dict_row[
                            "pred_2021_pe"]
                    if not pd.isna(dict_row.get("pred_2021_ret", np.NaN)):
                        dict_article["pred_ret_next_yr"] = dict_row[
                            "pred_2021_ret"]
                    if dict_row.get("report_date", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "report_date"].to_pydatetime()
                    symbol = dict_row["symbol"]
                    ls_related_symbols: List[FinancialInstrumentSymbol] = [
                        FinancialInstrumentSymbol(symbol=symbol)
                    ]
                    # FIX: full_name was previously assigned inside `if save_doc:`,
                    # so the in-memory result lost the symbol's full name whenever
                    # persistence was disabled. Populate it unconditionally;
                    # only the upsert stays gated on save_doc.
                    ls_related_symbols[0].full_name = dict_row["symbol_name"]
                    if save_doc:
                        upsert_document(ls_related_symbols[0])
                    dict_article["related_symbols"] = ls_related_symbols
                    dict_article["engine_site"] = "EastMoney"
                    dict_article["channel_in_site"] = "StockAnalysis"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)
    return rlt_articles
def symbol_summary(
    rlt_path: str,
    batch_action_uuid: str,
    action_uuid: str,
    save_doc: bool = True
) -> Tuple[List[FinancialInstrumentSymbol], List[Article]]:
    """Parse a SeekingAlpha symbol-page crawl result into symbol and article docs.

    Three file families are processed under *rlt_path*:
      - "*_symbol_analysis.csv": analysis articles for the symbol;
      - "*_symbol_news.csv": news items for the symbol;
      - "*_symbol_indicators.csv": the symbol's indicator snapshot
        (followers, 52wk range, EPS/PE forward, etc.).

    Args:
        rlt_path: directory that holds the crawler result files.
        batch_action_uuid: uuid of the batch crawl action, stamped on every article.
        action_uuid: uuid of this single action (unused here; kept for a uniform signature).
        save_doc: when True, created documents are upserted into the database.

    Returns:
        (symbols, articles) built from the csv files.
    """
    rlt_articles: List[Article] = list()
    rlt_symbols: List[FinancialInstrumentSymbol] = list()
    # region symbol analysis
    rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_analysis.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            symbol, df_rlt = SeekingAlphaDataProcess.proc_symbol_analysis(
                df_from_csv)
            if df_rlt is not None:
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("title", np.NaN)):
                        dict_article["title"] = dict_row["title"]
                    if not pd.isna(dict_row.get("article_url", np.NaN)):
                        # Strip query params: origin + path is the canonical url.
                        article_url = furl(dict_row["article_url"])
                        abs_url_str = f"{article_url.origin}{article_url.path}"
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    if not pd.isna(dict_row.get("author_url", np.NaN)):
                        author_url: str = dict_row["author_url"]
                        author_id = dict_row["author_id"]
                        # NO author_name extracted!!
                        # author_name = None
                        # if not pd.isna(dict_row.get("author", np.NaN)):
                        #     author_name = dict_row["author"]
                        author = AuthorInSeekingAlpha(author_id=author_id,
                                                      url=author_url)
                        dict_article["seeking_alpha_author"] = author
                        if save_doc:
                            upsert_document(author, True)
                    if not pd.isna(dict_row.get("rating", np.NaN)):
                        dict_article["rating"] = dict_row["rating"]
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "publish_time"].to_pydatetime()
                    if not pd.isna(dict_row.get("comments", np.NaN)):
                        dict_article[
                            "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                comments=dict_row["comments"])
                    # Every article of this file relates to the single page symbol.
                    ls_related_symbols: List[FinancialInstrumentSymbol] = [
                        FinancialInstrumentSymbol(symbol=symbol)
                    ]
                    if save_doc:
                        upsert_document(ls_related_symbols[0])
                    dict_article["related_symbols"] = ls_related_symbols
                    dict_article["engine_site"] = "SeekingAlpha"
                    dict_article["channel_in_site"] = "analysis"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)
    # endregion
    # region symbol news
    rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_news.csv"))
    for i, f in enumerate(rlt_files):
        logger.info(f"proecess file : {f} ")
        df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
        symbol, df_rlt = SeekingAlphaDataProcess.proc_symbol_news(
            df_from_csv)
        for idx, row in df_rlt.iterrows():
            dict_row = row.to_dict()
            dict_article = dict()
            if not pd.isna(dict_row.get("title", np.NaN)):
                dict_article["title"] = dict_row["title"]
            if not pd.isna(dict_row.get("news_url", np.NaN)):
                article_url = furl(dict_row["news_url"])
                abs_url_str = f"{article_url.origin}{article_url.path}"
                # Only strip query params for links inside seekingalpha itself;
                # for other sites we are not sure whether the url params are
                # part of what makes the article unique.
                if abs_url_str.find("seekingalpha") > 0:
                    dict_article["full_text_url"] = abs_url_str
                    dict_article["uuid"] = md5_str(abs_url_str)
                else:
                    dict_article["full_text_url"] = article_url.url
                    dict_article["uuid"] = md5_str(article_url.url)
            if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                dict_article["publish_time"] = dict_row[
                    "publish_time"].to_pydatetime()
            if not pd.isna(dict_row.get("comments", np.NaN)):
                dict_article[
                    "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                        comments=dict_row["comments"])
            ls_related_symbols: List[FinancialInstrumentSymbol] = [
                FinancialInstrumentSymbol(symbol=symbol)
            ]
            if save_doc:
                upsert_document(ls_related_symbols[0])
            dict_article["related_symbols"] = ls_related_symbols
            dict_article["engine_site"] = "SeekingAlpha"
            dict_article["batch_action_uuid"] = batch_action_uuid
            # The original news source doubles as the in-site channel.
            if not pd.isna(dict_row.get("orig_source", np.NaN)):
                dict_article["channel_in_site"] = dict_row["orig_source"]
            article = Article(**dict_article)
            rlt_articles.append(article)
            if save_doc:
                upsert_document(article, True)
    # endregion
    # region symbol info
    rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_indicators.csv"))
    if rlt_files:
        for f in rlt_files:
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(f, header=0)
            dict_symbol_info = SeekingAlphaDataProcess.proc_symbol_indicator(
                df_from_csv)
            if not dict_symbol_info:
                continue
            if "symbol" not in dict_symbol_info:
                continue
            # Build the symbol doc with its SeekingAlpha indicator snapshot;
            # any missing indicator simply stays None.
            symbol = FinancialInstrumentSymbol(
                symbol=dict_symbol_info.get("symbol"),
                info_from_seeking_alpha=SymbolInfoBySeekingAlpha(
                    followers=dict_symbol_info.get("followers", None),
                    high_52wk=dict_symbol_info.get("52wk high", None),
                    low_52wk=dict_symbol_info.get("52wk low", None),
                    eps_fwd=dict_symbol_info.get("EPS (FWD)", None),
                    pe_fwd=dict_symbol_info.get("PE (FWD)", None),
                    yield_fwd=dict_symbol_info.get("Yield (FWD)", None),
                    div_rate_fwd=dict_symbol_info.get(
                        "Div Rate (FWD)", None),
                    mkt_cap=dict_symbol_info.get("Market Cap", None),
                    volume=dict_symbol_info.get("Volume", None),
                    mtime=datetime.now()))
            if save_doc:
                upsert_document(symbol, True)
            rlt_symbols.append(symbol)
    # endregion
    return rlt_symbols, rlt_articles
def author_detail(
    rlt_path: str,
    batch_action_uuid: str,
    action_uuid: str,
    save_doc: bool = True
) -> Tuple[List[AuthorInSeekingAlpha], List[Article]]:
    """Parse a SeekingAlpha author-page crawl result into author and article docs.

    Two file families are processed under *rlt_path*:
      - "*_author_articles.csv": the author's article list;
      - "*_author_info.csv": the author's profile counters
        (articles, picks, followers, ...).

    Args:
        rlt_path: directory that holds the crawler result files.
        batch_action_uuid: uuid of the batch crawl action, stamped on every doc.
        action_uuid: uuid of this single action (unused here; kept for a uniform signature).
        save_doc: when True, created documents are upserted into the database.

    Returns:
        (authors, articles) built from the csv files.
    """
    rlt_articles: List[Article] = list()
    rlt_authors: List[AuthorInSeekingAlpha] = list()
    # region author articles
    rlt_files = glob.glob(os.path.join(rlt_path, "*_author_articles.csv"))
    if rlt_files:
        for i, f in enumerate(rlt_files):
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            author_id, df_rlt = SeekingAlphaDataProcess.proc_author_articles(
                df_from_csv)
            if df_rlt is not None:
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("title", np.NaN)):
                        dict_article["title"] = dict_row["title"]
                    if not pd.isna(dict_row.get("news_url", np.NaN)):
                        # Strip query params: origin + path is the canonical url.
                        article_url = furl(dict_row["news_url"])
                        abs_url_str = f"{article_url.origin}{article_url.path}"
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "publish_time"].to_pydatetime()
                    if not pd.isna(dict_row.get("comments", np.NaN)):
                        dict_article[
                            "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                comments=dict_row["comments"])
                    # "symbols" is a comma-separated ticker list.
                    if not pd.isna(dict_row.get("symbols", np.NaN)):
                        ls_related_symbols: List[
                            FinancialInstrumentSymbol] = [
                                FinancialInstrumentSymbol(symbol=s.strip())
                                for s in dict_row["symbols"].split(",")
                            ]
                        if ls_related_symbols:
                            for symbol in ls_related_symbols:
                                if save_doc:
                                    upsert_document(symbol, True)
                            dict_article[
                                "related_symbols"] = ls_related_symbols
                    dict_article["engine_site"] = "SeekingAlpha"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)
    # endregion
    # region author info
    rlt_files = glob.glob(os.path.join(rlt_path, "*_author_info.csv"))
    if rlt_files:
        for f in rlt_files:
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(f, header=0)
            dict_author_info = SeekingAlphaDataProcess.proc_author_info(
                df_from_csv)
            if not dict_author_info:
                continue
            if "author" not in dict_author_info:
                continue
            # Build the author doc; any missing counter simply stays None.
            author = AuthorInSeekingAlpha(
                author_id=dict_author_info.get("author"),
                intro=dict_author_info.get("author_intro", ""),
                articles=dict_author_info.get("articles", None),
                picks=dict_author_info.get("authors_picks", None),
                blog_posts=dict_author_info.get("instablogs", None),
                comments=dict_author_info.get("comments", None),
                stock_talks=dict_author_info.get("stocktalks", None),
                likes=dict_author_info.get("likes", None),
                followers=dict_author_info.get("followers", None),
                following=dict_author_info.get("following", None),
                mtime=datetime.now(),
                batch_action_uuid=batch_action_uuid)
            if save_doc:
                upsert_document(author, True)
            rlt_authors.append(author)
    # endregion
    return rlt_authors, rlt_articles
def column_articles(rlt_path: str, batch_action_uuid: str, action_uuid: str, save_doc: bool = True) -> List[Article]:
    """Parse SeekingAlpha column-page article listings into Article documents.

    Reads every "*_articles.csv" under *rlt_path*; the column keyword reported
    by the data processor becomes each article's ``channel_in_site``.

    Args:
        rlt_path: directory that holds the crawler result files.
        batch_action_uuid: uuid of the batch crawl action, stamped on every doc.
        action_uuid: uuid of this single action (unused here; kept for a uniform signature).
        save_doc: when True, created documents are upserted into the database.

    Returns:
        The list of ``Article`` objects built from the csv rows.
    """
    collected: List[Article] = list()
    # region articles
    for csv_path in glob.glob(os.path.join(rlt_path, "*_articles.csv")):
        logger.info(f"proecess file : {csv_path} ")
        frame = pd.read_csv(csv_path, header=0, parse_dates=["extract_t"])
        kw, parsed = SeekingAlphaDataProcess.proc_article_data(frame)
        for _, series_row in parsed.iterrows():
            record = series_row.to_dict()
            fields = dict()  # keyword arguments for the Article constructor
            if not pd.isna(record.get("title", np.NaN)):
                fields["title"] = record["title"]
            if not pd.isna(record.get("author_url", np.NaN)):
                author_link: str = record["author_url"]
                # The trailing url segment doubles as the author id.
                fields["seeking_alpha_author"] = AuthorInSeekingAlpha(
                    author_id=author_link.split("/")[-1], url=author_link)
            if not pd.isna(record.get("article_url", np.NaN)):
                # Canonical url = origin + path, with query params stripped.
                parsed_url = furl(record["article_url"])
                clean_url = f"{parsed_url.origin}{parsed_url.path}"
                fields["full_text_url"] = clean_url
                fields["uuid"] = md5_str(clean_url)
            if record.get("publish_time", pd.NaT) is not pd.NaT:
                fields["publish_time"] = record["publish_time"].to_pydatetime()
            fields["engine_site"] = "SeekingAlpha"
            instruments: List[FinancialInstrumentSymbol] = list()
            for sym_col, name_col in (
                    ("related_symbol1", "related_symbol1_fullname"),
                    ("related_symbol2", "related_symbol2_fullname"),
                    ("related_symbol3", "related_symbol3_fullname")):
                if pd.isna(record.get(sym_col, np.NaN)) or pd.isna(
                        record.get(name_col, np.NaN)):
                    continue
                instrument = FinancialInstrumentSymbol(
                    symbol=record[sym_col],
                    full_name=record[name_col],
                    batch_action_uuid=batch_action_uuid)
                instruments.append(instrument)
                # ListField(ReferenceField(FinancialInstrumentSymbol)) does not
                # seem to cascade-save, so persist each referenced symbol here.
                if save_doc:
                    upsert_document(instrument, True)
            if instruments:
                fields["related_symbols"] = instruments
            if not pd.isna(record.get("comments", np.NaN)):
                fields["seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                    comments=record["comments"])
            fields["channel_in_site"] = kw
            fields["batch_action_uuid"] = batch_action_uuid
            doc = Article(**fields)
            collected.append(doc)
            if save_doc:
                upsert_document(doc, True)
    # endregion
    return collected