Example #1
    def kw_search(rlt_path: str,
                  batch_action_uuid: str,
                  action_uuid: str,
                  save_doc: bool = True) -> List[Article]:
        rlt_articles: List[Article] = list()

        rlt_files = glob.glob(os.path.join(rlt_path, "*_kw_search_result.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                kw, df_rlt = SeekingAlphaDataProcess.proc_kw_search(
                    df_from_csv)
                # print(df_rlt)
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("title", np.NaN)):
                        dict_article["title"] = dict_row["title"]
                    if not pd.isna(dict_row.get("news_url", np.NaN)):
                        article_url = furl(dict_row["news_url"])
                        abs_url_str = f"{article_url.origin}{article_url.path}"
                        # Only strip query parameters from links internal to Seeking Alpha; for other sites it is unclear whether the URL parameters are part of what makes the article URL unique
                        if abs_url_str.find("seekingalpha") > 0:
                            dict_article["full_text_url"] = abs_url_str
                            dict_article["uuid"] = md5_str(abs_url_str)
                        else:
                            dict_article["full_text_url"] = article_url.url
                            dict_article["uuid"] = md5_str(article_url.url)
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "publish_time"].to_pydatetime()
                    if not pd.isna(dict_row.get("symbols", np.NaN)):
                        symbols = [
                            x.strip()
                            for x in dict_row.get("symbols").split(",")
                        ]
                        ls_related_symbols: List[FinancialInstrumentSymbol] = [
                            FinancialInstrumentSymbol(symbol=x)
                            for x in symbols
                        ]
                        if save_doc:
                            # map() alone is lazy and never executes; iterate so the symbols are actually saved
                            for related_symbol in ls_related_symbols:
                                upsert_document(related_symbol)
                        dict_article["related_symbols"] = ls_related_symbols
                    dict_article["from_searching_phase"] = SearchingPhrase(
                        searching_phrase=kw)
                    dict_article["engine_site"] = "SeekingAlpha"
                    dict_article["channel_in_site"] = "Search"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)

        return rlt_articles
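
A minimal usage sketch for the function above (not part of the source): the result directory and the UUIDs are made-up placeholders, kw_search is assumed to be importable as shown, and save_doc=False keeps the run from writing to the database.

    from uuid import uuid4

    articles = kw_search(
        rlt_path="/tmp/seeking_alpha_results",  # hypothetical crawler output dir
        batch_action_uuid=str(uuid4()),         # placeholder batch id
        action_uuid=str(uuid4()),               # placeholder action id
        save_doc=False,                         # dry run: build Article objects only
    )
    for article in articles:
        print(article.uuid, article.title)
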
Example #2
    def kw_news_search(rlt_path: str, batch_action_uuid: str, action_uuid: str, save_doc: bool = True) -> List[Article]:
        rlt_articles: List[Article] = list()

        rlt_files = glob.glob(os.path.join(rlt_path, "*_index_items.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                try:
                    df_from_csv = pd.read_csv(f, header=0, parse_dates=["publish_time"])
                except Exception:
                    continue
                kw, df_rlt = GoogleNewsSearchProcess.proc_news_search_data(df_from_csv)
                if df_rlt is None:
                    continue
                logger.info(f"proecess file : {f} - {df_rlt.shape}")
                # print(df_rlt)
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("news_title", np.NaN)):
                        dict_article["title"] = dict_row["news_title"]
                    if not pd.isna(dict_row.get("url", np.NaN)):
                        dict_article["full_text_url"] = dict_row["url"]
                    if not pd.isna(dict_row.get("news_abstract", np.NaN)):
                        dict_article["abstract"] = dict_row["news_abstract"]
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row["publish_time"].to_pydatetime()

                    search_phrase_in_db = SearchingPhrase.objects(searching_phrase=kw).first()
                    if search_phrase_in_db is not None:
                        dict_article["from_searching_phase"] = search_phrase_in_db
                        if search_phrase_in_db.related_symbols is not None:
                            dict_article["related_symbols"] = search_phrase_in_db.related_symbols
                    else:
                        dict_article["from_searching_phase"] = SearchingPhrase(searching_phrase=kw)

                    dict_article["engine_site"] = "google_news"
                    if not pd.isna(dict_row.get("publisher", np.NaN)):
                        dict_article["channel_in_site"] = dict_row["publisher"]
                        dict_article["uuid"] = md5_str(f"{dict_article['channel_in_site']}-{dict_article['title']}")
                    else:  # for now, news items without a publisher are not stored
                        continue
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)

        return rlt_articles
Example #3
    def process_action_result(rlt_path: str,
                              batch_action_uuid: str,
                              action_uuid: str,
                              save_doc: bool = True):
        dict_extract_info: Dict[str, str] = {}

        extract_info_files = glob.glob(
            os.path.join(rlt_path, "*_extract_info.js"))
        if extract_info_files:
            with open(extract_info_files[0], "r") as f:
                dict_extract_info = json.load(f)

        article_files = glob.glob(os.path.join(rlt_path, "*_articles.csv"))
        ls_all_article_uuid: List[str] = list()
        """保存所有的 uuid ,用于到 Mongo 中查询哪些是已经 Insert 的"""
        ls_all_article_doc_dict: List[Dict] = list()
        """先保存到一个 list , 然后统一 upsert document """

        if article_files:
            df_articles = pd.read_csv(article_files[0], header=0)
            for idx, row in df_articles.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                # "title" = > Article.title , 一般不做特殊处理,有则填入
                if not pd.isna(dict_row.get("title", np.NaN)):
                    dict_article["title"] = dict_row["title"]
                # "abstract" = > Article.abstract, 摘要,一般不做特殊处理
                if not pd.isna(dict_row.get("abstract", np.NaN)):
                    dict_article["abstract"] = dict_row["abstract"]
                # "channel_in_site" = > Article.channel_in_site, 二级分类,站点内的
                if not pd.isna(dict_row.get("channel_in_site", np.NaN)):
                    dict_article["channel_in_site"] = dict_row[
                        "channel_in_site"]
                # "full_text_url" = > Article.full_text_url , 正文的链接,需要做地址转换
                if not pd.isna(dict_row.get("full_text_url", np.NaN)):
                    full_text_url = dict_row["full_text_url"]
                    # try to prepend the site origin
                    page_url = dict_extract_info.get("real_url", "")
                    if page_url:
                        full_text_url = append_site_to_url(
                            full_text_url,
                            furl(page_url).origin)
                    dict_article["full_text_url"] = full_text_url
                # "publish_time" = > Article.publish_time, 可能需要有数据预处理
                if not pd.isna(dict_row.get("publish_time", np.NaN)):
                    s_time = dict_row["publish_time"]
                    s_time_preprocess = dict_extract_info.get(
                        "publish_time_preprocess", "")
                    if s_time_preprocess:
                        assert s_time_preprocess in TIME_STR_PREPROCESS
                        s_time = TIME_STR_PREPROCESS[s_time_preprocess](s_time)
                    dt_time = _parse_date(s_time)
                    if dt_time:
                        dict_article["publish_time"] = dt_time
                if "article_site" in dict_extract_info:
                    dict_article["engine_site"] = dict_extract_info[
                        "article_site"]
                # generate the uuid
                if "full_text_url" in dict_article:  # prefer the url as the uuid source
                    dict_article["uuid"] = md5_str(
                        dict_article["full_text_url"])
                elif "title" in dict_article and "engine_site" in dict_article:  # 标题+site作为 uuid
                    # note: publish_time 不能计算 MD5 , 因为一些比较模糊的字符串,比如 5h 在不同的时间解析,会得到不一致的内容
                    dict_article["uuid"] = md5_str(
                        f"{dict_article['engine_site']}-{dict_article['title']}"
                    )
                else:
                    continue
                dict_article["action_uuid"] = action_uuid
                dict_article["batch_action_uuid"] = batch_action_uuid
                ls_all_article_uuid.append(dict_article["uuid"])
                ls_all_article_doc_dict.append(dict_article)

        # Query the database for entries that already exist, so results can be flagged as new / existed
        all_exist_docs = Article.objects(
            uuid__in=ls_all_article_uuid).only("uuid")
        set_existed_uuid = set([doc.uuid for doc in all_exist_docs])

        # update the db
        related_articles: List[Article] = list()
        new_found_articles: List[Article] = list()
        for article_doc_dict in ls_all_article_doc_dict:
            related_articles.append(Article(uuid=article_doc_dict["uuid"]))
            if article_doc_dict["uuid"] in set_existed_uuid:
                logger.info(
                    f"Doc '{article_doc_dict['uuid']}' already exists.")
                continue
            new_found_articles.append(Article(uuid=article_doc_dict["uuid"]))
            if save_doc:
                upsert_document(Article(**article_doc_dict), True)
        logger.info(
            f"Action '{action_uuid}' found {len(new_found_articles)} new articles"
        )

        # store the pdf and img page snapshots
        pdf_screen_files = glob.glob(
            os.path.join(rlt_path, "*_webpage_content.pdf"))
        orig_pdf_bin_uuid: str = None
        if pdf_screen_files:
            with open(pdf_screen_files[0], "rb") as f:
                bin_pdf = f.read()
            bin_pdf_gz = gzip.compress(bin_pdf)
            orig_pdf_bin_uuid = md5_binary(bin_pdf)
            if save_doc:
                pdf_attach = BinaryAttachment(uuid=orig_pdf_bin_uuid,
                                              bin_data=bin_pdf_gz,
                                              file_ext="pdf",
                                              is_gzip=True,
                                              bin_length=len(bin_pdf_gz),
                                              ctime=datetime.now(),
                                              action_uuid=action_uuid)
                upsert_document(pdf_attach, False)

        png_screenshot_files = glob.glob(
            os.path.join(rlt_path, "*_browser_sceenshot.png"))
        orig_png_bin_uuid: str = None
        if png_screenshot_files:
            with open(png_screenshot_files[0], "rb") as f:
                bin_png = f.read()
            bin_png_gz = gzip.compress(bin_png)
            orig_png_bin_uuid = md5_binary(bin_png)
            if save_doc:
                png_attach = BinaryAttachment(uuid=orig_png_bin_uuid,
                                              bin_data=bin_png_gz,
                                              file_ext="png",
                                              is_gzip=True,
                                              bin_length=len(bin_png_gz),
                                              ctime=datetime.now(),
                                              action_uuid=action_uuid)
                upsert_document(png_attach, False)

        # store the GeneralBrowserActionInstance
        if save_doc:
            general_browser_action_doc = GeneralBrowserActionInstance(
                uuid=action_uuid,
                related_articles=related_articles,
                new_found_articles=new_found_articles,
                pdf_page_snapshot=BinaryAttachment(uuid=orig_pdf_bin_uuid),
                img_page_snapshot=BinaryAttachment(uuid=orig_png_bin_uuid))
            upsert_document(general_browser_action_doc, False)
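
Every example keys its documents on md5_str, and process_action_result also uses md5_binary for the snapshot attachments; neither helper is shown. A plausible sketch, assuming they are thin wrappers around hashlib, would be:

    import hashlib

    def md5_str(s: str) -> str:
        # assumed behaviour: hex digest of the UTF-8 encoded string
        return hashlib.md5(s.encode("utf-8")).hexdigest()

    def md5_binary(data: bytes) -> str:
        # assumed behaviour: hex digest of the raw bytes (PDF / PNG snapshots)
        return hashlib.md5(data).hexdigest()
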
Example #4
    def kw_search(
            rlt_path: str,
            batch_action_uuid: str,
            action_uuid: str,
            save_doc: bool = True
    ) -> Tuple[List[Article], List[UserInTwitter]]:
        rlt_articles: List[Article] = list()
        rlt_posters: List[UserInTwitter] = list()

        rlt_files = glob.glob(os.path.join(rlt_path, "*_posts.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(
                    f, header=0, parse_dates=["post_time", "extract_t"])
                kw, df_rlt = TwitterDataProcess.proc_posts(df_from_csv)
                # print(kw)
                # print(df_rlt)

                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("post_content", np.NaN)):
                        dict_article["title"] = dict_row["post_content"]
                    if not pd.isna(dict_row.get("post_content_detail",
                                                np.NaN)):
                        dict_article["abstract"] = dict_row[
                            "post_content_detail"]
                    if not pd.isna(dict_row.get("post_additional_url",
                                                np.NaN)):
                        dict_article["full_text_url"] = dict_row[
                            "post_additional_url"]
                    if not pd.isna(dict_row.get("post_image_url", np.NaN)):
                        dict_article["related_image_url"] = dict_row[
                            "post_image_url"]
                    if dict_row.get("post_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "post_time"].to_pydatetime()
                    if not pd.isna(dict_row.get("poster_name", np.NaN)):
                        poster = UserInTwitter(user_id=dict_row["poster_id"],
                                               name=dict_row["poster_name"])
                    else:
                        poster = UserInTwitter(user_id=dict_row["poster_id"])
                    dict_article["twitter_poster"] = poster
                    # the uuid is computed from poster_id + post_time (post_time is assumed to always be present here)
                    dict_article["uuid"] = md5_str(
                        f"{poster.user_id}|{dict_article['publish_time'].isoformat()}"
                    )
                    if (not pd.isna(dict_row.get("comments", np.NaN))
                            or not pd.isna(dict_row.get("retweet", np.NaN))
                            or not pd.isna(dict_row.get("like", np.NaN))):
                        extra_data = TweetExtra()
                        if not pd.isna(dict_row.get("comments", np.NaN)):
                            extra_data.comments = int(dict_row["comments"])
                        if not pd.isna(dict_row.get("retweet", np.NaN)):
                            extra_data.retweet = int(dict_row["retweet"])
                        if not pd.isna(dict_row.get("like", np.NaN)):
                            extra_data.like = int(dict_row["like"])
                        dict_article["tweet_extra"] = extra_data

                    search_phrase_in_db = SearchingPhrase.objects(
                        searching_phrase=kw).first()
                    if search_phrase_in_db is not None:
                        dict_article[
                            "from_searching_phase"] = search_phrase_in_db
                        if search_phrase_in_db.related_symbols is not None:
                            dict_article[
                                "related_symbols"] = search_phrase_in_db.related_symbols
                    else:
                        dict_article["from_searching_phase"] = SearchingPhrase(
                            searching_phrase=kw)
                    dict_article["engine_site"] = "Twitter"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)

        rlt_files = glob.glob(
            os.path.join(rlt_path, "*_follower_following.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                t, kw, df_rlt = TwitterDataProcess.proc_follower_following_info(
                    df_from_csv, "search_phase")
                # print(kw)
                # print(df_rlt)
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    twitter_user = UserInTwitter(user_id=dict_row["poster_id"])
                    if not pd.isna(dict_row.get("following", np.NaN)):
                        twitter_user.following = int(dict_row["following"])
                    if not pd.isna(dict_row.get("follower", np.NaN)):
                        twitter_user.follower = int(dict_row["follower"])
                    twitter_user.mtime = t
                    twitter_user.batch_action_uuid = batch_action_uuid
                    rlt_posters.append(twitter_user)
                    if save_doc:
                        upsert_document(twitter_user, True)
        return rlt_articles, rlt_posters
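
The row-mapping loops above all repeat the same guard, copying a CSV column only when it is present: if not pd.isna(dict_row.get(key, np.NaN)). A small helper along these lines (a refactoring sketch, not code from the source) would make each mapping a one-liner:

    from typing import Optional

    import numpy as np
    import pandas as pd

    def copy_if_present(dict_row: dict, dict_article: dict,
                        src_key: str, dst_key: Optional[str] = None) -> None:
        # Copy a CSV value into the article dict only when it is not NaN / NaT,
        # mirroring the repeated pd.isna guards in the examples above.
        value = dict_row.get(src_key, np.NaN)
        if not pd.isna(value):
            dict_article[dst_key or src_key] = value

    # e.g. copy_if_present(dict_row, dict_article, "post_content", "title")
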
Example #5
    def symbol_analysis_report(rlt_path: str,
                               batch_action_uuid: str,
                               action_uuid: str,
                               save_doc: bool = True) -> List[Article]:

        rlt_articles: List[Article] = list()

        rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_analysis.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f, header=0)
                df_rlt = EastMoneyDataProcess.proc_stock_analysis(df_from_csv)
                if df_rlt is not None:
                    for idx, row in df_rlt.iterrows():
                        dict_row = row.to_dict()
                        dict_article = dict()
                        if not pd.isna(dict_row.get("article", np.NaN)):
                            dict_article["title"] = dict_row["article"]
                        if not pd.isna(dict_row.get("article_url", np.NaN)):
                            dict_article["full_text_url"] = dict_row[
                                "article_url"]
                            dict_article["uuid"] = md5_str(
                                dict_row["article_url"])

                        if not pd.isna(dict_row.get("org_url", np.NaN)):
                            # For now the institution name is stored directly in the Seeking Alpha author data, which makes plotting easier
                            author_url: str = dict_row["org_url"]
                            author_id = dict_row["org"]
                            # NO author_name extracted!!
                            # author_name = None
                            # if not pd.isna(dict_row.get("author", np.NaN)):
                            #     author_name = dict_row["author"]
                            author = AuthorInSeekingAlpha(author_id=author_id,
                                                          url=author_url)
                            if not pd.isna(
                                    dict_row.get("reports_within_one_month",
                                                 np.NaN)):
                                author.articles = dict_row[
                                    "reports_within_one_month"]

                            dict_article["seeking_alpha_author"] = author
                            if save_doc:
                                upsert_document(author, True)
                        if not pd.isna(dict_row.get("rating", np.NaN)):
                            dict_article["rating"] = dict_row["rating"]
                        if not pd.isna(dict_row.get("rating_chg", np.NaN)):
                            dict_article["rating_change"] = dict_row[
                                "rating_chg"]
                        if not pd.isna(dict_row.get("pred_2020_ret", np.NaN)):
                            dict_article["pred_ret_this_yr"] = dict_row[
                                "pred_2020_ret"]
                        if not pd.isna(dict_row.get("pred_2020_pe", np.NaN)):
                            dict_article["pred_pe_this_yr"] = dict_row[
                                "pred_2020_pe"]
                        if not pd.isna(dict_row.get("pred_2021_pe", np.NaN)):
                            dict_article["pred_pe_next_yr"] = dict_row[
                                "pred_2021_pe"]
                        if not pd.isna(dict_row.get("pred_2021_ret", np.NaN)):
                            dict_article["pred_ret_next_yr"] = dict_row[
                                "pred_2021_ret"]
                        if dict_row.get("report_date", pd.NaT) is not pd.NaT:
                            dict_article["publish_time"] = dict_row[
                                "report_date"].to_pydatetime()

                        symbol = dict_row["symbol"]
                        ls_related_symbols: List[FinancialInstrumentSymbol] = [
                            FinancialInstrumentSymbol(symbol=symbol)
                        ]
                        if save_doc:
                            ls_related_symbols[0].full_name = dict_row[
                                "symbol_name"]
                            upsert_document(ls_related_symbols[0])
                        dict_article["related_symbols"] = ls_related_symbols
                        dict_article["engine_site"] = "EastMoney"
                        dict_article["channel_in_site"] = "StockAnalysis"
                        dict_article["batch_action_uuid"] = batch_action_uuid

                        article = Article(**dict_article)
                        rlt_articles.append(article)
                        if save_doc:
                            upsert_document(article, True)

        return rlt_articles
Example #6
    def symbol_summary(
        rlt_path: str,
        batch_action_uuid: str,
        action_uuid: str,
        save_doc: bool = True
    ) -> Tuple[List[FinancialInstrumentSymbol], List[Article]]:
        rlt_articles: List[Article] = list()
        rlt_symbols: List[FinancialInstrumentSymbol] = list()

        # region symbol analysis
        rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_analysis.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                symbol, df_rlt = SeekingAlphaDataProcess.proc_symbol_analysis(
                    df_from_csv)
                if df_rlt is not None:
                    for idx, row in df_rlt.iterrows():
                        dict_row = row.to_dict()
                        dict_article = dict()
                        if not pd.isna(dict_row.get("title", np.NaN)):
                            dict_article["title"] = dict_row["title"]
                        if not pd.isna(dict_row.get("article_url", np.NaN)):
                            article_url = furl(dict_row["article_url"])
                            abs_url_str = f"{article_url.origin}{article_url.path}"
                            dict_article["full_text_url"] = abs_url_str
                            dict_article["uuid"] = md5_str(abs_url_str)
                        if not pd.isna(dict_row.get("author_url", np.NaN)):
                            author_url: str = dict_row["author_url"]
                            author_id = dict_row["author_id"]
                            # NO author_name extracted!!
                            # author_name = None
                            # if not pd.isna(dict_row.get("author", np.NaN)):
                            #     author_name = dict_row["author"]
                            author = AuthorInSeekingAlpha(author_id=author_id,
                                                          url=author_url)
                            dict_article["seeking_alpha_author"] = author
                            if save_doc:
                                upsert_document(author, True)
                        if not pd.isna(dict_row.get("rating", np.NaN)):
                            dict_article["rating"] = dict_row["rating"]

                        if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                            dict_article["publish_time"] = dict_row[
                                "publish_time"].to_pydatetime()
                        if not pd.isna(dict_row.get("comments", np.NaN)):
                            dict_article[
                                "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                    comments=dict_row["comments"])
                        ls_related_symbols: List[FinancialInstrumentSymbol] = [
                            FinancialInstrumentSymbol(symbol=symbol)
                        ]
                        if save_doc:
                            upsert_document(ls_related_symbols[0])
                        dict_article["related_symbols"] = ls_related_symbols
                        dict_article["engine_site"] = "SeekingAlpha"
                        dict_article["channel_in_site"] = "analysis"
                        dict_article["batch_action_uuid"] = batch_action_uuid

                        article = Article(**dict_article)
                        rlt_articles.append(article)
                        if save_doc:
                            upsert_document(article, True)

        # endregion

        # region symbol news

        rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_news.csv"))
        for i, f in enumerate(rlt_files):
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            symbol, df_rlt = SeekingAlphaDataProcess.proc_symbol_news(
                df_from_csv)

            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                if not pd.isna(dict_row.get("title", np.NaN)):
                    dict_article["title"] = dict_row["title"]
                if not pd.isna(dict_row.get("news_url", np.NaN)):
                    article_url = furl(dict_row["news_url"])
                    abs_url_str = f"{article_url.origin}{article_url.path}"
                    # Only strip query parameters from links internal to Seeking Alpha; for other sites it is unclear whether the URL parameters are part of what makes the article URL unique
                    if abs_url_str.find("seekingalpha") > 0:
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    else:
                        dict_article["full_text_url"] = article_url.url
                        dict_article["uuid"] = md5_str(article_url.url)

                if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                    dict_article["publish_time"] = dict_row[
                        "publish_time"].to_pydatetime()
                if not pd.isna(dict_row.get("comments", np.NaN)):
                    dict_article[
                        "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                            comments=dict_row["comments"])
                ls_related_symbols: List[FinancialInstrumentSymbol] = [
                    FinancialInstrumentSymbol(symbol=symbol)
                ]
                if save_doc:
                    upsert_document(ls_related_symbols[0])
                dict_article["related_symbols"] = ls_related_symbols
                dict_article["engine_site"] = "SeekingAlpha"
                dict_article["batch_action_uuid"] = batch_action_uuid
                if not pd.isna(dict_row.get("orig_source", np.NaN)):
                    dict_article["channel_in_site"] = dict_row["orig_source"]

                article = Article(**dict_article)
                rlt_articles.append(article)
                if save_doc:
                    upsert_document(article, True)

        # endregion

        # region symbol info
        rlt_files = glob.glob(os.path.join(rlt_path,
                                           "*_symbol_indicators.csv"))
        if rlt_files:
            for f in rlt_files:
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f, header=0)
                dict_symbol_info = SeekingAlphaDataProcess.proc_symbol_indicator(
                    df_from_csv)
                if not dict_symbol_info:
                    continue
                if "symbol" not in dict_symbol_info:
                    continue
                symbol = FinancialInstrumentSymbol(
                    symbol=dict_symbol_info.get("symbol"),
                    info_from_seeking_alpha=SymbolInfoBySeekingAlpha(
                        followers=dict_symbol_info.get("followers", None),
                        high_52wk=dict_symbol_info.get("52wk high", None),
                        low_52wk=dict_symbol_info.get("52wk low", None),
                        eps_fwd=dict_symbol_info.get("EPS (FWD)", None),
                        pe_fwd=dict_symbol_info.get("PE (FWD)", None),
                        yield_fwd=dict_symbol_info.get("Yield (FWD)", None),
                        div_rate_fwd=dict_symbol_info.get(
                            "Div Rate (FWD)", None),
                        mkt_cap=dict_symbol_info.get("Market Cap", None),
                        volume=dict_symbol_info.get("Volume", None),
                        mtime=datetime.now()))
                if save_doc:
                    upsert_document(symbol, True)
                rlt_symbols.append(symbol)
        # endregion

        return rlt_symbols, rlt_articles
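
All of the examples persist their results through upsert_document, which is not shown either. Given the mongoengine-style queries used above (Article.objects(...), SearchingPhrase.objects(...)), one guess at its shape, offered purely as an assumption about the missing helper, is:

    def upsert_document(doc, update_if_exists: bool = False):
        # Assumed semantics: insert the document when its primary key is new;
        # if it already exists, overwrite the stored copy only when the caller
        # passes True as the second argument.
        existing = type(doc).objects(pk=doc.pk).first()
        if existing is None or update_if_exists:
            doc.save()
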
Example #7
    def author_detail(
        rlt_path: str,
        batch_action_uuid: str,
        action_uuid: str,
        save_doc: bool = True
    ) -> Tuple[List[AuthorInSeekingAlpha], List[Article]]:
        rlt_articles: List[Article] = list()
        rlt_authors: List[AuthorInSeekingAlpha] = list()

        # region author articles
        rlt_files = glob.glob(os.path.join(rlt_path, "*_author_articles.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                author_id, df_rlt = SeekingAlphaDataProcess.proc_author_articles(
                    df_from_csv)
                if df_rlt is not None:
                    for idx, row in df_rlt.iterrows():
                        dict_row = row.to_dict()
                        dict_article = dict()
                        if not pd.isna(dict_row.get("title", np.NaN)):
                            dict_article["title"] = dict_row["title"]
                        if not pd.isna(dict_row.get("news_url", np.NaN)):
                            article_url = furl(dict_row["news_url"])
                            abs_url_str = f"{article_url.origin}{article_url.path}"
                            dict_article["full_text_url"] = abs_url_str
                            dict_article["uuid"] = md5_str(abs_url_str)
                        if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                            dict_article["publish_time"] = dict_row[
                                "publish_time"].to_pydatetime()
                        if not pd.isna(dict_row.get("comments", np.NaN)):
                            dict_article[
                                "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                    comments=dict_row["comments"])
                        if not pd.isna(dict_row.get("symbols", np.NaN)):
                            ls_related_symbols: List[
                                FinancialInstrumentSymbol] = [
                                    FinancialInstrumentSymbol(symbol=s.strip())
                                    for s in dict_row["symbols"].split(",")
                                ]
                            if ls_related_symbols:
                                for symbol in ls_related_symbols:
                                    if save_doc:
                                        upsert_document(symbol, True)
                                dict_article[
                                    "related_symbols"] = ls_related_symbols
                        dict_article["engine_site"] = "SeekingAlpha"
                        dict_article["batch_action_uuid"] = batch_action_uuid
                        article = Article(**dict_article)
                        rlt_articles.append(article)
                        if save_doc:
                            upsert_document(article, True)

        # endregion

        # region author info
        rlt_files = glob.glob(os.path.join(rlt_path, "*_author_info.csv"))
        if rlt_files:
            for f in rlt_files:
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f, header=0)
                dict_author_info = SeekingAlphaDataProcess.proc_author_info(
                    df_from_csv)
                if not dict_author_info:
                    continue
                if "author" not in dict_author_info:
                    continue
                author = AuthorInSeekingAlpha(
                    author_id=dict_author_info.get("author"),
                    intro=dict_author_info.get("author_intro", ""),
                    articles=dict_author_info.get("articles", None),
                    picks=dict_author_info.get("authors_picks", None),
                    blog_posts=dict_author_info.get("instablogs", None),
                    comments=dict_author_info.get("comments", None),
                    stock_talks=dict_author_info.get("stocktalks", None),
                    likes=dict_author_info.get("likes", None),
                    followers=dict_author_info.get("followers", None),
                    following=dict_author_info.get("following", None),
                    mtime=datetime.now(),
                    batch_action_uuid=batch_action_uuid)
                if save_doc:
                    upsert_document(author, True)
                rlt_authors.append(author)
        # endregion

        return rlt_authors, rlt_articles
Example #8
    def column_articles(rlt_path: str,
                        batch_action_uuid: str,
                        action_uuid: str,
                        save_doc: bool = True) -> List[Article]:
        rlt_articles = list()

        # region articles
        rlt_files = glob.glob(os.path.join(rlt_path, "*_articles.csv"))
        if rlt_files:
            for f in rlt_files:
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                kw, df_rlt = SeekingAlphaDataProcess.proc_article_data(
                    df_from_csv)
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()  # for Article
                    if not pd.isna(dict_row.get("title", np.NaN)):
                        dict_article["title"] = dict_row["title"]
                    if not pd.isna(dict_row.get("author_url", np.NaN)):
                        author_url: str = dict_row["author_url"]
                        author_id = author_url.split("/")[-1]
                        dict_article[
                            "seeking_alpha_author"] = AuthorInSeekingAlpha(
                                author_id=author_id, url=author_url)
                    if not pd.isna(dict_row.get("article_url", np.NaN)):
                        article_url = furl(dict_row["article_url"])
                        abs_url_str = f"{article_url.origin}{article_url.path}"
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "publish_time"].to_pydatetime()
                    dict_article["engine_site"] = "SeekingAlpha"
                    ls_related_symbols: List[FinancialInstrumentSymbol] = []
                    for symbol_key_pair in [
                        ("related_symbol1", "related_symbol1_fullname"),
                        ("related_symbol2", "related_symbol2_fullname"),
                        ("related_symbol3", "related_symbol3_fullname")
                    ]:
                        if not pd.isna(dict_row.get(
                                symbol_key_pair[0], np.NaN)) and not pd.isna(
                                    dict_row.get(symbol_key_pair[1], np.NaN)):
                            fin_instrument_symbol = FinancialInstrumentSymbol(
                                symbol=dict_row[symbol_key_pair[0]],
                                full_name=dict_row[symbol_key_pair[1]],
                                batch_action_uuid=batch_action_uuid)
                            ls_related_symbols.append(fin_instrument_symbol)
                            # ListField(ReferenceField(FinancialInstrumentSymbol)) does not seem to cascade on save, so persist it here as soon as it is created
                            if save_doc:
                                upsert_document(fin_instrument_symbol, True)
                    if ls_related_symbols:
                        dict_article["related_symbols"] = ls_related_symbols

                    if not pd.isna(dict_row.get("comments", np.NaN)):
                        dict_article[
                            "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                comments=dict_row["comments"])

                    dict_article["channel_in_site"] = kw
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)
        # endregion

        return rlt_articles