Example #1
    def kw_search(rlt_path: str,
                  batch_action_uuid: str,
                  action_uuid: str,
                  save_doc: bool = True) -> List[Article]:
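        """Parse the "*_kw_search_result.csv" files under rlt_path into Article
        documents and, when save_doc is True, upsert them together with the
        related FinancialInstrumentSymbol documents."""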
        rlt_articles: List[Article] = list()

        rlt_files = glob.glob(os.path.join(rlt_path, "*_kw_search_result.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                kw, df_rlt = SeekingAlphaDataProcess.proc_kw_search(
                    df_from_csv)
                # print(df_rlt)
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("title", np.NaN)):
                        dict_article["title"] = dict_row["title"]
                    if not pd.isna(dict_row.get("news_url", np.NaN)):
                        article_url = furl(dict_row["news_url"])
                        abs_url_str = f"{article_url.origin}{article_url.path}"
                        # Only strip the query parameters for links inside Seeking Alpha; for other sites it is unclear whether the URL parameters are part of what makes the URL unique
                        if abs_url_str.find("seekingalpha") > 0:
                            dict_article["full_text_url"] = abs_url_str
                            dict_article["uuid"] = md5_str(abs_url_str)
                        else:
                            dict_article["full_text_url"] = article_url.url
                            dict_article["uuid"] = md5_str(article_url.url)
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "publish_time"].to_pydatetime()
                    if not pd.isna(dict_row.get("symbols", np.NaN)):
                        symbols = [
                            x.strip()
                            for x in dict_row.get("symbols").split(",")
                        ]
                        ls_related_symbols: List[FinancialInstrumentSymbol] = [
                            FinancialInstrumentSymbol(symbol=x)
                            for x in symbols
                        ]
                        if save_doc:
                            # map() is lazy in Python 3, so iterate explicitly to run the upserts
                            for related_symbol in ls_related_symbols:
                                upsert_document(related_symbol)
                        dict_article["related_symbols"] = ls_related_symbols
                    dict_article["from_searching_phase"] = SearchingPhrase(
                        searching_phrase=kw)
                    dict_article["engine_site"] = "SeekingAlpha"
                    dict_article["channel_in_site"] = "Search"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)

        return rlt_articles
Example #2
    def __init__(self, experiment_name: str, template_workflow_yml_file: str,
                 trial_params: Mapping, trial_uuid: str, experiment_uuid: str):
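        """Load the workflow template, apply trial_params to the step config,
        compute the config hash, and set up the metrics reporter and the
        trial-finished future."""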
        super().__init__()
        self.workflow_cfg, self.workflow_context = load_mapping_from_file(
            template_workflow_yml_file)
        self.trial_params = trial_params
        for k, v in trial_params.items():
            changed_items = upsert_step_cfg_para(self.workflow_cfg,
                                                 self.workflow_context, k, v)
            if changed_items == 0:
                logger.error(
                    f"'{k}' is not available in the config settings; check the search space file"
                )
        self.experiment_name = experiment_name
        self.trial_uuid = trial_uuid
        self.experiment_uuid = experiment_uuid

        self.is_trial_finished = False
        self.cfg_hash = md5_str(
            HashCalculation.value_to_hash_str(self.workflow_cfg))

        self._trial_finished_future = asyncio.get_event_loop().create_future()

        # ---- [laigen 2020.02.29] Colab training no longer goes through Kafka, so this is removed for now. ----
        # colab_side_env = TrialColabSideEnv(self.cfg_hash, self.trial_uuid)
        # self._colab_side_env_pk = colab_side_env.pk
        # ---- end ----
        self.metrics_reporter = TrailMetricsArcticReporter(
            self.experiment_name, self.experiment_uuid, self.trial_uuid)
        self.latest_epoch = None
        self.final_val = None
Example #3
def upsert_yahoo_recommend(symbol: str):
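    """Fetch the Yahoo Finance analyst recommendations for symbol and upsert one
    FinancialInstrumentRecommendInYahoo document (plus the rating firm entity)
    per recommendation row."""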
    yahoo_ts = yFinanceData()
    try:
        df = yahoo_ts.recommendations(symbol_to_yahoo_symbol(symbol))
        logger.info(f"upsert_yahoo_recommend : {symbol}-{df.shape}")
        for t, row in df.iterrows():
            firm = GlobalEntity(entity_id=row["Firm"])
            dict_info = {
                "t": t,
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "firm": firm,
                "uid": md5_str(f"{t.isoformat()}-{symbol}-{row['Firm']}"),
                "to_grade": None if row["To Grade"] == "" else row["To Grade"],
                "from_grade": None if row["From Grade"] == "" else row["From Grade"],
                "action": None if row["Action"] == "" else row["Action"]
            }
            upsert_document(firm)
            upsert_document(FinancialInstrumentRecommendInYahoo(**dict_info),
                            False)
    except Exception as ex:
        logger.error(ex)
        return
Example #4
def upsert_splits(symbol: str, start_t: datetime):
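    """Fetch the Yahoo Finance split history for symbol since start_t and upsert
    one document per split row."""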
    yahoo_ts = yFinanceData()
    try:
        df = yahoo_ts.splits(symbol_to_yahoo_symbol(symbol), start=start_t)
        logger.info(f"upsert_splits : {symbol}-{df.shape}")
        for t, row in df.iterrows():
            dict_info = {
                "t": t,
                "split": row["Stock Splits"],
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "uid": md5_str(f"{t.isoformat()}-{symbol}")
            }
            upsert_document(FinancialInstrumentDividends(**dict_info))
    except Exception:
        return
Example #5
    def kw_news_search(rlt_path: str, batch_action_uuid: str, action_uuid: str, save_doc: bool = True) -> List[Article]:
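        """Parse the "*_index_items.csv" Google News search results under rlt_path
        into Article documents, optionally upserting them."""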
        rlt_articles: List[Article] = list()

        rlt_files = glob.glob(os.path.join(rlt_path, "*_index_items.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                try:
                    df_from_csv = pd.read_csv(f, header=0, parse_dates=["publish_time"])
                except Exception:
                    continue
                kw, df_rlt = GoogleNewsSearchProcess.proc_news_search_data(df_from_csv)
                if df_rlt is None:
                    continue
                logger.info(f"proecess file : {f} - {df_rlt.shape}")
                # print(df_rlt)
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("news_title", np.NaN)):
                        dict_article["title"] = dict_row["news_title"]
                    if not pd.isna(dict_row.get("url", np.NaN)):
                        dict_article["full_text_url"] = dict_row["url"]
                    if not pd.isna(dict_row.get("news_abstract", np.NaN)):
                        dict_article["abstract"] = dict_row["news_abstract"]
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row["publish_time"].to_pydatetime()

                    search_phrase_in_db = SearchingPhrase.objects(searching_phrase=kw).first()
                    if search_phrase_in_db is not None:
                        dict_article["from_searching_phase"] = search_phrase_in_db
                        if search_phrase_in_db.related_symbols is not None:
                            dict_article["related_symbols"] = search_phrase_in_db.related_symbols
                    else:
                        dict_article["from_searching_phase"] = SearchingPhrase(searching_phrase=kw)

                    dict_article["engine_site"] = "google_news"
                    if not pd.isna(dict_row.get("publisher", np.NaN)):
                        dict_article["channel_in_site"] = dict_row["publisher"]
                        dict_article["uuid"] = md5_str(f"{dict_article['channel_in_site']}-{dict_article['title']}")
                    else:  # for now, news without a publisher is not stored
                        continue
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)

        return rlt_articles
Example #6
    def __init__(self, model_name: str, model_inst_gid: str,
                 template_workflow_yml_file: str, changed_params: Mapping,
                 pred_name: str):
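        """Load the workflow template, apply changed_params to the step config,
        and compute the resulting config hash."""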
        super().__init__()
        self._model_name = model_name
        self._model_inst_gid = model_inst_gid
        self._pred_name = pred_name
        self.workflow_cfg, self.workflow_context = load_mapping_from_file(
            template_workflow_yml_file)

        for k, v in changed_params.items():
            changed_items = upsert_step_cfg_para(self.workflow_cfg,
                                                 self.workflow_context, k, v)
            if changed_items == 0:
                logger.error(
                    f"'{k}' is not available in the config settings; check the changed_params file"
                )
        self.cfg_hash = md5_str(
            HashCalculation.value_to_hash_str(self.workflow_cfg))
Example #7
def upsert_yahoo_symbol_holder(symbol: str):
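    """Fetch the institutional holders of symbol from Yahoo Finance and upsert one
    FinancialInstrumentHolders document per holder row."""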
    import yfinance as yf
    symbol_data = yf.Ticker(symbol_to_yahoo_symbol(symbol))
    try:
        df = symbol_data.institutional_holders
        logger.info(f"upsert_yahoo_symbol_holder : {symbol} {df.shape}")
        for idx, row in df.iterrows():
            t = row["Date Reported"]
            dict_info = {
                "t": t,
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "holder": GlobalEntity(entity_id=row["Holder"]),
                "shares": row["Shares"],
                "value": row["Value"],
                "percentage_out": row["% Out"],
                "uid": md5_str(f"{t.isoformat()}-{symbol}-{row['Holder']}")
            }
            upsert_document(FinancialInstrumentHolders(**dict_info), True)
    except Exception as ex:
        logger.error(f"api exception when get data , {ex}")
Example #8
def upsert_daily_market_data(symbol: str, start_t: datetime):
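    """Fetch daily OHLCV history for symbol since start_t from Yahoo Finance and
    upsert one FinancialInstrumentDailyMarketData document per trading day."""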
    yahoo_ts = yFinanceData()
    df = yahoo_ts.history(symbol_to_yahoo_symbol(symbol), start=start_t)
    logger.info(f"upsert_daily_market_data : {symbol}-{df.shape}")
    df["fifty_two_week_high"] = df["Close"].rolling(window=244).max()
    df["fifty_two_week_low"] = df["Close"].rolling(window=244).min()

    for t, row in df.iterrows():
        dict_info = {
            "t": t,
            "open": row["Open"],
            "high": row["High"],
            "low": row["Low"],
            "close": row["Close"],
            "volume": row["Volume"],
            "dividends": row["Dividends"],
            "splits": row["Stock Splits"],
            "fifty_two_week_low": row["fifty_two_week_low"],
            "fifty_two_week_high": row["fifty_two_week_high"],
            "symbol": FinancialInstrumentSymbol(symbol=symbol),
            "uid": md5_str(f"{t.isoformat()}-{symbol}")
        }
        upsert_document(FinancialInstrumentDailyMarketData(**dict_info))
Example #9
def upsert_yahoo_earning_analysis(symbol: str):
    """see https://finance.yahoo.com/quote/MSFT/analysis?p=MSFT"""
    import yfinance as yf
    symbol_data = yf.Ticker(symbol_to_yahoo_symbol(symbol))
    try:
        df = symbol_data.calendar.T
        logger.info(f"upsert_yahoo_earning_analysis : {symbol}  {df.shape}")
        for idx, row in df.iterrows():
            t = row["Earnings Date"]
            dict_info = {
                "t": t,
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "earnings_average": _convert_non_t_v(row["Earnings Average"]),
                "earnings_low": _convert_non_t_v(row["Earnings Low"]),
                "earnings_high": _convert_non_t_v(row["Earnings High"]),
                "revenue_average": _convert_non_t_v(row["Revenue Average"]),
                "revenue_low": _convert_non_t_v(row["Revenue Low"]),
                "revenue_high": _convert_non_t_v(row["Revenue High"]),
                "uid": md5_str(f"{t.isoformat()}-{symbol}")
            }
            upsert_document(FinancialInstrumentCalendarFromYahoo(**dict_info),
                            True)
    except Exception as ex:
        logger.error(f"api exception when get data , {ex}")
Example #10
    def kw_search(
            rlt_path: str,
            batch_action_uuid: str,
            action_uuid: str,
            save_doc: bool = True
    ) -> Tuple[List[Article], List[UserInTwitter]]:
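        """Parse the "*_posts.csv" and "*_follower_following.csv" Twitter search
        results under rlt_path into Article and UserInTwitter documents."""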
        rlt_articles: List[Article] = list()
        rlt_posters: List[UserInTwitter] = list()

        rlt_files = glob.glob(os.path.join(rlt_path, "*_posts.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(
                    f, header=0, parse_dates=["post_time", "extract_t"])
                kw, df_rlt = TwitterDataProcess.proc_posts(df_from_csv)
                # print(kw)
                # print(df_rlt)

                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("post_content", np.NaN)):
                        dict_article["title"] = dict_row["post_content"]
                    if not pd.isna(dict_row.get("post_content_detail",
                                                np.NaN)):
                        dict_article["abstract"] = dict_row[
                            "post_content_detail"]
                    if not pd.isna(dict_row.get("post_additional_url",
                                                np.NaN)):
                        dict_article["full_text_url"] = dict_row[
                            "post_additional_url"]
                    if not pd.isna(dict_row.get("post_image_url", np.NaN)):
                        dict_article["related_image_url"] = dict_row[
                            "post_image_url"]
                    if dict_row.get("post_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "post_time"].to_pydatetime()
                    if not pd.isna(dict_row.get("poster_name", np.NaN)):
                        poster = UserInTwitter(user_id=dict_row["poster_id"],
                                               name=dict_row["poster_name"])
                    else:
                        poster = UserInTwitter(user_id=dict_row["poster_id"])
                    dict_article["twitter_poster"] = poster
                    # the uuid is computed from poster_id + post_time
                    dict_article["uuid"] = md5_str(
                        f"{poster.user_id}|{dict_article['publish_time'].isoformat()}"
                    )
                    if (not pd.isna(dict_row.get("comments", np.NaN))
                            or not pd.isna(dict_row.get("retweet", np.NaN))
                            or not pd.isna(dict_row.get("like", np.NaN))):
                        extra_data = TweetExtra()
                        if not pd.isna(dict_row.get("comments", np.NaN)):
                            extra_data.comments = int(dict_row["comments"])
                        if not pd.isna(dict_row.get("retweet", np.NaN)):
                            extra_data.retweet = int(dict_row["retweet"])
                        if not pd.isna(dict_row.get("like", np.NaN)):
                            extra_data.like = int(dict_row["like"])
                        dict_article["tweet_extra"] = extra_data

                    search_phrase_in_db = SearchingPhrase.objects(
                        searching_phrase=kw).first()
                    if search_phrase_in_db is not None:
                        dict_article[
                            "from_searching_phase"] = search_phrase_in_db
                        if search_phrase_in_db.related_symbols is not None:
                            dict_article[
                                "related_symbols"] = search_phrase_in_db.related_symbols
                    else:
                        dict_article["from_searching_phase"] = SearchingPhrase(
                            searching_phrase=kw)
                    dict_article["engine_site"] = "Twitter"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)

        rlt_files = glob.glob(
            os.path.join(rlt_path, "*_follower_following.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                t, kw, df_rlt = TwitterDataProcess.proc_follower_following_info(
                    df_from_csv, "search_phase")
                # print(kw)
                # print(df_rlt)
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    twitter_user = UserInTwitter(user_id=dict_row["poster_id"])
                    if not pd.isna(dict_row.get("following", np.NaN)):
                        twitter_user.following = int(dict_row["following"])
                    if not pd.isna(dict_row.get("follower", np.NaN)):
                        twitter_user.follower = int(dict_row["follower"])
                    twitter_user.mtime = t
                    twitter_user.batch_action_uuid = batch_action_uuid
                    rlt_posters.append(twitter_user)
                    if save_doc:
                        upsert_document(twitter_user, True)
        return rlt_articles, rlt_posters
Example #11
    def symbol_analysis_report(rlt_path: str,
                               batch_action_uuid: str,
                               action_uuid: str,
                               save_doc: bool = True) -> List[Article]:
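        """Parse the "*_symbol_analysis.csv" EastMoney analyst-report files under
        rlt_path into Article documents, optionally upserting them."""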

        rlt_articles: List[Article] = list()

        rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_analysis.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f, header=0)
                df_rlt = EastMoneyDataProcess.proc_stock_analysis(df_from_csv)
                if df_rlt is not None:
                    for idx, row in df_rlt.iterrows():
                        dict_row = row.to_dict()
                        dict_article = dict()
                        if not pd.isna(dict_row.get("article", np.NaN)):
                            dict_article["title"] = dict_row["article"]
                        if not pd.isna(dict_row.get("article_url", np.NaN)):
                            dict_article["full_text_url"] = dict_row[
                                "article_url"]
                            dict_article["uuid"] = md5_str(
                                dict_row["article_url"])

                        if not pd.isna(dict_row.get("org_url", np.NaN)):
                            # For now, store the institution name directly in the Seeking Alpha author data; it makes charting easier
                            author_url: str = dict_row["org_url"]
                            author_id = dict_row["org"]
                            # NO author_name extracted!!
                            # author_name = None
                            # if not pd.isna(dict_row.get("author", np.NaN)):
                            #     author_name = dict_row["author"]
                            author = AuthorInSeekingAlpha(author_id=author_id,
                                                          url=author_url)
                            if not pd.isna(
                                    dict_row.get("reports_within_one_month",
                                                 np.NaN)):
                                author.articles = dict_row[
                                    "reports_within_one_month"]

                            dict_article["seeking_alpha_author"] = author
                            if save_doc:
                                upsert_document(author, True)
                        if not pd.isna(dict_row.get("rating", np.NaN)):
                            dict_article["rating"] = dict_row["rating"]
                        if not pd.isna(dict_row.get("rating_chg", np.NaN)):
                            dict_article["rating_change"] = dict_row[
                                "rating_chg"]
                        if not pd.isna(dict_row.get("pred_2020_ret", np.NaN)):
                            dict_article["pred_ret_this_yr"] = dict_row[
                                "pred_2020_ret"]
                        if not pd.isna(dict_row.get("pred_2020_pe", np.NaN)):
                            dict_article["pred_pe_this_yr"] = dict_row[
                                "pred_2020_pe"]
                        if not pd.isna(dict_row.get("pred_2021_pe", np.NaN)):
                            dict_article["pred_pe_next_yr"] = dict_row[
                                "pred_2021_pe"]
                        if not pd.isna(dict_row.get("pred_2021_ret", np.NaN)):
                            dict_article["pred_ret_next_yr"] = dict_row[
                                "pred_2021_ret"]
                        if dict_row.get("report_date", pd.NaT) is not pd.NaT:
                            dict_article["publish_time"] = dict_row[
                                "report_date"].to_pydatetime()

                        symbol = dict_row["symbol"]
                        ls_related_symbols: List[FinancialInstrumentSymbol] = [
                            FinancialInstrumentSymbol(symbol=symbol)
                        ]
                        if save_doc:
                            ls_related_symbols[0].full_name = dict_row[
                                "symbol_name"]
                            upsert_document(ls_related_symbols[0])
                        dict_article["related_symbols"] = ls_related_symbols
                        dict_article["engine_site"] = "EastMoney"
                        dict_article["channel_in_site"] = "StockAnalysis"
                        dict_article["batch_action_uuid"] = batch_action_uuid

                        article = Article(**dict_article)
                        rlt_articles.append(article)
                        if save_doc:
                            upsert_document(article, True)

        return rlt_articles
Example #12
    def get_hash_str(self) -> str:
        hp_dict = self.get_init_value_dict(True)
        return md5_str(HashCalculation.value_to_hash_str(hp_dict))
Example #13
    def author_detail(
        rlt_path: str,
        batch_action_uuid: str,
        action_uuid: str,
        save_doc: bool = True
    ) -> Tuple[List[AuthorInSeekingAlpha], List[Article]]:
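        """Parse the Seeking Alpha "*_author_articles.csv" and "*_author_info.csv"
        files under rlt_path into Article and AuthorInSeekingAlpha documents."""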
        rlt_articles: List[Article] = list()
        rlt_authors: List[AuthorInSeekingAlpha] = list()

        # region author articles
        rlt_files = glob.glob(os.path.join(rlt_path, "*_author_articles.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                author_id, df_rlt = SeekingAlphaDataProcess.proc_author_articles(
                    df_from_csv)
                if df_rlt is not None:
                    for idx, row in df_rlt.iterrows():
                        dict_row = row.to_dict()
                        dict_article = dict()
                        if not pd.isna(dict_row.get("title", np.NaN)):
                            dict_article["title"] = dict_row["title"]
                        if not pd.isna(dict_row.get("news_url", np.NaN)):
                            article_url = furl(dict_row["news_url"])
                            abs_url_str = f"{article_url.origin}{article_url.path}"
                            dict_article["full_text_url"] = abs_url_str
                            dict_article["uuid"] = md5_str(abs_url_str)
                        if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                            dict_article["publish_time"] = dict_row[
                                "publish_time"].to_pydatetime()
                        if not pd.isna(dict_row.get("comments", np.NaN)):
                            dict_article[
                                "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                    comments=dict_row["comments"])
                        if not pd.isna(dict_row.get("symbols", np.NaN)):
                            ls_related_symbols: List[
                                FinancialInstrumentSymbol] = [
                                    FinancialInstrumentSymbol(symbol=s.strip())
                                    for s in dict_row["symbols"].split(",")
                                ]
                            if ls_related_symbols:
                                for symbol in ls_related_symbols:
                                    if save_doc:
                                        upsert_document(symbol, True)
                                dict_article[
                                    "related_symbols"] = ls_related_symbols
                        dict_article["engine_site"] = "SeekingAlpha"
                        dict_article["batch_action_uuid"] = batch_action_uuid
                        article = Article(**dict_article)
                        rlt_articles.append(article)
                        if save_doc:
                            upsert_document(article, True)

        # endregion

        # region author info
        rlt_files = glob.glob(os.path.join(rlt_path, "*_author_info.csv"))
        if rlt_files:
            for f in rlt_files:
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f, header=0)
                dict_author_info = SeekingAlphaDataProcess.proc_author_info(
                    df_from_csv)
                if not dict_author_info:
                    continue
                if "author" not in dict_author_info:
                    continue
                author = AuthorInSeekingAlpha(
                    author_id=dict_author_info.get("author"),
                    intro=dict_author_info.get("author_intro", ""),
                    articles=dict_author_info.get("articles", None),
                    picks=dict_author_info.get("authors_picks", None),
                    blog_posts=dict_author_info.get("instablogs", None),
                    comments=dict_author_info.get("comments", None),
                    stock_talks=dict_author_info.get("stocktalks", None),
                    likes=dict_author_info.get("likes", None),
                    followers=dict_author_info.get("followers", None),
                    following=dict_author_info.get("following", None),
                    mtime=datetime.now(),
                    batch_action_uuid=batch_action_uuid)
                if save_doc:
                    upsert_document(author, True)
                rlt_authors.append(author)
        # endregion

        return rlt_authors, rlt_articles
Example #14
    def symbol_summary(
        rlt_path: str,
        batch_action_uuid: str,
        action_uuid: str,
        save_doc: bool = True
    ) -> Tuple[List[FinancialInstrumentSymbol], List[Article]]:
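        """Parse the Seeking Alpha "*_symbol_analysis.csv", "*_symbol_news.csv"
        and "*_symbol_indicators.csv" files under rlt_path into Article and
        FinancialInstrumentSymbol documents."""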
        rlt_articles: List[Article] = list()
        rlt_symbols: List[FinancialInstrumentSymbol] = list()

        # region symbol analysis
        rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_analysis.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                symbol, df_rlt = SeekingAlphaDataProcess.proc_symbol_analysis(
                    df_from_csv)
                if df_rlt is not None:
                    for idx, row in df_rlt.iterrows():
                        dict_row = row.to_dict()
                        dict_article = dict()
                        if not pd.isna(dict_row.get("title", np.NaN)):
                            dict_article["title"] = dict_row["title"]
                        if not pd.isna(dict_row.get("article_url", np.NaN)):
                            article_url = furl(dict_row["article_url"])
                            abs_url_str = f"{article_url.origin}{article_url.path}"
                            dict_article["full_text_url"] = abs_url_str
                            dict_article["uuid"] = md5_str(abs_url_str)
                        if not pd.isna(dict_row.get("author_url", np.NaN)):
                            author_url: str = dict_row["author_url"]
                            author_id = dict_row["author_id"]
                            # NO author_name extracted!!
                            # author_name = None
                            # if not pd.isna(dict_row.get("author", np.NaN)):
                            #     author_name = dict_row["author"]
                            author = AuthorInSeekingAlpha(author_id=author_id,
                                                          url=author_url)
                            dict_article["seeking_alpha_author"] = author
                            if save_doc:
                                upsert_document(author, True)
                        if not pd.isna(dict_row.get("rating", np.NaN)):
                            dict_article["rating"] = dict_row["rating"]

                        if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                            dict_article["publish_time"] = dict_row[
                                "publish_time"].to_pydatetime()
                        if not pd.isna(dict_row.get("comments", np.NaN)):
                            dict_article[
                                "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                    comments=dict_row["comments"])
                        ls_related_symbols: List[FinancialInstrumentSymbol] = [
                            FinancialInstrumentSymbol(symbol=symbol)
                        ]
                        if save_doc:
                            upsert_document(ls_related_symbols[0])
                        dict_article["related_symbols"] = ls_related_symbols
                        dict_article["engine_site"] = "SeekingAlpha"
                        dict_article["channel_in_site"] = "analysis"
                        dict_article["batch_action_uuid"] = batch_action_uuid

                        article = Article(**dict_article)
                        rlt_articles.append(article)
                        if save_doc:
                            upsert_document(article, True)

        # endregion

        # region symbol news

        rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_news.csv"))
        for i, f in enumerate(rlt_files):
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            symbol, df_rlt = SeekingAlphaDataProcess.proc_symbol_news(
                df_from_csv)

            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                if not pd.isna(dict_row.get("title", np.NaN)):
                    dict_article["title"] = dict_row["title"]
                if not pd.isna(dict_row.get("news_url", np.NaN)):
                    article_url = furl(dict_row["news_url"])
                    abs_url_str = f"{article_url.origin}{article_url.path}"
                    # Only strip the query parameters for links inside Seeking Alpha; for other sites it is unclear whether the URL parameters are part of what makes the URL unique
                    if abs_url_str.find("seekingalpha") > 0:
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    else:
                        dict_article["full_text_url"] = article_url.url
                        dict_article["uuid"] = md5_str(article_url.url)

                if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                    dict_article["publish_time"] = dict_row[
                        "publish_time"].to_pydatetime()
                if not pd.isna(dict_row.get("comments", np.NaN)):
                    dict_article[
                        "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                            comments=dict_row["comments"])
                ls_related_symbols: List[FinancialInstrumentSymbol] = [
                    FinancialInstrumentSymbol(symbol=symbol)
                ]
                if save_doc:
                    upsert_document(ls_related_symbols[0])
                dict_article["related_symbols"] = ls_related_symbols
                dict_article["engine_site"] = "SeekingAlpha"
                dict_article["batch_action_uuid"] = batch_action_uuid
                if not pd.isna(dict_row.get("orig_source", np.NaN)):
                    dict_article["channel_in_site"] = dict_row["orig_source"]

                article = Article(**dict_article)
                rlt_articles.append(article)
                if save_doc:
                    upsert_document(article, True)

        # endregion

        # region symbol info
        rlt_files = glob.glob(os.path.join(rlt_path,
                                           "*_symbol_indicators.csv"))
        if rlt_files:
            for f in rlt_files:
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f, header=0)
                dict_symbol_info = SeekingAlphaDataProcess.proc_symbol_indicator(
                    df_from_csv)
                if not dict_symbol_info:
                    continue
                if "symbol" not in dict_symbol_info:
                    continue
                symbol = FinancialInstrumentSymbol(
                    symbol=dict_symbol_info.get("symbol"),
                    info_from_seeking_alpha=SymbolInfoBySeekingAlpha(
                        followers=dict_symbol_info.get("followers", None),
                        high_52wk=dict_symbol_info.get("52wk high", None),
                        low_52wk=dict_symbol_info.get("52wk low", None),
                        eps_fwd=dict_symbol_info.get("EPS (FWD)", None),
                        pe_fwd=dict_symbol_info.get("PE (FWD)", None),
                        yield_fwd=dict_symbol_info.get("Yield (FWD)", None),
                        div_rate_fwd=dict_symbol_info.get(
                            "Div Rate (FWD)", None),
                        mkt_cap=dict_symbol_info.get("Market Cap", None),
                        volume=dict_symbol_info.get("Volume", None),
                        mtime=datetime.now()))
                if save_doc:
                    upsert_document(symbol, True)
                rlt_symbols.append(symbol)
        # endregion

        return rlt_symbols, rlt_articles
Example #15
def create_equity_workflow(req: WorkflowRequest):
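    """Create a TriggeredWebPagesCrawlWorkflow plus its RPABatchAction for the
    requested equity, recording an error document when the equity cannot be
    found or when the workflow already ran within the current refresh period."""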
    assert req.workflow_name in GSPredefinedWorkflow._value2member_map_

    equity_entity = find_equity(req.entity_str)
    if equity_entity is None:
        wf_batch_uuid = md5_str(
            f"{req.request_from_account}-{req.ctime.isoformat()}-{req.entity_str}"
        )
        doc_wf = TriggeredWebPagesCrawlWorkflow(
            uuid=wf_batch_uuid,
            workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
            para_begin=req.para_begin,
            para_end=req.para_end,
            submit_account=req.request_from_account,
            submit_type=WorkflowSubmitType.HotKey.value,
            submit_time=req.ctime,
            finish_or_error_flag=WorkflowStatusFlag.WithError.value,
            error_msg=f"Can't find equity symbol or name by '{req.entity_str}'"
        )
        upsert_document(doc_wf, False)
        return

    # Entity found; build the workflow content
    wf_batch_uuid = md5_str(
        f"{equity_entity.symbol}-{req.workflow_name}-{req.para_begin}-{req.para_end}-{req.request_from_account}-{req.ctime.isoformat()}"
    )

    # Look up the workflow's predefined refresh frequency
    # wf_freq = "D"
    wf_freq = "1s"
    workflow_def = PredefinedWorkflow.objects(
        workflow_name=req.workflow_name).first()
    if workflow_def is not None:
        wf_freq = workflow_def.refresh_freq
    # Find the most recent run time of this symbol's workflow (assuming per symbol + per account)
    latest_workflow_inst = TriggeredWebPagesCrawlWorkflow.objects(
        fin_instrument=equity_entity.symbol,
        workflow=req.workflow_name,
        submit_account=req.request_from_account,
        finish_or_error_flag__in=[
            WorkflowStatusFlag.WaitToRun.value,
            WorkflowStatusFlag.SuccessFinished.value
        ]).order_by("-submit_time").first()
    # If it falls in the same period, just record an error entry
    if latest_workflow_inst is not None and is_same_period(
            latest_workflow_inst.submit_time, req.ctime, wf_freq):
        logger.error(
            f"Workflow(uuid={latest_workflow_inst.uuid}, ctime='{latest_workflow_inst.submit_time}') already exists in the same period."
        )
        doc_wf = TriggeredWebPagesCrawlWorkflow(
            uuid=wf_batch_uuid,
            main_entity_type=EntityType.Equity.value,
            fin_instrument=FinancialInstrumentSymbol(
                symbol=equity_entity.symbol),
            workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
            para_begin=req.para_begin,
            para_end=req.para_end,
            submit_account=req.request_from_account,
            submit_type=WorkflowSubmitType.HotKey.value,
            submit_time=req.ctime,
            finish_or_error_flag=WorkflowStatusFlag.WithError.value,
            error_msg=
            f"workflow '{req.workflow_name}'({equity_entity.symbol}) was executed at {latest_workflow_inst.submit_time}; no need to rerun now."
        )
        upsert_document(doc_wf, False)
        return

    # Create the workflow
    doc_wf = TriggeredWebPagesCrawlWorkflow(
        uuid=wf_batch_uuid,
        main_entity_type=EntityType.Equity.value,
        fin_instrument=FinancialInstrumentSymbol(symbol=equity_entity.symbol),
        workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
        para_begin=req.para_begin,
        para_end=req.para_end,
        submit_account=req.request_from_account,
        submit_type=WorkflowSubmitType.HotKey.value,
        submit_time=req.ctime,
        finish_or_error_flag=WorkflowStatusFlag.WaitToRun.value)
    upsert_document(doc_wf, False)

    # Create the batch action
    doc_batch_action = RPABatchAction(
        batch_id=wf_batch_uuid,
        is_dynamic_batch=True,
        from_function=cls_to_str(create_equity_workflow),
        ctime=req.ctime,
        status=ActionStatusFlag.WaitingForRun.value)
    upsert_document(doc_batch_action, False)

    # Call each action generator function in turn
    # NOTE: this reads the dict directly; switching to a function call later would allow generators to be registered
    for func in WORKFLOW_NAME_TO_ACTION_GENERATORS.get(req.workflow_name, []):
        func(equity_entity, wf_batch_uuid)
    logger.info(f"Batch action '{wf_batch_uuid}' is created.")
Example #16
def upsert_tag_combination(tags: List[str]):
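    """Save a KWCombinationPattern keyed by the md5 of the sorted tag list."""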
    kw_comb = KWCombinationPattern(tags_comb_hash=md5_str("".join(sorted(tags))),
                                   tags_value="+".join(tags),
                                   tags=[Tag(name=n) for n in tags])
    kw_comb.save()
Example #17
    def process_action_result(rlt_path: str,
                              batch_action_uuid: str,
                              action_uuid: str,
                              save_doc: bool = True):
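        """Turn the crawl artifacts under rlt_path (extract-info JSON, article CSV,
        PDF/PNG snapshots) into Article, BinaryAttachment and
        GeneralBrowserActionInstance documents."""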
        dict_extract_info: Dict[str, str] = {}

        extract_info_files = glob.glob(
            os.path.join(rlt_path, "*_extract_info.js"))
        if extract_info_files:
            with open(extract_info_files[0], "r") as f:
                dict_extract_info = json.load(f)

        article_files = glob.glob(os.path.join(rlt_path, "*_articles.csv"))
        ls_all_article_uuid: List[str] = list()
        """保存所有的 uuid ,用于到 Mongo 中查询哪些是已经 Insert 的"""
        ls_all_article_doc_dict: List[Dict] = list()
        """先保存到一个 list , 然后统一 upsert document """

        if article_files:
            df_articles = pd.read_csv(article_files[0], header=0)
            for idx, row in df_articles.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                # "title" = > Article.title , 一般不做特殊处理,有则填入
                if not pd.isna(dict_row.get("title", np.NaN)):
                    dict_article["title"] = dict_row["title"]
                # "abstract" = > Article.abstract, 摘要,一般不做特殊处理
                if not pd.isna(dict_row.get("abstract", np.NaN)):
                    dict_article["abstract"] = dict_row["abstract"]
                # "channel_in_site" = > Article.channel_in_site, 二级分类,站点内的
                if not pd.isna(dict_row.get("channel_in_site", np.NaN)):
                    dict_article["channel_in_site"] = dict_row[
                        "channel_in_site"]
                # "full_text_url" = > Article.full_text_url , 正文的链接,需要做地址转换
                if not pd.isna(dict_row.get("full_text_url", np.NaN)):
                    full_text_url = dict_row["full_text_url"]
                    # try to prepend the site origin
                    page_url = dict_extract_info.get("real_url", "")
                    if page_url:
                        full_text_url = append_site_to_url(
                            full_text_url,
                            furl(page_url).origin)
                    dict_article["full_text_url"] = full_text_url
                # "publish_time" = > Article.publish_time, 可能需要有数据预处理
                if not pd.isna(dict_row.get("publish_time", np.NaN)):
                    s_time = dict_row["publish_time"]
                    s_time_preprocess = dict_extract_info.get(
                        "publish_time_preprocess", "")
                    if s_time_preprocess:
                        assert s_time_preprocess in TIME_STR_PREPROCESS
                        s_time = TIME_STR_PREPROCESS[s_time_preprocess](s_time)
                    dt_time = _parse_date(s_time)
                    if dt_time:
                        dict_article["publish_time"] = dt_time
                if "article_site" in dict_extract_info:
                    dict_article["engine_site"] = dict_extract_info[
                        "article_site"]
                # generate the uuid
                if "full_text_url" in dict_article:  # 优先考虑 url 作为 uuid
                    dict_article["uuid"] = md5_str(
                        dict_article["full_text_url"])
                elif "title" in dict_article and "engine_site" in dict_article:  # 标题+site作为 uuid
                    # note: publish_time 不能计算 MD5 , 因为一些比较模糊的字符串,比如 5h 在不同的时间解析,会得到不一致的内容
                    dict_article["uuid"] = md5_str(
                        f"{dict_article['engine_site']}-{dict_article['title']}"
                    )
                else:
                    continue
                dict_article["action_uuid"] = action_uuid
                dict_article["batch_action_uuid"] = batch_action_uuid
                ls_all_article_uuid.append(dict_article["uuid"])
                ls_all_article_doc_dict.append(dict_article)

        # TODO: query the database for which items already exist, to mark them new / existed
        all_exist_docs = Article.objects(
            uuid__in=ls_all_article_uuid).only("uuid")
        set_existed_uuid = set([doc.uuid for doc in all_exist_docs])

        # update the DB
        related_articles: List[Article] = list()
        new_found_articles: List[Article] = list()
        for article_doc_dict in ls_all_article_doc_dict:
            related_articles.append(Article(uuid=article_doc_dict["uuid"]))
            if article_doc_dict["uuid"] in set_existed_uuid:
                logger.info(
                    f"Doc '{article_doc_dict['uuid']}' already exists.")
                continue
            new_found_articles.append(Article(uuid=article_doc_dict["uuid"]))
            if save_doc:
                upsert_document(Article(**article_doc_dict), True)
        logger.info(
            f"Action '{action_uuid}' found {len(new_found_articles)} new articles"
        )

        # save the img and pdf attachments
        pdf_screen_files = glob.glob(
            os.path.join(rlt_path, "*_webpage_content.pdf"))
        orig_pdf_bin_uuid: str = None
        if pdf_screen_files:
            with open(pdf_screen_files[0], "rb") as f:
                bin_pdf = f.read()
            bin_pdf_gz = gzip.compress(bin_pdf)
            orig_pdf_bin_uuid = md5_binary(bin_pdf)
            if save_doc:
                pdf_attach = BinaryAttachment(uuid=orig_pdf_bin_uuid,
                                              bin_data=bin_pdf_gz,
                                              file_ext="pdf",
                                              is_gzip=True,
                                              bin_length=len(bin_pdf_gz),
                                              ctime=datetime.now(),
                                              action_uuid=action_uuid)
                upsert_document(pdf_attach, False)

        png_screenshot_files = glob.glob(
            os.path.join(rlt_path, "*_browser_sceenshot.png"))
        orig_png_bin_uuid: str = None
        if png_screenshot_files:
            with open(png_screenshot_files[0], "rb") as f:
                bin_png = f.read()
            bin_png_gz = gzip.compress(bin_png)
            orig_png_bin_uuid = md5_binary(bin_png)
            if save_doc:
                png_attach = BinaryAttachment(uuid=orig_png_bin_uuid,
                                              bin_data=bin_png_gz,
                                              file_ext="png",
                                              is_gzip=True,
                                              bin_length=len(bin_png_gz),
                                              ctime=datetime.now(),
                                              action_uuid=action_uuid)
                upsert_document(png_attach, False)

        # save the GeneralBrowserActionInstance
        if save_doc:
            general_browser_action_doc = GeneralBrowserActionInstance(
                uuid=action_uuid,
                related_articles=related_articles,
                new_found_articles=new_found_articles,
                pdf_page_snapshot=BinaryAttachment(uuid=orig_pdf_bin_uuid),
                img_page_snapshot=BinaryAttachment(uuid=orig_png_bin_uuid))
            upsert_document(general_browser_action_doc, False)
Example #18
    def column_articles(rlt_path: str,
                        batch_action_uuid: str,
                        action_uuid: str,
                        save_doc: bool = True) -> List[Article]:
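        """Parse the Seeking Alpha "*_articles.csv" column files under rlt_path
        into Article documents, optionally upserting them."""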
        rlt_articles = list()

        # region articles
        rlt_files = glob.glob(os.path.join(rlt_path, "*_articles.csv"))
        if rlt_files:
            for f in rlt_files:
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                kw, df_rlt = SeekingAlphaDataProcess.proc_article_data(
                    df_from_csv)
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()  # for Article
                    if not pd.isna(dict_row.get("title", np.NaN)):
                        dict_article["title"] = dict_row["title"]
                    if not pd.isna(dict_row.get("author_url", np.NaN)):
                        author_url: str = dict_row["author_url"]
                        author_id = author_url.split("/")[-1]
                        dict_article[
                            "seeking_alpha_author"] = AuthorInSeekingAlpha(
                                author_id=author_id, url=author_url)
                    if not pd.isna(dict_row.get("article_url", np.NaN)):
                        article_url = furl(dict_row["article_url"])
                        abs_url_str = f"{article_url.origin}{article_url.path}"
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "publish_time"].to_pydatetime()
                    dict_article["engine_site"] = "SeekingAlpha"
                    ls_related_symbols: List[FinancialInstrumentSymbol] = []
                    for symbol_key_pair in [
                        ("related_symbol1", "related_symbol1_fullname"),
                        ("related_symbol2", "related_symbol2_fullname"),
                        ("related_symbol3", "related_symbol3_fullname")
                    ]:
                        if not pd.isna(dict_row.get(
                                symbol_key_pair[0], np.NaN)) and not pd.isna(
                                    dict_row.get(symbol_key_pair[1], np.NaN)):
                            fin_instrument_symbol = FinancialInstrumentSymbol(
                                symbol=dict_row[symbol_key_pair[0]],
                                full_name=dict_row[symbol_key_pair[1]],
                                batch_action_uuid=batch_action_uuid)
                            ls_related_symbols.append(fin_instrument_symbol)
                            # ListField(ReferenceField(FinancialInstrumentSymbol)) does not seem to cascade-save, so save each symbol here when it is created
                            if save_doc:
                                upsert_document(fin_instrument_symbol, True)
                    if ls_related_symbols:
                        dict_article["related_symbols"] = ls_related_symbols

                    if not pd.isna(dict_row.get("comments", np.NaN)):
                        dict_article[
                            "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                comments=dict_row["comments"])

                    dict_article["channel_in_site"] = kw
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)
        # endregion

        return rlt_articles