Example #1
def upsert_chn_stock_name():
    tushare = TuShareProData()
    df = tushare.stock_basic(exchange="SSE", cols=["ts_code", "name"])
    for idx, row in df.iterrows():
        symbol = row["ts_code"].replace("SH", "SS")  # YAHOO 上海股市的后缀是 SS
        symbol_obj = FinancialInstrumentSymbol(symbol=symbol,
                                               full_name=row['name'])
        upsert_document(symbol_obj, False)

    df = tushare.stock_basic(exchange="SZSE", cols=["ts_code", "name"])
    for idx, row in df.iterrows():
        symbol = row["ts_code"]  # 深圳股市的后缀是相同的
        symbol_obj = FinancialInstrumentSymbol(symbol=symbol,
                                               full_name=row['name'])
        upsert_document(symbol_obj, False)
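
Several of the examples below pass symbols through symbol_to_yahoo_symbol before calling the Yahoo APIs. That helper is not shown in this collection; a minimal sketch consistent with the suffix convention above (Shanghai .SS, Shenzhen .SZ, everything else passed through) might look like the following, where the legacy ".SH" guard is an assumption rather than the actual implementation:

def symbol_to_yahoo_symbol(symbol: str) -> str:
    # hypothetical sketch: symbols stored by this codebase already carry
    # Yahoo-style suffixes, so only a legacy ".SH" suffix needs remapping
    if symbol.endswith(".SH"):
        return symbol[:-3] + ".SS"
    return symbol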
Example #2
def find_equity(symbol_or_name: str) -> Optional[FinancialInstrumentSymbol]:
    """ 使用 symbol or name 在 FinancialInstrumentSymbol 中查询匹配的 symbol
        前提是该 symbol 已经填入过 yahoo 的基本信息内容(前置的环境准备)
    """
    ls_symbol_guess = [
        f"{symbol_or_name}{suffix}" for suffix in ["", ".SS", ".SZ", ".HK"]
    ]

    symbol_obj = FinancialInstrumentSymbol.objects(
        (Q(symbol__in=ls_symbol_guess) | Q(chn_name=symbol_or_name)
         | Q(eng_name=symbol_or_name) | Q(full_name=symbol_or_name))
        & Q(info_from_yahoo__exists=True)).first()
    return symbol_obj
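
A usage sketch with hypothetical inputs: thanks to the guessed suffixes, callers may pass a bare code, a fully suffixed symbol, or a Chinese/English name, as long as the matching document already carries info_from_yahoo:

for query in ["600000", "600000.SS", "0700.HK", "Microsoft Corporation"]:
    hit = find_equity(query)
    if hit is not None:
        print(hit.symbol, hit.full_name)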
Example #3
def upsert_yahoo_recommend(symbol: str):
    yahoo_ts = yFinanceData()
    try:
        df = yahoo_ts.recommendations(symbol_to_yahoo_symbol(symbol))
        logger.info(f"upsert_yahoo_recommend : {symbol}-{df.shape}")
        for t, row in df.iterrows():
            firm = GlobalEntity(entity_id=row["Firm"])
            dict_info = {
                "t": t,
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "firm": firm,
                "uid": md5_str(f"{t.isoformat()}-{symbol}-{row['Firm']}"),
                "to_grade": None if row["To Grade"] == "" else row["To Grade"],
                "from_grade": None if row["From Grade"] == "" else row["From Grade"],
                "action": None if row["Action"] == "" else row["Action"]
            }
            upsert_document(firm)
            upsert_document(FinancialInstrumentRecommendInYahoo(**dict_info),
                            False)
    except Exception as ex:
        logger.error(ex)
        return
Example #4
    def kw_search(rlt_path: str,
                  batch_action_uuid: str,
                  action_uuid: str,
                  save_doc: bool = True) -> List[Article]:
        rlt_articles: List[Article] = list()

        rlt_files = glob.glob(os.path.join(rlt_path, "*_kw_search_result.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                kw, df_rlt = SeekingAlphaDataProcess.proc_kw_search(
                    df_from_csv)
                # print(df_rlt)
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()
                    if not pd.isna(dict_row.get("title", np.NaN)):
                        dict_article["title"] = dict_row["title"]
                    if not pd.isna(dict_row.get("news_url", np.NaN)):
                        article_url = furl(dict_row["news_url"])
                        abs_url_str = f"{article_url.origin}{article_url.path}"
                        # only strip query parameters for seekingalpha-internal links; for other
                        # sites it is unclear whether URL parameters are part of what makes a URL unique
                        if abs_url_str.find("seekingalpha") > 0:
                            dict_article["full_text_url"] = abs_url_str
                            dict_article["uuid"] = md5_str(abs_url_str)
                        else:
                            dict_article["full_text_url"] = article_url.url
                            dict_article["uuid"] = md5_str(article_url.url)
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "publish_time"].to_pydatetime()
                    if not pd.isna(dict_row.get("symbols", np.NaN)):
                        symbols = [
                            x.strip()
                            for x in dict_row.get("symbols").split(",")
                        ]
                        ls_related_symbols: List[FinancialInstrumentSymbol] = [
                            FinancialInstrumentSymbol(symbol=x)
                            for x in symbols
                        ]
                        if save_doc:
                            # map() is lazy in Python 3, so iterate explicitly to run the upserts
                            for related_symbol in ls_related_symbols:
                                upsert_document(related_symbol)
                        dict_article["related_symbols"] = ls_related_symbols
                    dict_article["from_searching_phase"] = SearchingPhrase(
                        searching_phrase=kw)
                    dict_article["engine_site"] = "SeekingAlpha"
                    dict_article["channel_in_site"] = "Search"
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)

        return rlt_articles
Example #5
def upsert_yahoo_financial_instrument_info(symbol: str,
                                           insert_only: bool = False):
    if insert_only:
        symbol_obj = FinancialInstrumentSymbol.objects(
            symbol=symbol, info_from_yahoo__exists=True).first()
        if symbol_obj is not None:
            return

    set_http_proxy()
    import yfinance as yf

    symbol_data = yf.Ticker(symbol_to_yahoo_symbol(symbol))
    try:
        dict_info = symbol_data.info
    except Exception as ex:
        logger.error(f"api exception when get {symbol} data , {ex}")
        return

    # remove companyOfficers (its value is a list)
    if "companyOfficers" in dict_info:
        del dict_info["companyOfficers"]

    # remove symbol (duplicate data)
    if "symbol" in dict_info:
        del dict_info["symbol"]

    # rename "52WeekChange" -> "fiftyTwoWeekChange" , python 变量首字符不允许数字
    if "52WeekChange" in dict_info:
        dict_info["fiftyTwoWeekChange"] = dict_info["52WeekChange"]
        del dict_info["52WeekChange"]

    # rename "yield" -> "yieldVal" , python 关键字
    if "yield" in dict_info:
        dict_info["yieldVal"] = dict_info["yield"]
        del dict_info["yield"]
    if "err" in dict_info:
        del dict_info["err"]
    dict_info["mtime"] = datetime.now()

    fin_symbol = FinancialInstrumentSymbol(symbol=symbol)
    fin_symbol.info_from_yahoo = SymbolDataFromYahoo(**dict_info)
    upsert_document(fin_symbol)
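
A brief usage sketch: insert_only=True makes the call idempotent, so it is safe to repeat over a large symbol table (the "MSFT" symbol here is purely illustrative):

upsert_yahoo_financial_instrument_info("MSFT")                    # always refresh from Yahoo
upsert_yahoo_financial_instrument_info("MSFT", insert_only=True)  # no-op once info_from_yahoo exists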
Example #6
def upsert_splits(symbol: str, start_t: datetime):
    yahoo_ts = yFinanceData()
    try:
        df = yahoo_ts.splits(symbol_to_yahoo_symbol(symbol), start=start_t)
        logger.info(f"upsert_splits : {symbol}-{df.shape}")
        for t, row in df.iterrows():
            dict_info = {
                "t": t,
                "split": row["Stock Splits"],
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "uid": md5_str(f"{t.isoformat()}-{symbol}")
            }
            upsert_document(FinancialInstrumentDividends(**dict_info))
    except Exception as ex:
        logger.error(f"api exception when get {symbol} splits , {ex}")
Example #7
def upsert_us_stock_nametable_and_chn_name():
    akshare_data = akshareData()
    df_us_stocks = akshare_data.get_us_stock_name()
    logger.info(f"table shape:{df_us_stocks.shape}")
    for idx, row in df_us_stocks.iterrows():
        if idx <= 9425:  # resume point: skip rows already handled in an earlier partial run
            continue
        symbol = row["symbol"]
        chn_name = row["cname"]
        eng_name = row["name"]
        logger.info(f"[{idx}] {symbol} - {chn_name} - {eng_name}")
        fin_data = FinancialInstrumentSymbol(symbol=symbol,
                                             full_name=eng_name,
                                             eng_name=eng_name,
                                             chn_name=chn_name)
        upsert_document(fin_data)
        upsert_yahoo_financial_instrument_info(symbol, True)
Example #8
def upsert_yahoo_symbol_holder(symbol: str):
    import yfinance as yf
    symbol_data = yf.Ticker(symbol_to_yahoo_symbol(symbol))
    try:
        df = symbol_data.institutional_holders
        logger.info(f"upsert_yahoo_symbol_holder : {symbol} {df.shape}")
        for idx, row in df.iterrows():
            t = row["Date Reported"]
            dict_info = {
                "t": t,
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "holder": GlobalEntity(entity_id=row["Holder"]),
                "shares": row["Shares"],
                "value": row["Value"],
                "percentage_out": row["% Out"],
                "uid": md5_str(f"{t.isoformat()}-{symbol}-{row['Holder']}")
            }
            upsert_document(FinancialInstrumentHolders(**dict_info), True)
    except Exception as ex:
        logger.error(f"api exception when get data , {ex}")
Example #9
def upsert_daily_market_data(symbol: str, start_t: datetime):
    yahoo_ts = yFinanceData()
    df = yahoo_ts.history(symbol_to_yahoo_symbol(symbol), start=start_t)
    logger.info(f"upsert_daily_market_data : {symbol}-{df.shape}")
    df["fifty_two_week_high"] = df["Close"].rolling(window=244).max()
    df["fifty_two_week_low"] = df["Close"].rolling(window=244).min()

    for t, row in df.iterrows():
        dict_info = {
            "t": t,
            "open": row["Open"],
            "high": row["High"],
            "low": row["Low"],
            "close": row["Close"],
            "volume": row["Volume"],
            "dividends": row["Dividends"],
            "splits": row["Stock Splits"],
            "fifty_two_week_low": row["fifty_two_week_low"],
            "fifty_two_week_high": row["fifty_two_week_high"],
            "symbol": FinancialInstrumentSymbol(symbol=symbol),
            "uid": md5_str(f"{t.isoformat()}-{symbol}")
        }
        upsert_document(FinancialInstrumentDailyMarketData(**dict_info))
Example #10
def upsert_yahoo_earning_analysis(symbol: str):
    """see https://finance.yahoo.com/quote/MSFT/analysis?p=MSFT"""
    import yfinance as yf
    symbol_data = yf.Ticker(symbol_to_yahoo_symbol(symbol))
    try:
        df = symbol_data.calendar.T
        logger.info(f"upsert_yahoo_earning_analysis : {symbol}  {df.shape}")
        for idx, row in df.iterrows():
            t = row["Earnings Date"]
            dict_info = {
                "t": t,
                "symbol": FinancialInstrumentSymbol(symbol=symbol),
                "earnings_average": _convert_non_t_v(row["Earnings Average"]),
                "earnings_low": _convert_non_t_v(row["Earnings Low"]),
                "earnings_high": _convert_non_t_v(row["Earnings High"]),
                "revenue_average": _convert_non_t_v(row["Revenue Average"]),
                "revenue_low": _convert_non_t_v(row["Revenue Low"]),
                "revenue_high": _convert_non_t_v(row["Revenue High"]),
                "uid": md5_str(f"{t.isoformat()}-{symbol}")
            }
            upsert_document(FinancialInstrumentCalendarFromYahoo(**dict_info),
                            True)
    except Exception as ex:
        logger.error(f"api exception when get data , {ex}")
Example #11
    def symbol_analysis_report(rlt_path: str,
                               batch_action_uuid: str,
                               action_uuid: str,
                               save_doc: bool = True) -> List[Article]:

        rlt_articles: List[Article] = list()

        rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_analysis.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f, header=0)
                df_rlt = EastMoneyDataProcess.proc_stock_analysis(df_from_csv)
                if df_rlt is not None:
                    for idx, row in df_rlt.iterrows():
                        dict_row = row.to_dict()
                        dict_article = dict()
                        if not pd.isna(dict_row.get("article", np.NaN)):
                            dict_article["title"] = dict_row["article"]
                        if not pd.isna(dict_row.get("article_url", np.NaN)):
                            dict_article["full_text_url"] = dict_row[
                                "article_url"]
                            dict_article["uuid"] = md5_str(
                                dict_row["article_url"])

                        if not pd.isna(dict_row.get("org_url", np.NaN)):
                            # for now, store the research-firm name directly in the Seeking Alpha
                            # author data; this makes charting easier
                            author_url: str = dict_row["org_url"]
                            author_id = dict_row["org"]
                            # NO author_name extracted!!
                            # author_name = None
                            # if not pd.isna(dict_row.get("author", np.NaN)):
                            #     author_name = dict_row["author"]
                            author = AuthorInSeekingAlpha(author_id=author_id,
                                                          url=author_url)
                            if not pd.isna(
                                    dict_row.get("reports_within_one_month",
                                                 np.NaN)):
                                author.articles = dict_row[
                                    "reports_within_one_month"]

                            dict_article["seeking_alpha_author"] = author
                            if save_doc:
                                upsert_document(author, True)
                        if not pd.isna(dict_row.get("rating", np.NaN)):
                            dict_article["rating"] = dict_row["rating"]
                        if not pd.isna(dict_row.get("rating_chg", np.NaN)):
                            dict_article["rating_change"] = dict_row[
                                "rating_chg"]
                        if not pd.isna(dict_row.get("pred_2020_ret", np.NaN)):
                            dict_article["pred_ret_this_yr"] = dict_row[
                                "pred_2020_ret"]
                        if not pd.isna(dict_row.get("pred_2020_pe", np.NaN)):
                            dict_article["pred_pe_this_yr"] = dict_row[
                                "pred_2020_pe"]
                        if not pd.isna(dict_row.get("pred_2021_pe", np.NaN)):
                            dict_article["pred_pe_next_yr"] = dict_row[
                                "pred_2021_pe"]
                        if not pd.isna(dict_row.get("pred_2021_ret", np.NaN)):
                            dict_article["pred_ret_next_yr"] = dict_row[
                                "pred_2021_ret"]
                        if dict_row.get("report_date", pd.NaT) is not pd.NaT:
                            dict_article["publish_time"] = dict_row[
                                "report_date"].to_pydatetime()

                        symbol = dict_row["symbol"]
                        ls_related_symbols: List[FinancialInstrumentSymbol] = [
                            FinancialInstrumentSymbol(symbol=symbol)
                        ]
                        if save_doc:
                            ls_related_symbols[0].full_name = dict_row[
                                "symbol_name"]
                            upsert_document(ls_related_symbols[0])
                        dict_article["related_symbols"] = ls_related_symbols
                        dict_article["engine_site"] = "EastMoney"
                        dict_article["channel_in_site"] = "StockAnalysis"
                        dict_article["batch_action_uuid"] = batch_action_uuid

                        article = Article(**dict_article)
                        rlt_articles.append(article)
                        if save_doc:
                            upsert_document(article, True)

        return rlt_articles
Example #12
def add_equity_sentiment_actions(equity_symbol: FinancialInstrumentSymbol,
                                 batch_action_uuid: str) -> List[str]:
    assert equity_symbol is not None

    # NOTE: equity_symbol may be only partially populated; call reload() to load the full document
    equity_symbol.reload()

    ls_action_uuid_rlt: List[str] = list()

    # search Google News by company name
    kw = google_kws_or([
        corp_name_remove_stop_words(equity_symbol.chn_name),
        corp_name_remove_stop_words(equity_symbol.eng_name),
        equity_symbol.symbol
    ])
    act_uuid = add_one_equity_search_action(
        act_uuid=generate_uuid(),
        batch_action_uuid=batch_action_uuid,
        generate_func=add_equity_sentiment_actions,
        finished_triggered_func=gs_nlp_process,
        equity_symbol=equity_symbol,
        webapp_cfg=AllGeneralWebApp.GOOGLE_NEWS_TAB.value,
        kw=kw,
        additional_kw="",
        category="CorpNews",
        sub_category="GoogleNews",
        action_description=f"直接搜索公司相关新闻,避免财经站点的新闻采编出现遗漏的情况,kw={kw}")
    ls_action_uuid_rlt.append(act_uuid)

    # searches related to analyst ratings
    ls_recommend = FinancialInstrumentRecommendInYahoo.objects(
        symbol=equity_symbol.symbol).order_by("-t")[:10]
    set_recommend_corp_name: Set[str] = set()
    for recommend in ls_recommend:
        firm = recommend.firm.fetch()
        recommend_corp_name = firm.name
        if not recommend_corp_name:
            recommend_corp_name = firm.entity_id
        if recommend_corp_name in set_recommend_corp_name:
            continue
        set_recommend_corp_name.add(recommend_corp_name)
        kw = google_kws_and([
            equity_symbol.symbol,
            corp_name_remove_stop_words(recommend_corp_name)
        ])
        description = f"'{equity_symbol.symbol}' achieve grade '{recommend.to_grade}'(pre '{recommend.from_grade}') by '{recommend_corp_name}' at {recommend.t}"
        act_uuid = add_one_equity_search_action(
            act_uuid=generate_uuid(),
            batch_action_uuid=batch_action_uuid,
            generate_func=add_equity_sentiment_actions,
            finished_triggered_func=gs_nlp_process,
            equity_symbol=equity_symbol,
            webapp_cfg=AllGeneralWebApp.GOOGLE_NEWS_TAB.value,
            kw=kw,
            additional_kw="",
            category="CorpNews",
            sub_category="CorpRecommendation",
            action_description=description)
        ls_action_uuid_rlt.append(act_uuid)
        # print(f"create google news query : {recommend.firm} + {equity_symbol.symbol}")

    # searches related to institutional holders
    ls_holders = FinancialInstrumentHolders.objects(
        symbol=equity_symbol.symbol).order_by("-t")[:10]
    set_holder_corp_name: Set[str] = set()
    for holder in ls_holders:
        holder_corp_name = holder.holder.pk
        if holder_corp_name in set_holder_corp_name:
            continue
        set_holder_corp_name.add(holder_corp_name)
        kw = google_kws_and([
            equity_symbol.symbol,
            corp_name_remove_stop_words(holder_corp_name)
        ])
        description = f"{holder_corp_name} holds '{equity_symbol.symbol}' {holder.shares} shares , market_value={holder.value} , at {holder.t}"
        act_uuid = add_one_equity_search_action(
            act_uuid=generate_uuid(),
            batch_action_uuid=batch_action_uuid,
            generate_func=add_equity_sentiment_actions,
            finished_triggered_func=gs_nlp_process,
            equity_symbol=equity_symbol,
            webapp_cfg=AllGeneralWebApp.GOOGLE_NEWS_TAB.value,
            kw=kw,
            additional_kw="",
            category="CorpNews",
            sub_category="InstitutionHolderOpinion",
            action_description=description)
        ls_action_uuid_rlt.append(act_uuid)

    # search Xueqiu with several different keywords
    # for additional_kw in [""] + CHN_KW_EQUITY_RELATED:  # include an empty keyword as well, to fetch all news items
    #     cfg_obj = AllGeneralWebApp.XUEQIU_NEWS.value
    #     act_uuid = add_one_equity_search_action(act_uuid=generate_uuid(), batch_action_uuid=batch_action_uuid,
    #                                             generate_func=add_equity_sentiment_actions,
    #                                             finished_triggered_func=gs_nlp_process,
    #                                             equity_symbol=equity_symbol,
    #                                             webapp_cfg=cfg_obj,
    #                                             kw=cfg_obj.symbol_func(equity_symbol.symbol),
    #                                             additional_kw=additional_kw,
    #                                             category="CorpNews", sub_category="ChnXueqiu",
    #                                             action_description=f"Search {equity_symbol.symbol} news with keyword '{additional_kw}' in xueqiu"
    #                                             )
    #     ls_action_uuid_rlt.append(act_uuid)
    return ls_action_uuid_rlt
Example #13
    def symbol_summary(
        rlt_path: str,
        batch_action_uuid: str,
        action_uuid: str,
        save_doc: bool = True
    ) -> Tuple[List[FinancialInstrumentSymbol], List[Article]]:
        rlt_articles: List[Article] = list()
        rlt_symbols: List[FinancialInstrumentSymbol] = list()

        # region symbol analysis
        rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_analysis.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                symbol, df_rlt = SeekingAlphaDataProcess.proc_symbol_analysis(
                    df_from_csv)
                if df_rlt is not None:
                    for idx, row in df_rlt.iterrows():
                        dict_row = row.to_dict()
                        dict_article = dict()
                        if not pd.isna(dict_row.get("title", np.NaN)):
                            dict_article["title"] = dict_row["title"]
                        if not pd.isna(dict_row.get("article_url", np.NaN)):
                            article_url = furl(dict_row["article_url"])
                            abs_url_str = f"{article_url.origin}{article_url.path}"
                            dict_article["full_text_url"] = abs_url_str
                            dict_article["uuid"] = md5_str(abs_url_str)
                        if not pd.isna(dict_row.get("author_url", np.NaN)):
                            author_url: str = dict_row["author_url"]
                            author_id = dict_row["author_id"]
                            # NO author_name extracted!!
                            # author_name = None
                            # if not pd.isna(dict_row.get("author", np.NaN)):
                            #     author_name = dict_row["author"]
                            author = AuthorInSeekingAlpha(author_id=author_id,
                                                          url=author_url)
                            dict_article["seeking_alpha_author"] = author
                            if save_doc:
                                upsert_document(author, True)
                        if not pd.isna(dict_row.get("rating", np.NaN)):
                            dict_article["rating"] = dict_row["rating"]

                        if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                            dict_article["publish_time"] = dict_row[
                                "publish_time"].to_pydatetime()
                        if not pd.isna(dict_row.get("comments", np.NaN)):
                            dict_article[
                                "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                    comments=dict_row["comments"])
                        ls_related_symbols: List[FinancialInstrumentSymbol] = [
                            FinancialInstrumentSymbol(symbol=symbol)
                        ]
                        if save_doc:
                            upsert_document(ls_related_symbols[0])
                        dict_article["related_symbols"] = ls_related_symbols
                        dict_article["engine_site"] = "SeekingAlpha"
                        dict_article["channel_in_site"] = "analysis"
                        dict_article["batch_action_uuid"] = batch_action_uuid

                        article = Article(**dict_article)
                        rlt_articles.append(article)
                        if save_doc:
                            upsert_document(article, True)

        # endregion

        # region symbol news

        rlt_files = glob.glob(os.path.join(rlt_path, "*_symbol_news.csv"))
        for i, f in enumerate(rlt_files):
            logger.info(f"proecess file : {f} ")
            df_from_csv = pd.read_csv(f, header=0, parse_dates=["extract_t"])
            symbol, df_rlt = SeekingAlphaDataProcess.proc_symbol_news(
                df_from_csv)

            for idx, row in df_rlt.iterrows():
                dict_row = row.to_dict()
                dict_article = dict()
                if not pd.isna(dict_row.get("title", np.NaN)):
                    dict_article["title"] = dict_row["title"]
                if not pd.isna(dict_row.get("news_url", np.NaN)):
                    article_url = furl(dict_row["news_url"])
                    abs_url_str = f"{article_url.origin}{article_url.path}"
                    # only strip query parameters for seekingalpha-internal links; for other
                    # sites it is unclear whether URL parameters are part of what makes a URL unique
                    if abs_url_str.find("seekingalpha") > 0:
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    else:
                        dict_article["full_text_url"] = article_url.url
                        dict_article["uuid"] = md5_str(article_url.url)

                if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                    dict_article["publish_time"] = dict_row[
                        "publish_time"].to_pydatetime()
                if not pd.isna(dict_row.get("comments", np.NaN)):
                    dict_article[
                        "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                            comments=dict_row["comments"])
                ls_related_symbols: List[FinancialInstrumentSymbol] = [
                    FinancialInstrumentSymbol(symbol=symbol)
                ]
                if save_doc:
                    upsert_document(ls_related_symbols[0])
                dict_article["related_symbols"] = ls_related_symbols
                dict_article["engine_site"] = "SeekingAlpha"
                dict_article["batch_action_uuid"] = batch_action_uuid
                if not pd.isna(dict_row.get("orig_source", np.NaN)):
                    dict_article["channel_in_site"] = dict_row["orig_source"]

                article = Article(**dict_article)
                rlt_articles.append(article)
                if save_doc:
                    upsert_document(article, True)

        # endregion

        # region symbol info
        rlt_files = glob.glob(os.path.join(rlt_path,
                                           "*_symbol_indicators.csv"))
        if rlt_files:
            for f in rlt_files:
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f, header=0)
                dict_symbol_info = SeekingAlphaDataProcess.proc_symbol_indicator(
                    df_from_csv)
                if not dict_symbol_info:
                    continue
                if "symbol" not in dict_symbol_info:
                    continue
                symbol = FinancialInstrumentSymbol(
                    symbol=dict_symbol_info.get("symbol"),
                    info_from_seeking_alpha=SymbolInfoBySeekingAlpha(
                        followers=dict_symbol_info.get("followers", None),
                        high_52wk=dict_symbol_info.get("52wk high", None),
                        low_52wk=dict_symbol_info.get("52wk low", None),
                        eps_fwd=dict_symbol_info.get("EPS (FWD)", None),
                        pe_fwd=dict_symbol_info.get("PE (FWD)", None),
                        yield_fwd=dict_symbol_info.get("Yield (FWD)", None),
                        div_rate_fwd=dict_symbol_info.get(
                            "Div Rate (FWD)", None),
                        mkt_cap=dict_symbol_info.get("Market Cap", None),
                        volume=dict_symbol_info.get("Volume", None),
                        mtime=datetime.now()))
                if save_doc:
                    upsert_document(symbol, True)
                rlt_symbols.append(symbol)
        # endregion

        return rlt_symbols, rlt_articles
Example #14
def all_chn_symbols() -> List[str]:
    symbols = FinancialInstrumentSymbol.objects(
        info_from_yahoo__market="cn_market").order_by("symbol").only("symbol")
    return [x.symbol for x in symbols]
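
A minimal driver sketch, assuming these functions live in the same module: combine all_chn_symbols with the market-data upsert from Example #9 to refresh daily bars for every stored mainland-China symbol (the start date is arbitrary):

from datetime import datetime

for sym in all_chn_symbols():
    upsert_daily_market_data(sym, start_t=datetime(2020, 1, 1))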
Example #15
    def author_detail(
        rlt_path: str,
        batch_action_uuid: str,
        action_uuid: str,
        save_doc: bool = True
    ) -> Tuple[List[AuthorInSeekingAlpha], List[Article]]:
        rlt_articles: List[Article] = list()
        rlt_authors: List[AuthorInSeekingAlpha] = list()

        # region author articles
        rlt_files = glob.glob(os.path.join(rlt_path, "*_author_articles.csv"))
        if rlt_files:
            for i, f in enumerate(rlt_files):
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                author_id, df_rlt = SeekingAlphaDataProcess.proc_author_articles(
                    df_from_csv)
                if df_rlt is not None:
                    for idx, row in df_rlt.iterrows():
                        dict_row = row.to_dict()
                        dict_article = dict()
                        if not pd.isna(dict_row.get("title", np.NaN)):
                            dict_article["title"] = dict_row["title"]
                        if not pd.isna(dict_row.get("news_url", np.NaN)):
                            article_url = furl(dict_row["news_url"])
                            abs_url_str = f"{article_url.origin}{article_url.path}"
                            dict_article["full_text_url"] = abs_url_str
                            dict_article["uuid"] = md5_str(abs_url_str)
                        if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                            dict_article["publish_time"] = dict_row[
                                "publish_time"].to_pydatetime()
                        if not pd.isna(dict_row.get("comments", np.NaN)):
                            dict_article[
                                "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                    comments=dict_row["comments"])
                        if not pd.isna(dict_row.get("symbols", np.NaN)):
                            ls_related_symbols: List[
                                FinancialInstrumentSymbol] = [
                                    FinancialInstrumentSymbol(symbol=s.strip())
                                    for s in dict_row["symbols"].split(",")
                                ]
                            if ls_related_symbols:
                                for symbol in ls_related_symbols:
                                    if save_doc:
                                        upsert_document(symbol, True)
                                dict_article[
                                    "related_symbols"] = ls_related_symbols
                        dict_article["engine_site"] = "SeekingAlpha"
                        dict_article["batch_action_uuid"] = batch_action_uuid
                        article = Article(**dict_article)
                        rlt_articles.append(article)
                        if save_doc:
                            upsert_document(article, True)

        # endregion

        # region author info
        rlt_files = glob.glob(os.path.join(rlt_path, "*_author_info.csv"))
        if rlt_files:
            for f in rlt_files:
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f, header=0)
                dict_author_info = SeekingAlphaDataProcess.proc_author_info(
                    df_from_csv)
                if not dict_author_info:
                    continue
                if "author" not in dict_author_info:
                    continue
                author = AuthorInSeekingAlpha(
                    author_id=dict_author_info.get("author"),
                    intro=dict_author_info.get("author_intro", ""),
                    articles=dict_author_info.get("articles", None),
                    picks=dict_author_info.get("authors_picks", None),
                    blog_posts=dict_author_info.get("instablogs", None),
                    comments=dict_author_info.get("comments", None),
                    stock_talks=dict_author_info.get("stocktalks", None),
                    likes=dict_author_info.get("likes", None),
                    followers=dict_author_info.get("followers", None),
                    following=dict_author_info.get("following", None),
                    mtime=datetime.now(),
                    batch_action_uuid=batch_action_uuid)
                if save_doc:
                    upsert_document(author, True)
                rlt_authors.append(author)
        # endregion

        return rlt_authors, rlt_articles
Example #16
def set_hk_stock_chn_name():
    for symbol, chn_name in ALL_HK_SYMBOLS_WITH_NAME:
        fin_data = FinancialInstrumentSymbol(symbol=symbol, full_name=chn_name)
        upsert_document(fin_data)
Example #17
def set_symbol_important_flag(symbol: str):
    symbol_doc = FinancialInstrumentSymbol(symbol=symbol, important=True)
    upsert_document(symbol_doc)
Example #18
def create_equity_workflow(req: WorkflowRequest):
    assert req.workflow_name in GSPredefinedWorkflow._value2member_map_

    equity_entity = find_equity(req.entity_str)
    if equity_entity is None:
        wf_batch_uuid = md5_str(
            f"{req.request_from_account}-{req.ctime.isoformat()}-{req.entity_str}"
        )
        doc_wf = TriggeredWebPagesCrawlWorkflow(
            uuid=wf_batch_uuid,
            workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
            para_begin=req.para_begin,
            para_end=req.para_end,
            submit_account=req.request_from_account,
            submit_type=WorkflowSubmitType.HotKey.value,
            submit_time=req.ctime,
            finish_or_error_flag=WorkflowStatusFlag.WithError.value,
            error_msg=f"Can't find equity symbol or name by '{req.entity_str}'"
        )
        upsert_document(doc_wf, False)
        return

    # entity found; build the workflow content
    wf_batch_uuid = md5_str(
        f"{equity_entity.symbol}-{req.workflow_name}-{req.para_begin}-{req.para_end}-{req.request_from_account}-{req.ctime.isoformat()}"
    )

    # look up the workflow's preset refresh frequency
    # wf_freq = "D"
    wf_freq = "1s"
    workflow_def = PredefinedWorkflow.objects(
        workflow_name=req.workflow_name).first()
    if workflow_def is not None:
        wf_freq = workflow_def.refresh_freq
    # find the latest run time of this workflow for the symbol (assumed per symbol + per account)
    latest_workflow_inst = TriggeredWebPagesCrawlWorkflow.objects(
        fin_instrument=equity_entity.symbol,
        workflow=req.workflow_name,
        submit_account=req.request_from_account,
        finish_or_error_flag__in=[
            WorkflowStatusFlag.WaitToRun.value,
            WorkflowStatusFlag.SuccessFinished.value
        ]).order_by("-submit_time").first()
    # if it falls within the same period, just record an error entry
    if latest_workflow_inst is not None and is_same_period(
            latest_workflow_inst.submit_time, req.ctime, wf_freq):
        logger.error(
            f"Workflow(uuid={latest_workflow_inst.uuid},ctime='{latest_workflow_inst.submit_time}') already exists in the same period."
        )
        doc_wf = TriggeredWebPagesCrawlWorkflow(
            uuid=wf_batch_uuid,
            main_entity_type=EntityType.Equity.value,
            fin_instrument=FinancialInstrumentSymbol(
                symbol=equity_entity.symbol),
            workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
            para_begin=req.para_begin,
            para_end=req.para_end,
            submit_account=req.request_from_account,
            submit_type=WorkflowSubmitType.HotKey.value,
            submit_time=req.ctime,
            finish_or_error_flag=WorkflowStatusFlag.WithError.value,
            error_msg=
            f"workflow '{req.workflow_name}'({equity_entity.symbol}) was executed at {latest_workflow_inst.submit_time}; no need to rerun now."
        )
        upsert_document(doc_wf, False)
        return

    # create a workflow
    doc_wf = TriggeredWebPagesCrawlWorkflow(
        uuid=wf_batch_uuid,
        main_entity_type=EntityType.Equity.value,
        fin_instrument=FinancialInstrumentSymbol(symbol=equity_entity.symbol),
        workflow=PredefinedWorkflow(workflow_name=req.workflow_name),
        para_begin=req.para_begin,
        para_end=req.para_end,
        submit_account=req.request_from_account,
        submit_type=WorkflowSubmitType.HotKey.value,
        submit_time=req.ctime,
        finish_or_error_flag=WorkflowStatusFlag.WaitToRun.value)
    upsert_document(doc_wf, False)

    # create the batch action
    doc_batch_action = RPABatchAction(
        batch_id=wf_batch_uuid,
        is_dynamic_batch=True,
        from_function=cls_to_str(create_equity_workflow),
        ctime=req.ctime,
        status=ActionStatusFlag.WaitingForRun.value)
    upsert_document(doc_batch_action, False)

    # call each action-generator function in turn
    # NOTE: this accesses the dict directly; switching to a function call later would support a register mechanism
    for func in WORKFLOW_NAME_TO_ACTION_GENERATORS.get(req.workflow_name, []):
        func(equity_entity, wf_batch_uuid)
    logger.info(f"Batch action '{wf_batch_uuid}' is created.")
Example #19
    def column_articles(rlt_path: str,
                        batch_action_uuid: str,
                        action_uuid: str,
                        save_doc: bool = True) -> List[Article]:
        rlt_articles = list()

        # region articles
        rlt_files = glob.glob(os.path.join(rlt_path, "*_articles.csv"))
        if rlt_files:
            for f in rlt_files:
                logger.info(f"proecess file : {f} ")
                df_from_csv = pd.read_csv(f,
                                          header=0,
                                          parse_dates=["extract_t"])
                kw, df_rlt = SeekingAlphaDataProcess.proc_article_data(
                    df_from_csv)
                for idx, row in df_rlt.iterrows():
                    dict_row = row.to_dict()
                    dict_article = dict()  # for Article
                    if not pd.isna(dict_row.get("title", np.NaN)):
                        dict_article["title"] = dict_row["title"]
                    if not pd.isna(dict_row.get("author_url", np.NaN)):
                        author_url: str = dict_row["author_url"]
                        author_id = author_url.split("/")[-1]
                        dict_article[
                            "seeking_alpha_author"] = AuthorInSeekingAlpha(
                                author_id=author_id, url=author_url)
                    if not pd.isna(dict_row.get("article_url", np.NaN)):
                        article_url = furl(dict_row["article_url"])
                        abs_url_str = f"{article_url.origin}{article_url.path}"
                        dict_article["full_text_url"] = abs_url_str
                        dict_article["uuid"] = md5_str(abs_url_str)
                    if dict_row.get("publish_time", pd.NaT) is not pd.NaT:
                        dict_article["publish_time"] = dict_row[
                            "publish_time"].to_pydatetime()
                    dict_article["engine_site"] = "SeekingAlpha"
                    ls_related_symbols: List[FinancialInstrumentSymbol] = []
                    for symbol_key_pair in [
                        ("related_symbol1", "related_symbol1_fullname"),
                        ("related_symbol2", "related_symbol2_fullname"),
                        ("related_symbol3", "related_symbol3_fullname")
                    ]:
                        if not pd.isna(dict_row.get(
                                symbol_key_pair[0], np.NaN)) and not pd.isna(
                                    dict_row.get(symbol_key_pair[1], np.NaN)):
                            fin_instrument_symbol = FinancialInstrumentSymbol(
                                symbol=dict_row[symbol_key_pair[0]],
                                full_name=dict_row[symbol_key_pair[1]],
                                batch_action_uuid=batch_action_uuid)
                            ls_related_symbols.append(fin_instrument_symbol)
                            # ListField(ReferenceField(FinancialInstrumentSymbol)) does not seem to
                            # cascade-save, so save the symbol here as it is created
                            if save_doc:
                                upsert_document(fin_instrument_symbol, True)
                    if ls_related_symbols:
                        dict_article["related_symbols"] = ls_related_symbols

                    if not pd.isna(dict_row.get("comments", np.NaN)):
                        dict_article[
                            "seeking_alpha_extra"] = SeekingAlphaArticleExtra(
                                comments=dict_row["comments"])

                    dict_article["channel_in_site"] = kw
                    dict_article["batch_action_uuid"] = batch_action_uuid
                    article = Article(**dict_article)
                    rlt_articles.append(article)
                    if save_doc:
                        upsert_document(article, True)
        # endregion

        return rlt_articles