class GenStockNewsDB(object):

    def __init__(self):
        self.database = Database()
        # 获取从1990-12-19至2020-12-31股票交易日数据
        self.trade_date = ak.tool_trade_date_hist_sina()["trade_date"].tolist()
        self.label_range = {3: "3DaysLabel",
                            5: "5DaysLabel",
                            10: "10DaysLabel",
                            15: "15DaysLabel",
                            30: "30DaysLabel",
                            60: "60DaysLabel"}
        self.redis_client = redis.StrictRedis(host=config.REDIS_IP,
                                              port=config.REDIS_PORT,
                                              db=config.CACHE_NEWS_REDIS_DB_ID)
        self.redis_client.set("today_date", datetime.datetime.now().strftime("%Y-%m-%d"))
        self.redis_client.delete("stock_news_num_over_{}".format(config.MINIMUM_STOCK_NEWS_NUM_FOR_ML))
        self._stock_news_nums_stat()

    def get_all_news_about_specific_stock(self, database_name, collection_name):
        # 获取collection_name的key值,看是否包含RelatedStockCodes,如果没有说明,没有做将新闻中所涉及的
        # 股票代码保存在新的一列
        _keys_list = list(next(self.database.get_collection(database_name, collection_name).find()).keys())
        if "RelatedStockCodes" not in _keys_list:
            tokenization = Tokenization(import_module="jieba", user_dict="./Leorio/financedict.txt")
            tokenization.update_news_database_rows(database_name, collection_name)
        # 创建stock_code为名称的collection
        stock_symbol_list = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                   config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                                   keys=["symbol"])["symbol"].to_list()
        col_names = self.database.connect_database(config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE).list_collection_names(session=None)
        for symbol in stock_symbol_list:
            if symbol not in col_names:
                # if int(symbol[2:]) > 837:
                _collection = self.database.get_collection(config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, symbol)
                _tmp_num_stat = 0
                for row in self.database.get_collection(database_name, collection_name).find():  # 迭代器
                    if symbol[2:] in row["RelatedStockCodes"].split(" "):
                        # 返回新闻发布后n天的标签
                        _tmp_dict = {}
                        for label_days, key_name in self.label_range.items():
                            _tmp_res = self._label_news(
                                datetime.datetime.strptime(row["Date"].split(" ")[0], "%Y-%m-%d"), symbol, label_days)
                            _tmp_dict.update({key_name: _tmp_res})
                        _data = {"Date": row["Date"],
                                 "Url": row["Url"],
                                 "Title": row["Title"],
                                 "Article": row["Article"],
                                 "OriDB": database_name,
                                 "OriCOL": collection_name}
                        _data.update(_tmp_dict)
                        _collection.insert_one(_data)
                        _tmp_num_stat += 1
                logging.info("there are {} news mentioned {} in {} collection need to be fetched ... "
                             .format(_tmp_num_stat, symbol, collection_name))
            # else:
            #     logging.info("{} has fetched all related news from {}...".format(symbol, collection_name))

    def listen_redis_queue(self):
        # 监听redis消息队列,当新的实时数据过来时,根据"RelatedStockCodes"字段,将新闻分别保存到对应的股票数据库
        # e.g.:缓存新的一条数据中,"RelatedStockCodes"字段数据为"603386 603003 600111 603568",则将该条新闻分别
        # 都存进这四支股票对应的数据库中
        crawled_url_today = set()
        while True:
            date_now = datetime.datetime.now().strftime("%Y-%m-%d")
            if date_now != self.redis_client.get("today_date").decode():
                crawled_url_today = set()
                self.redis_client.set("today_date", date_now)
            if self.redis_client.llen(config.CACHE_NEWS_LIST_NAME) != 0:
                data = json.loads(self.redis_client.lindex(config.CACHE_NEWS_LIST_NAME, -1))
                if data["Url"] not in crawled_url_today:  # 排除重复插入冗余文本
                    crawled_url_today.update({data["Url"]})
                    if data["RelatedStockCodes"] != "":
                        for stock_code in data["RelatedStockCodes"].split(" "):
                            # 将新闻分别送进相关股票数据库
                            symbol = "sh{}".format(stock_code) if stock_code[0] == "6" else "sz{}".format(stock_code)
                            _collection = self.database.get_collection(config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, symbol)
                            _tmp_dict = {}
                            for label_days, key_name in self.label_range.items():
                                _tmp_res = self._label_news(
                                    datetime.datetime.strptime(data["Date"].split(" ")[0], "%Y-%m-%d"), symbol, label_days)
                                _tmp_dict.update({key_name: _tmp_res})
                            _data = {"Date": data["Date"],
                                     "Url": data["Url"],
                                     "Title": data["Title"],
                                     "Article": data["Article"],
                                     "OriDB": data["OriDB"],
                                     "OriCOL": data["OriCOL"]}
                            _data.update(_tmp_dict)
                            _collection.insert_one(_data)
                            logging.info("the real-time fetched news {}, which was saved in [DB:{} - COL:{}] ...".format(data["Title"],
                                                                                                                         config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE,
                                                                                                                         symbol))
                            #
                            # if symbol.encode() in self.redis_client.lrange("stock_news_num_over_{}".format(config.MINIMUM_STOCK_NEWS_NUM_FOR_ML), 0, -1):
                            #     label_name = "3DaysLabel"
                            #     # classifier_save_path = "{}_classifier.pkl".format(symbol)
                            #     ori_dict_path = "{}_docs_dict.dict".format(symbol)
                            #     bowvec_save_path = "{}_bowvec.mm".format(symbol)
                            #
                            #     topicmodelling = TopicModelling()
                            #     chn_label = topicmodelling.classify_stock_news(data["Article"],
                            #                                                    config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE,
                            #                                                    symbol,
                            #                                                    label_name=label_name,
                            #                                                    topic_model_type="lsi",
                            #                                                    classifier_model="rdforest",  # rdforest / svm
                            #                                                    ori_dict_path=ori_dict_path,
                            #                                                    bowvec_save_path=bowvec_save_path)
                            #     logging.info(
                            #         "document '{}...' was classified with label '{}' for symbol {} ... ".format(
                            #             data["Article"][:20], chn_label, symbol))

                    self.redis_client.rpop(config.CACHE_NEWS_LIST_NAME)
                    logging.info("now pop {} from redis queue of [DB:{} - KEY:{}] ... ".format(data["Title"],
                                                                                               config.CACHE_NEWS_REDIS_DB_ID,
                                                                                               config.CACHE_NEWS_LIST_NAME))

    def _label_news(self, date, symbol, n_days):
        """
        :param date: 类型datetime.datetime,表示新闻发布的日期,只包括年月日,不包括具体时刻,如datetime.datetime(2015, 1, 5, 0, 0)
        :param symbol: 类型str,表示股票标的,如sh600000
        :param n_days: 类型int,表示根据多少天后的价格设定标签,如新闻发布后n_days天,如果收盘价格上涨,则认为该则新闻是利好消息
        """
        # 计算新闻发布当天经过n_days天后的具体年月日
        this_date_data = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                symbol,
                                                query={"date": date})
        # 考虑情况:新闻发布日期是非交易日,因此该日期没有价格数据,则往前寻找,比如新闻发布日期是2020-12-12是星期六,
        # 则考虑2020-12-11日的收盘价作为该新闻发布时的数据
        tmp_date = date
        if this_date_data is None:
            i = 1
            while this_date_data is None and i <= 10:
                tmp_date -= datetime.timedelta(days=i)
                # 判断日期是否是交易日,如果是再去查询数据库;如果this_date_data还是NULL值,则说明数据库没有该交易日数据
                if tmp_date.strftime("%Y-%m-%d") in self.trade_date:
                    this_date_data = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                            symbol,
                                                            query={"date": tmp_date})
                i += 1
        try:
            close_price_this_date = this_date_data["close"][0]
        except Exception:
            close_price_this_date = None
        # 考虑情况:新闻发布后n_days天是非交易日,或者没有采集到数据,因此向后寻找,如新闻发布日期是2020-12-08,5天
        # 后的日期是2020-12-13是周日,因此将2020-12-14日周一的收盘价作为n_days后的数据
        new_date = date + datetime.timedelta(days=n_days)
        n_days_later_data = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                   symbol,
                                                   query={"date": new_date})
        if n_days_later_data is None:
            i = 1
            while n_days_later_data is None and i <= 10:
                new_date = date + datetime.timedelta(days=n_days+i)
                if new_date.strftime("%Y-%m-%d") in self.trade_date:
                    n_days_later_data = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                               symbol,
                                                               query={"date": new_date})
                i += 1
        try:
            close_price_n_days_later = n_days_later_data["close"][0]
        except Exception:
            close_price_n_days_later = None
        # 判断条件:
        # (1)如果n_days个交易日后且n_days<=10天,则价格上涨(下跌)超过3%,则认为该新闻是利好(利空)消息;如果价格在3%的范围内,则为中性消息
        # (2)如果n_days个交易日后且10<n_days<=15天,则价格上涨(下跌)超过5%,则认为该新闻是利好(利空)消息;如果价格在5%的范围内,则为中性消息
        # (3)如果n_days个交易日后且15<n_days<=30天,则价格上涨(下跌)超过10%,则认为该新闻是利好(利空)消息;如果价格在10%的范围内,则为中性消息
        # (4)如果n_days个交易日后且30<n_days<=60天,则价格上涨(下跌)超过15%,则认为该新闻是利好(利空)消息;如果价格在15%的范围内,则为中性消息
        # Note:中性消息定义为,该消息迅速被市场消化,并没有持续性影响
        param = 0.01
        if n_days <= 10:
            param = 0.03
        elif 10 < n_days <= 15:
            param = 0.05
        elif 15 < n_days <= 30:
            param = 0.10
        elif 30 < n_days <= 60:
            param = 0.15
        if close_price_this_date is not None and close_price_n_days_later is not None:
            if (close_price_n_days_later - close_price_this_date) / close_price_this_date > param:
                return "利好"
            elif (close_price_n_days_later - close_price_this_date) / close_price_this_date < -param:
                return "利空"
            else:
                return "中性"
        else:
            return ""

    def _stock_news_nums_stat(self):
        cols_list = self.database.connect_database(config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE).list_collection_names(session=None)
        for sym in cols_list:
            if self.database.get_collection(config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, sym).estimated_document_count() > config.MINIMUM_STOCK_NEWS_NUM_FOR_ML:
                self.redis_client.lpush("stock_news_num_over_{}".format(config.MINIMUM_STOCK_NEWS_NUM_FOR_ML), sym)
Example #2
0
class GenStockNewsDB(object):
    def __init__(self):
        self.database = Database()
        # 获取从1990-12-19至2020-12-31股票交易日数据
        self.trade_date = ak.tool_trade_date_hist_sina()["trade_date"].tolist()
        self.label_range = {
            3: "3DaysLabel",
            5: "5DaysLabel",
            10: "10DaysLabel",
            15: "15DaysLabel",
            30: "30DaysLabel",
            60: "60DaysLabel"
        }

    def get_all_news_about_specific_stock(self, database_name,
                                          collection_name):
        # 获取collection_name的key值,看是否包含RelatedStockCodes,如果没有说明,没有做将新闻中所涉及的
        # 股票代码保存在新的一列
        _keys_list = list(
            next(
                self.database.get_collection(database_name,
                                             collection_name).find()).keys())
        if "RelatedStockCodes" not in _keys_list:
            tokenization = Tokenization(import_module="jieba",
                                        user_dict="./Leorio/financedict.txt")
            tokenization.update_news_database_rows(database_name,
                                                   collection_name)
        # 创建stock_code为名称的collection
        stock_symbol_list = self.database.get_data(
            "stock", "basic_info", keys=["symbol"])["symbol"].to_list()
        col_names = self.database.connect_database(
            "stocknews").list_collection_names(session=None)
        for symbol in stock_symbol_list:
            if symbol not in col_names:
                _collection = self.database.get_collection(
                    config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, symbol)
                _tmp_num_stat = 0
                for row in self.database.get_collection(
                        database_name, collection_name).find():  # 迭代器
                    if symbol[2:] in row["RelatedStockCodes"].split(" "):
                        # 返回新闻发布后n天的标签
                        _tmp_dict = {}
                        for label_days, key_name in self.label_range.items():
                            _tmp_res = self._label_news(
                                datetime.datetime.strptime(
                                    row["Date"].split(" ")[0], "%Y-%m-%d"),
                                symbol, label_days)
                            if _tmp_res:
                                _tmp_dict.update({key_name: _tmp_res})
                        _data = {
                            "Date": row["Date"],
                            "Url": row["Url"],
                            "Title": row["Title"],
                            "Article": row["Article"],
                            "OriDB": database_name,
                            "OriCOL": collection_name
                        }
                        _data.update(_tmp_dict)
                        _collection.insert_one(_data)
                        _tmp_num_stat += 1
                logging.info(
                    "there are {} news mentioned {} in {} collection need to be fetched ... "
                    .format(_tmp_num_stat, symbol, collection_name))
            else:
                logging.info(
                    "{} has fetched all related news from {}...".format(
                        symbol, collection_name))

    def _label_news(self, date, symbol, n_days):
        """
        :param date: 类型datetime.datetime,表示新闻发布的日期,只包括年月日,不包括具体时刻,如datetime.datetime(2015, 1, 5, 0, 0)
        :param symbol: 类型str,表示股票标的,如sh600000
        :param n_days: 类型int,表示根据多少天后的价格设定标签,如新闻发布后n_days天,如果收盘价格上涨,则认为该则新闻是利好消息
        """
        # 计算新闻发布当天经过n_days天后的具体年月日
        this_date_data = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                symbol,
                                                query={"date": date})
        # 考虑情况:新闻发布日期是非交易日,因此该日期没有价格数据,则往前寻找,比如新闻发布日期是2020-12-12是星期六,
        # 则考虑2020-12-11日的收盘价作为该新闻发布时的数据
        tmp_date = date
        if this_date_data is None:
            i = 1
            while this_date_data is None and i <= 10:
                tmp_date -= datetime.timedelta(days=i)
                # 判断日期是否是交易日,如果是再去查询数据库;如果this_date_data还是NULL值,则说明数据库没有该交易日数据
                if tmp_date.strftime("%Y-%m-%d") in self.trade_date:
                    this_date_data = self.database.get_data(
                        config.STOCK_DATABASE_NAME,
                        symbol,
                        query={"date": tmp_date})
                i += 1
        try:
            close_price_this_date = this_date_data["close"][0]
        except Exception:
            close_price_this_date = None
        # 考虑情况:新闻发布后n_days天是非交易日,或者没有采集到数据,因此向后寻找,如新闻发布日期是2020-12-08,5天
        # 后的日期是2020-12-13是周日,因此将2020-12-14日周一的收盘价作为n_days后的数据
        new_date = date + datetime.timedelta(days=n_days)
        n_days_later_data = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                   symbol,
                                                   query={"date": new_date})
        if n_days_later_data is None:
            i = 1
            while n_days_later_data is None and i <= 10:
                new_date = date + datetime.timedelta(days=n_days + i)
                if new_date.strftime("%Y-%m-%d") in self.trade_date:
                    n_days_later_data = self.database.get_data(
                        config.STOCK_DATABASE_NAME,
                        symbol,
                        query={"date": new_date})
                i += 1
        try:
            close_price_n_days_later = n_days_later_data["close"][0]
        except Exception:
            close_price_n_days_later = None
        # 判断条件:
        # (1)如果n_days个交易日后且n_days<=10天,则价格上涨(下跌)超过3%,则认为该新闻是利好(利空)消息;如果价格在3%的范围内,则为中性消息
        # (2)如果n_days个交易日后且10<n_days<=15天,则价格上涨(下跌)超过5%,则认为该新闻是利好(利空)消息;如果价格在5%的范围内,则为中性消息
        # (3)如果n_days个交易日后且15<n_days<=30天,则价格上涨(下跌)超过10%,则认为该新闻是利好(利空)消息;如果价格在10%的范围内,则为中性消息
        # (4)如果n_days个交易日后且30<n_days<=60天,则价格上涨(下跌)超过15%,则认为该新闻是利好(利空)消息;如果价格在15%的范围内,则为中性消息
        # Note:中性消息定义为,该消息迅速被市场消化,并没有持续性影响
        param = 0.01
        if n_days <= 10:
            param = 0.03
        elif 10 < n_days <= 15:
            param = 0.05
        elif 15 < n_days <= 30:
            param = 0.10
        elif 30 < n_days <= 60:
            param = 0.15
        if close_price_this_date is not None and close_price_n_days_later is not None:
            if (close_price_n_days_later -
                    close_price_this_date) / close_price_this_date > param:
                return "利好"
            elif (close_price_n_days_later -
                  close_price_this_date) / close_price_this_date < -param:
                return "利空"
            else:
                return "中性"
        else:
            return False