Example #1
    def get_url_info(self, url):
        try:
            bs = utils.html_parser(url)
        except Exception:
            return False
        span_list = bs.find_all("span")
        part = bs.find_all("p")
        article = ""
        date = ""
        for span in span_list:
            if "class" in span.attrs and span.text and span["class"] == [
                    "time"
            ]:
                string = span.text.split()
                for dt in string:
                    if dt.find("-") != -1:
                        date += dt + " "
                    elif dt.find(":") != -1:
                        date += dt
                break
        for paragraph in part:
            chn_status = utils.count_chn(str(paragraph))
            possible = chn_status[1]
            if possible > self.is_article_prob:
                article += str(paragraph)
        while article.find("<") != -1 and article.find(">") != -1:
            string = article[article.find("<"):article.find(">") + 1]
            article = article.replace(string, "")
        while article.find("\u3000") != -1:
            article = article.replace("\u3000", "")
        article = " ".join(re.split(" +|\n+", article)).strip()

        return [date, article]
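The two while-loops above strip "<...>" tags and full-width spaces one occurrence at a time. A roughly equivalent single-pass cleanup using a regular-expression substitution is sketched below; the clean_article helper name is an illustration only and is not part of the original class.

import re


def clean_article(raw_html):
    # Remove every "<...>" fragment in one pass, drop full-width spaces,
    # then collapse runs of spaces/newlines, mirroring the loop-based
    # cleanup in get_url_info above.
    text = re.sub(r"<[^>]*>", "", raw_html)
    text = text.replace("\u3000", "")
    return " ".join(re.split(r" +|\n+", text)).strip()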

    def get_url_info(self, url, specific_date):
        try:
            bs = utils.html_parser(url)
        except Exception:
            return False
        date = ""
        for span in bs.find_all("span"):
            if span.contents and span.contents[0] == "jrj_final_date_start":
                date = span.text.replace("\r", "").replace("\n", "")
                break
        if date == "":
            date = specific_date
        article = ""
        for p in bs.find_all("p"):
            if not p.find_all("jrj_final_daohang_start") and p.attrs == {} and \
                    not p.find_all("input") and not p.find_all("a", attrs={"class": "red"}) and not p.find_all("i") and not p.find_all("span"):
                # if p.contents[0] != "jrj_final_daohang_start1" and p.attrs == {} and \
                #         not p.find_all("input") and not p.find_all("a", attrs={"class": "red"}) and not p.find_all("i"):
                article += p.text.replace("\r", "").replace("\n", "").replace("\u3000", "")

        return [date, article]
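Both get_url_info variants follow the same return contract: False when the page cannot be fetched, otherwise a [date, article] pair. A hypothetical caller sketch (the spider instance, URL and date below are placeholders, not values taken from the original code):

# `spider` stands for an instance of the class these methods belong to.
result = spider.get_url_info("https://example.com/news/2020/12/some-article.shtml", "2020-12-09")
if not result:
    print("fetch failed; the caller retries or records the URL")
else:
    date, article = result
    print(date, article[:60])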
Example #3
 def get_historical_news(self, start_page=684):
     date_list = self.db_obj.get_data(self.db_name,
                                      self.col_name,
                                      keys=["Date"])["Date"].to_list()
     name_code_df = self.db_obj.get_data(
         config.STOCK_DATABASE_NAME,
         config.COLLECTION_NAME_STOCK_BASIC_INFO,
         keys=["name", "code"])
     name_code_dict = dict(name_code_df.values)
     if len(date_list) == 0:
         # no historical data yet, so crawl from the very beginning
         crawled_urls_list = []
         page_urls = [
             "{}/{}".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD, page_id)
             for page_id in range(start_page, 0, -1)
         ]
         for page_url in page_urls:
             bs = utils.html_parser(page_url)
             a_list = bs.find_all("a")
             for a in a_list:
                 if "click-statistic" in a.attrs and a.string \
                         and a["click-statistic"].find("Article_") != -1 \
                         and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                     if a["href"] not in crawled_urls_list:
                         result = self.get_url_info(a["href"])
                         while not result:
                             self.terminated_amount += 1
                             if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                 # save URLs that could never be crawled
                                 with open(
                                         config.
                                         RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                         "a+") as file:
                                     file.write("{}\n".format(a["href"]))
                                 logging.info(
                                     "rejected by remote server longer than {} minutes, "
                                     "and the failed url has been written in path {}"
                                     .format(
                                         config.NBD_MAX_REJECTED_AMOUNTS,
                                         config.
                                         RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                     ))
                                 break
                             logging.info(
                                 "rejected by remote server, request {} again after "
                                 "{} seconds...".format(
                                     a["href"],
                                     60 * self.terminated_amount))
                             time.sleep(60 * self.terminated_amount)
                             result = self.get_url_info(a["href"])
                         if not result:
                             # the crawl failed
                             logging.info("[FAILED] {} {}".format(
                                 a.string, a["href"]))
                         else:
                             # got a result, but the article may be empty
                             date, article = result
                             while article == "" and self.is_article_prob >= .1:
                                 self.is_article_prob -= .1
                                 result = self.get_url_info(a["href"])
                                 while not result:
                                     self.terminated_amount += 1
                                     if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                         # save URLs that could never be crawled
                                         with open(
                                                 config.
                                                 RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                                 "a+") as file:
                                             file.write("{}\n".format(
                                                 a["href"]))
                                         logging.info(
                                             "rejected by remote server longer than {} minutes, "
                                             "and the failed url has been written in path {}"
                                             .format(
                                                 config.
                                                 NBD_MAX_REJECTED_AMOUNTS,
                                                 config.
                                                 RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                             ))
                                         break
                                     logging.info(
                                         "rejected by remote server, request {} again after "
                                         "{} seconds...".format(
                                             a["href"],
                                             60 * self.terminated_amount))
                                     time.sleep(60 * self.terminated_amount)
                                     result = self.get_url_info(a["href"])
                                 date, article = result
                             self.is_article_prob = .5
                             if article != "":
                                 related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                                     article, name_code_dict)
                                 data = {
                                     "Date":
                                     date,
                                     # "PageId": page_url.split("/")[-1],
                                     "Url":
                                     a["href"],
                                     "Title":
                                     a.string,
                                     "Article":
                                     article,
                                     "RelatedStockCodes":
                                     " ".join(related_stock_codes_list)
                                 }
                                 # self.col.insert_one(data)
                                 self.db_obj.insert_data(
                                     self.db_name, self.col_name, data)
                                 logging.info("[SUCCESS] {} {} {}".format(
                                     date, a.string, a["href"]))
     else:
         is_stop = False
         start_date = max(date_list)
         page_start_id = 1
         while not is_stop:
             page_url = "{}/{}".format(
                 config.WEBSITES_LIST_TO_BE_CRAWLED_NBD, page_start_id)
             bs = utils.html_parser(page_url)
             a_list = bs.find_all("a")
             for a in a_list:
                 if "click-statistic" in a.attrs and a.string \
                         and a["click-statistic"].find("Article_") != -1 \
                         and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                     result = self.get_url_info(a["href"])
                     while not result:
                         self.terminated_amount += 1
                         if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                             # save URLs that could never be crawled
                             with open(
                                     config.
                                     RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                     "a+") as file:
                                 file.write("{}\n".format(a["href"]))
                             logging.info(
                                 "rejected by remote server longer than {} minutes, "
                                 "and the failed url has been written in path {}"
                                 .format(
                                     config.NBD_MAX_REJECTED_AMOUNTS,
                                     config.
                                     RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                             break
                         logging.info(
                             "rejected by remote server, request {} again after "
                             "{} seconds...".format(
                                 a["href"], 60 * self.terminated_amount))
                         time.sleep(60 * self.terminated_amount)
                         result = self.get_url_info(a["href"])
                     if not result:
                         # the crawl failed
                         logging.info("[FAILED] {} {}".format(
                             a.string, a["href"]))
                     else:
                         # got a result, but the article may be empty
                         date, article = result
                         if date > start_date:
                             while article == "" and self.is_article_prob >= .1:
                                 self.is_article_prob -= .1
                                 result = self.get_url_info(a["href"])
                                 while not result:
                                     self.terminated_amount += 1
                                     if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                         # save URLs that could never be crawled
                                         with open(
                                                 config.
                                                 RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                                 "a+") as file:
                                             file.write("{}\n".format(
                                                 a["href"]))
                                         logging.info(
                                             "rejected by remote server longer than {} minutes, "
                                             "and the failed url has been written in path {}"
                                             .format(
                                                 config.
                                                 NBD_MAX_REJECTED_AMOUNTS,
                                                 config.
                                                 RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                             ))
                                         break
                                     logging.info(
                                         "rejected by remote server, request {} again after "
                                         "{} seconds...".format(
                                             a["href"],
                                             60 * self.terminated_amount))
                                     time.sleep(60 * self.terminated_amount)
                                     result = self.get_url_info(a["href"])
                                 date, article = result
                             self.is_article_prob = .5
                             if article != "":
                                 related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                                     article, name_code_dict)
                                 data = {
                                     "Date":
                                     date,
                                     "Url":
                                     a["href"],
                                     "Title":
                                     a.string,
                                     "Article":
                                     article,
                                     "RelatedStockCodes":
                                     " ".join(related_stock_codes_list)
                                 }
                                 self.db_obj.insert_data(
                                     self.db_name, self.col_name, data)
                                 logging.info("[SUCCESS] {} {} {}".format(
                                     date, a.string, a["href"]))
                         else:
                             is_stop = True
                             break
             if not is_stop:
                 page_start_id += 1
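Every "while not result" block above repeats the same backoff pattern: wait an extra minute per rejection, and once config.NBD_MAX_REJECTED_AMOUNTS is exceeded, append the URL to the failure file and give up. Below is a sketch of how that pattern could be factored into one helper; the fetch_with_backoff name and its standalone signature are assumptions, not part of the original code.

import logging
import time


def fetch_with_backoff(get_url_info, href, max_rejected, failed_url_path):
    # Mirrors the inline retry logic: sleep 60 * attempt seconds between
    # retries; after max_rejected rejections, record the URL and return False.
    result = get_url_info(href)
    attempt = 0
    while not result:
        attempt += 1
        if attempt > max_rejected:
            with open(failed_url_path, "a+") as file:
                file.write("{}\n".format(href))
            logging.info("giving up on %s after %d rejected attempts", href, attempt - 1)
            return False
        logging.info("rejected by remote server, requesting %s again after %d seconds", href, 60 * attempt)
        time.sleep(60 * attempt)
        result = get_url_info(href)
    return result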
Example #4
 def get_realtime_news(self, interval=60):
     page_url = "{}/1".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD)
     logging.info(
         "start real-time crawling of URL -> {}, request every {} secs ... "
         .format(page_url, interval))
     name_code_df = self.db_obj.get_data(
         config.STOCK_DATABASE_NAME,
         config.COLLECTION_NAME_STOCK_BASIC_INFO,
         keys=["name", "code"])
     name_code_dict = dict(name_code_df.values)
     crawled_urls = []
     date_list = self.db_obj.get_data(self.db_name,
                                      self.col_name,
                                      keys=["Date"])["Date"].to_list()
     latest_date = max(date_list)
     while True:
         # poll this URL at a fixed interval
         if len(crawled_urls) > 100:
             # keep the list at 100 entries so it does not grow without bound
             crawled_urls.pop(0)
         bs = utils.html_parser(page_url)
         a_list = bs.find_all("a")
         for a in a_list:
             if "click-statistic" in a.attrs and a.string \
                     and a["click-statistic"].find("Article_") != -1 \
                     and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                 if a["href"] not in crawled_urls:
                     result = self.get_url_info(a["href"])
                     while not result:
                         self.terminated_amount += 1
                         if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                             # save URLs that could never be crawled
                             with open(
                                     config.
                                     RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                     "a+") as file:
                                 file.write("{}\n".format(a["href"]))
                             logging.info(
                                 "rejected by remote server longer than {} minutes, "
                                 "and the failed url has been written in path {}"
                                 .format(
                                     config.NBD_MAX_REJECTED_AMOUNTS,
                                     config.
                                     RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                             break
                         logging.info(
                             "rejected by remote server, request {} again after "
                             "{} seconds...".format(
                                 a["href"], 60 * self.terminated_amount))
                         time.sleep(60 * self.terminated_amount)
                         result = self.get_url_info(a["href"])
                     if not result:
                         # the crawl failed
                         logging.info("[FAILED] {} {}".format(
                             a.string, a["href"]))
                     else:
                         # got a result, but the article may be empty
                         date, article = result
                         if date > latest_date:
                             while article == "" and self.is_article_prob >= .1:
                                 self.is_article_prob -= .1
                                 result = self.get_url_info(a["href"])
                                 while not result:
                                     self.terminated_amount += 1
                                     if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                         # save URLs that could never be crawled
                                         with open(
                                                 config.
                                                 RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                                 "a+") as file:
                                             file.write("{}\n".format(
                                                 a["href"]))
                                         logging.info(
                                             "rejected by remote server longer than {} minutes, "
                                             "and the failed url has been written in path {}"
                                             .format(
                                                 config.
                                                 NBD_MAX_REJECTED_AMOUNTS,
                                                 config.
                                                 RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                             ))
                                         break
                                     logging.info(
                                         "rejected by remote server, request {} again after "
                                         "{} seconds...".format(
                                             a["href"],
                                             60 * self.terminated_amount))
                                     time.sleep(60 * self.terminated_amount)
                                     result = self.get_url_info(a["href"])
                                 date, article = result
                             self.is_article_prob = .5
                             if article != "":
                                 related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                                     article, name_code_dict)
                                 data = {
                                     "Date":
                                     date,
                                     # "PageId": page_url.split("/")[-1],
                                     "Url":
                                     a["href"],
                                     "Title":
                                     a.string,
                                     "Article":
                                     article,
                                     "RelatedStockCodes":
                                     " ".join(related_stock_codes_list)
                                 }
                                 # self.col.insert_one(data)
                                 self.db_obj.insert_data(
                                     self.db_name, self.col_name, data)
                                 crawled_urls.append(a["href"])
                                 logging.info("[SUCCESS] {} {} {}".format(
                                     date, a.string, a["href"]))
         # logging.info("sleep {} secs then request again ... ".format(interval))
         time.sleep(interval)

    def get_realtime_news(self, interval=60):
        page_url = "{}/1".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD)
        logging.info(
            "start real-time crawling of URL -> {}, request every {} secs ... "
            .format(page_url, interval))
        name_code_df = self.db_obj.get_data(
            config.STOCK_DATABASE_NAME,
            config.COLLECTION_NAME_STOCK_BASIC_INFO,
            keys=["name", "code"])
        name_code_dict = dict(name_code_df.values)
        crawled_urls = []
        date_list = self.db_obj.get_data(self.db_name,
                                         self.col_name,
                                         keys=["Date"])["Date"].to_list()
        latest_date = max(date_list)
        while True:
            # poll this URL at a fixed interval
            if len(crawled_urls) > 100:
                # keep the list at 100 entries so it does not grow without bound
                crawled_urls.pop(0)
            bs = utils.html_parser(page_url)
            a_list = bs.find_all("a")
            for a in a_list:
                if "click-statistic" in a.attrs and a.string \
                        and a["click-statistic"].find("Article_") != -1 \
                        and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                    if a["href"] not in crawled_urls:
                        result = self.get_url_info(a["href"])
                        while not result:
                            self.terminated_amount += 1
                            if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                # save URLs that could never be crawled
                                with open(
                                        config.
                                        RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                        "a+") as file:
                                    file.write("{}\n".format(a["href"]))
                                logging.info(
                                    "rejected by remote server longer than {} minutes, "
                                    "and the failed url has been written in path {}"
                                    .format(
                                        config.NBD_MAX_REJECTED_AMOUNTS,
                                        config.
                                        RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                                break
                            logging.info(
                                "rejected by remote server, request {} again after "
                                "{} seconds...".format(
                                    a["href"], 60 * self.terminated_amount))
                            time.sleep(60 * self.terminated_amount)
                            result = self.get_url_info(a["href"])
                        if not result:
                            # the crawl failed
                            logging.info("[FAILED] {} {}".format(
                                a.string, a["href"]))
                        else:
                            # got a result, but the article may be empty
                            date, article = result
                            if date > latest_date:
                                while article == "" and self.is_article_prob >= .1:
                                    self.is_article_prob -= .1
                                    result = self.get_url_info(a["href"])
                                    while not result:
                                        self.terminated_amount += 1
                                        if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                            # save URLs that could never be crawled
                                            with open(
                                                    config.
                                                    RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                                    "a+") as file:
                                                file.write("{}\n".format(
                                                    a["href"]))
                                            logging.info(
                                                "rejected by remote server longer than {} minutes, "
                                                "and the failed url has been written in path {}"
                                                .format(
                                                    config.
                                                    NBD_MAX_REJECTED_AMOUNTS,
                                                    config.
                                                    RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                                ))
                                            break
                                        logging.info(
                                            "rejected by remote server, request {} again after "
                                            "{} seconds...".format(
                                                a["href"],
                                                60 * self.terminated_amount))
                                        time.sleep(60 * self.terminated_amount)
                                        result = self.get_url_info(a["href"])
                                    date, article = result
                                self.is_article_prob = .5
                                if article != "":
                                    related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                                        article, name_code_dict)
                                    self.db_obj.insert_data(
                                        self.db_name,
                                        self.col_name,
                                        {
                                            "Date":
                                            date,
                                            # "PageId": page_url.split("/")[-1],
                                            "Url":
                                            a["href"],
                                            "Title":
                                            a.string,
                                            "Article":
                                            article,
                                            "RelatedStockCodes":
                                            " ".join(related_stock_codes_list)
                                        })
                                    self.redis_client.lpush(
                                        config.CACHE_NEWS_LIST_NAME,
                                        json.dumps({
                                            "Date":
                                            date,
                                            # "PageId": page_url.split("/")[-1],
                                            "Url":
                                            a["href"],
                                            "Title":
                                            a.string,
                                            "Article":
                                            article,
                                            "RelatedStockCodes":
                                            " ".join(related_stock_codes_list),
                                            "OriDB":
                                            config.DATABASE_NAME,
                                            "OriCOL":
                                            config.COLLECTION_NAME_NBD
                                        }))
                                    crawled_urls.append(a["href"])
                                    logging.info("[SUCCESS] {} {} {}".format(
                                        date, a.string, a["href"]))
            # logging.info("sleep {} secs then request again ... ".format(interval))
            time.sleep(interval)
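Besides being written to MongoDB, each new article is pushed onto a Redis list (config.CACHE_NEWS_LIST_NAME) as a JSON string. A minimal consumer sketch, assuming a standard redis-py client; the connection settings and the literal list name below are assumptions, not values from the original config.

import json

import redis

client = redis.StrictRedis(host="localhost", port=6379, db=0)  # assumed connection settings

raw = client.rpop("news_list")  # placeholder for config.CACHE_NEWS_LIST_NAME
if raw is not None:
    news = json.loads(raw)
    # Keys mirror the json.dumps payload pushed by get_realtime_news above.
    print(news["Date"], news["Title"], news["OriDB"], news["OriCOL"])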


# """
# Example-1:
# Crawl historical news data
# """
# if __name__ == "__main__":
#     nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
#     nbd_spyder.get_historical_news(start_page=684)
#
#     Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
#     DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()

# """
# Example-2:
# Crawl real-time news data
# """
# if __name__ == '__main__':
#     from Kite import config
#
#     from Killua.denull import DeNull
#     from Killua.deduplication import Deduplication
#
#     from Gon.nbdspyder import NbdSpyder
#
#     # If there is no historical data, crawl from scratch; if historical data exists, resume from the most recent timestamp
#     # e.g. if the latest news time in the historical data is "2020-12-09 20:37:10", crawling starts from that time
#     nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
#     nbd_spyder.get_historical_news()
#
#     Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
#     DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
#
#     nbd_spyder.get_realtime_news()
Example #6
    def get_historical_news(self, url, start_date, end_date):
        # # Pull the news already crawled between start_date and latest_date_str from the
        # # database to avoid re-crawling. For example, if crawling previously stopped at the
        # # 2016-10-10 15:00:00 mark, re-running from 2015-01-01 (user-chosen) without changing
        # # any parameters would insert a lot of duplicates, so a light de-duplication is done
        # # here. Running straight from the latest timestamp is perfectly fine, but re-running
        # # from 2015-01-01 (user-chosen) gives previously failed URLs another try
        # extracted_data_list = self.extract_data(["Date"])[0]
        # if len(extracted_data_list) != 0:
        #     latest_date_str = max(extracted_data_list).split(" ")[0]
        # else:
        #     latest_date_str = start_date
        # logging.info("latest time in database is {} ... ".format(latest_date_str))
        # crawled_urls_list = list()
        # for _date in utils.get_date_list_from_range(start_date, latest_date_str):
        #     query_results = self.query_news("Date", _date)
        #     for qr in query_results:
        #         crawled_urls_list.append(qr["Url"])
        # # crawled_urls_list = self.extract_data(["Url"])[0]  # abandoned
        # logging.info("the length of crawled data from {} to {} is {} ... ".format(start_date,
        #                                                                           latest_date_str,
        #                                                                           len(crawled_urls_list)))

        crawled_urls_list = []
        dates_list = utils.get_date_list_from_range(start_date, end_date)
        dates_separated_into_ranges_list = utils.gen_dates_list(
            dates_list, config.JRJ_DATE_RANGE)

        for dates_range in dates_separated_into_ranges_list:
            for date in dates_range:
                first_url = "{}/{}/{}_1.shtml".format(
                    url,
                    date.replace("-", "")[0:6], date.replace("-", ""))
                max_pages_num = utils.search_max_pages_num(first_url, date)
                for num in range(1, max_pages_num + 1):
                    _url = "{}/{}/{}_{}.shtml".format(
                        url,
                        date.replace("-", "")[0:6], date.replace("-", ""),
                        str(num))
                    bs = utils.html_parser(_url)
                    a_list = bs.find_all("a")
                    for a in a_list:
                        if "href" in a.attrs and a.string and \
                                a["href"].find("/{}/{}/".format(date.replace("-", "")[:4],
                                                                date.replace("-", "")[4:6])) != -1:
                            if a["href"] not in crawled_urls_list:
                                # Only write to the database if the title does not contain phrases such as
                                # "收盘" or "报于", since news with such titles is mostly machine-generated
                                if a.string.find("收盘") == -1 and a.string.find("报于") == -1 and \
                                        a.string.find("新三板挂牌上市") == -1:
                                    result = self.get_url_info(a["href"], date)
                                    while not result:
                                        self.terminated_amount += 1
                                        if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                            # save URLs that could never be crawled
                                            with open(
                                                    config.
                                                    RECORD_JRJ_FAILED_URL_TXT_FILE_PATH,
                                                    "a+") as file:
                                                file.write("{}\n".format(
                                                    a["href"]))
                                            logging.info(
                                                "rejected by remote server longer than {} minutes, "
                                                "and the failed url has been written in path {}"
                                                .format(
                                                    config.
                                                    JRJ_MAX_REJECTED_AMOUNTS,
                                                    config.
                                                    RECORD_JRJ_FAILED_URL_TXT_FILE_PATH
                                                ))
                                            break
                                        logging.info(
                                            "rejected by remote server, request {} again after "
                                            "{} seconds...".format(
                                                a["href"],
                                                60 * self.terminated_amount))
                                        time.sleep(60 * self.terminated_amount)
                                        result = self.get_url_info(
                                            a["href"], date)
                                    if not result:
                                        # the crawl failed
                                        logging.info("[FAILED] {} {}".format(
                                            a.string, a["href"]))
                                    else:
                                        # got a result, but the article may be empty
                                        article_specific_date, article = result
                                        while article == "" and self.is_article_prob >= .1:
                                            self.is_article_prob -= .1
                                            result = self.get_url_info(
                                                a["href"], date)
                                            while not result:
                                                self.terminated_amount += 1
                                                if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                                    # save URLs that could never be crawled
                                                    with open(
                                                            config.
                                                            RECORD_JRJ_FAILED_URL_TXT_FILE_PATH,
                                                            "a+") as file:
                                                        file.write(
                                                            "{}\n".format(
                                                                a["href"]))
                                                    logging.info(
                                                        "rejected by remote server longer than {} minutes, "
                                                        "and the failed url has been written in path {}"
                                                        .format(
                                                            config.
                                                            JRJ_MAX_REJECTED_AMOUNTS,
                                                            config.
                                                            RECORD_JRJ_FAILED_URL_TXT_FILE_PATH
                                                        ))
                                                    break
                                                logging.info(
                                                    "rejected by remote server, request {} again after "
                                                    "{} seconds...".format(
                                                        a["href"], 60 *
                                                        self.terminated_amount)
                                                )
                                                time.sleep(
                                                    60 *
                                                    self.terminated_amount)
                                                result = self.get_url_info(
                                                    a["href"], date)
                                            article_specific_date, article = result
                                        self.is_article_prob = .5
                                        if article != "":
                                            data = {
                                                "Date": article_specific_date,
                                                "Url": a["href"],
                                                "Title": a.string,
                                                "Article": article
                                            }
                                            # self.col.insert_one(data)
                                            self.db_obj.insert_data(
                                                self.db_name, self.col_name,
                                                data)
                                            logging.info(
                                                "[SUCCESS] {} {} {}".format(
                                                    article_specific_date,
                                                    a.string, a["href"]))
                                    self.terminated_amount = 0  # reset the counter once this URL has been handled
                                else:
                                    logging.info("[QUIT] {}".format(a.string))
Example #7
    def get_historical_news(self, start_page):
        extracted_data_list = self.extract_data(["PageId"])[0]
        if len(extracted_data_list) != 0:
            latest_page_id = min(extracted_data_list)
        else:
            latest_page_id = start_page
        crawled_urls_list = list()
        for page_id in range(start_page, int(latest_page_id) - 1, -1):
            query_results = self.query_news("PageId", page_id)
            for qr in query_results:
                crawled_urls_list.append(qr["Url"])
        # crawled_urls_list = self.extract_data(["Url"])[0]  # abandoned
        logging.info(
            "the length of crawled data from page {} to page {} is {} ... ".
            format(start_page, latest_page_id, len(crawled_urls_list)))

        page_urls = [
            "{}/{}".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD, page_id)
            for page_id in range(start_page, 0, -1)
        ]
        for page_url in page_urls:
            bs = utils.html_parser(page_url)
            a_list = bs.find_all("a")
            for a in a_list:
                if "click-statistic" in a.attrs and a.string \
                        and a["click-statistic"].find("Article_") != -1 \
                        and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                    if a["href"] not in crawled_urls_list:
                        result = self.get_url_info(a["href"])
                        while not result:
                            self.terminated_amount += 1
                            if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                # save URLs that could never be crawled
                                with open(
                                        config.
                                        RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                        "a+") as file:
                                    file.write("{}\n".format(a["href"]))
                                logging.info(
                                    "rejected by remote server longer than {} minutes, "
                                    "and the failed url has been written in path {}"
                                    .format(
                                        config.NBD_MAX_REJECTED_AMOUNTS,
                                        config.
                                        RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                                break
                            logging.info(
                                "rejected by remote server, request {} again after "
                                "{} seconds...".format(
                                    a["href"], 60 * self.terminated_amount))
                            time.sleep(60 * self.terminated_amount)
                            result = self.get_url_info(a["href"])
                        if not result:
                            # the crawl failed
                            logging.info("[FAILED] {} {}".format(
                                a.string, a["href"]))
                        else:
                            # got a result, but the article may be empty
                            date, article = result
                            while article == "" and self.is_article_prob >= .1:
                                self.is_article_prob -= .1
                                result = self.get_url_info(a["href"])
                                while not result:
                                    self.terminated_amount += 1
                                    if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                        # save URLs that could never be crawled
                                        with open(
                                                config.
                                                RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                                "a+") as file:
                                            file.write("{}\n".format(
                                                a["href"]))
                                        logging.info(
                                            "rejected by remote server longer than {} minutes, "
                                            "and the failed url has been written in path {}"
                                            .format(
                                                config.
                                                NBD_MAX_REJECTED_AMOUNTS,
                                                config.
                                                RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                            ))
                                        break
                                    logging.info(
                                        "rejected by remote server, request {} again after "
                                        "{} seconds...".format(
                                            a["href"],
                                            60 * self.terminated_amount))
                                    time.sleep(60 * self.terminated_amount)
                                    result = self.get_url_info(a["href"])
                                date, article = result
                            self.is_article_prob = .5
                            if article != "":
                                data = {
                                    "Date": date,
                                    "PageId": page_url.split("/")[-1],
                                    "Url": a["href"],
                                    "Title": a.string,
                                    "Article": article
                                }
                                self.col.insert_one(data)
                                logging.info("[SUCCESS] {} {} {}".format(
                                    date, a.string, a["href"]))

 def get_realtime_news(self, url, category_chn=None, interval=60):
     logging.info(
         "start real-time crawling of URL -> {}, request every {} secs ... "
         .format(url, interval))
     assert category_chn is not None
     # TODO: the amount of data crawled from cnstock is small, so for now all historical URLs are pulled for de-duplication; the de-duplication strategy will be revised later
     name_code_df = self.db_obj.get_data(
         config.STOCK_DATABASE_NAME,
         config.COLLECTION_NAME_STOCK_BASIC_INFO,
         keys=["name", "code"])
     name_code_dict = dict(name_code_df.values)
     crawled_urls = self.db_obj.get_data(self.db_name,
                                         self.col_name,
                                         keys=["Url"])["Url"].to_list()
     while True:
         # poll this URL at a fixed interval
         bs = utils.html_parser(url)
         for li in bs.find_all("li", attrs={"class": ["newslist"]}):
             a = li.find_all("h2")[0].find("a")
             if a["href"] not in crawled_urls:  # latest_3_days_crawled_href
                 result = self.get_url_info(a["href"])
                 while not result:
                     self.terminated_amount += 1
                     if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                         # save URLs that could never be crawled
                         with open(
                                 config.
                                 RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH,
                                 "a+") as file:
                             file.write("{}\n".format(a["href"]))
                         logging.info(
                             "rejected by remote server longer than {} minutes, "
                             "and the failed url has been written in path {}"
                             .format(
                                 config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                 config.
                                 RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                         break
                     logging.info(
                         "rejected by remote server, request {} again after "
                         "{} seconds...".format(a["href"], 60 *
                                                self.terminated_amount))
                     time.sleep(60 * self.terminated_amount)
                     result = self.get_url_info(a["href"])
                 if not result:
                     # the crawl failed
                     logging.info("[FAILED] {} {}".format(
                         a["title"], a["href"]))
                 else:
                     # got a result, but the article may be empty
                     date, article = result
                     while article == "" and self.is_article_prob >= .1:
                         self.is_article_prob -= .1
                         result = self.get_url_info(a["href"])
                         while not result:
                             self.terminated_amount += 1
                             if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                                 # save URLs that could never be crawled
                                 with open(
                                         config.
                                         RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH,
                                         "a+") as file:
                                     file.write("{}\n".format(a["href"]))
                                 logging.info(
                                     "rejected by remote server longer than {} minutes, "
                                     "and the failed url has been written in path {}"
                                     .format(
                                         config.
                                         CNSTOCK_MAX_REJECTED_AMOUNTS,
                                         config.
                                         RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH
                                     ))
                                 break
                             logging.info(
                                 "rejected by remote server, request {} again after "
                                 "{} seconds...".format(
                                     a["href"],
                                     60 * self.terminated_amount))
                             time.sleep(60 * self.terminated_amount)
                             result = self.get_url_info(a["href"])
                         date, article = result
                     self.is_article_prob = .5
                     if article != "":
                         related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                             article, name_code_dict)
                         self.db_obj.insert_data(
                             self.db_name, self.col_name, {
                                 "Date":
                                 date,
                                 "Category":
                                 category_chn,
                                 "Url":
                                 a["href"],
                                 "Title":
                                 a["title"],
                                 "Article":
                                 article,
                                 "RelatedStockCodes":
                                 " ".join(related_stock_codes_list)
                             })
                         self.redis_client.lpush(
                             config.CACHE_NEWS_LIST_NAME,
                             json.dumps({
                                 "Date":
                                 date,
                                 "Category":
                                 category_chn,
                                 "Url":
                                 a["href"],
                                 "Title":
                                 a["title"],
                                 "Article":
                                 article,
                                 "RelatedStockCodes":
                                 " ".join(related_stock_codes_list),
                                 "OriDB":
                                 config.DATABASE_NAME,
                                 "OriCOL":
                                 config.COLLECTION_NAME_CNSTOCK
                             }))
                         logging.info("[SUCCESS] {} {} {}".format(
                             date, a["title"], a["href"]))
                         crawled_urls.append(a["href"])
         # logging.info("sleep {} secs then request {} again ... ".format(interval, url))
         time.sleep(interval)

    def get_historical_news(self, url, start_date=None, end_date=None):
        name_code_df = self.db_obj.get_data(config.STOCK_DATABASE_NAME,
                                            config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                            keys=["name", "code"])
        name_code_dict = dict(name_code_df.values)

        crawled_urls_list = []
        if end_date is None:
            end_date = datetime.datetime.now().strftime("%Y-%m-%d")

        if start_date is None:
            # If start_date is None, backfill from the day after the most recent date in the historical database
            # e.g. history_latest_date_str -> "2020-12-08"
            #      history_latest_date_dt -> datetime.date(2020, 12, 8)
            #      start_date -> "2020-12-09"
            history_latest_date_list = self.db_obj.get_data(self.db_name,
                                                            self.col_name,
                                                            keys=["Date"])["Date"].to_list()
            if len(history_latest_date_list) != 0:
                history_latest_date_str = max(history_latest_date_list).split(" ")[0]
                history_latest_date_dt = datetime.datetime.strptime(history_latest_date_str, "%Y-%m-%d").date()
                offset = datetime.timedelta(days=1)
                start_date = (history_latest_date_dt + offset).strftime('%Y-%m-%d')
            else:
                start_date = config.JRJ_REQUEST_DEFAULT_DATE

        dates_list = utils.get_date_list_from_range(start_date, end_date)
        dates_separated_into_ranges_list = utils.gen_dates_list(dates_list, config.JRJ_DATE_RANGE)

        for dates_range in dates_separated_into_ranges_list:
            for date in dates_range:
                first_url = "{}/{}/{}_1.shtml".format(url, date.replace("-", "")[0:6], date.replace("-", ""))
                max_pages_num = utils.search_max_pages_num(first_url, date)
                for num in range(1, max_pages_num + 1):
                    _url = "{}/{}/{}_{}.shtml".format(url, date.replace("-", "")[0:6], date.replace("-", ""), str(num))
                    bs = utils.html_parser(_url)
                    a_list = bs.find_all("a")
                    for a in a_list:
                        if "href" in a.attrs and a.string and \
                                a["href"].find("/{}/{}/".format(date.replace("-", "")[:4],
                                                                date.replace("-", "")[4:6])) != -1:
                            if a["href"] not in crawled_urls_list:
                                # Only write to the database if the title does not contain phrases such as
                                # "收盘" or "报于", since news with such titles is mostly machine-generated
                                if a.string.find("收盘") == -1 and a.string.find("报于") == -1 and \
                                        a.string.find("新三板挂牌上市") == -1:
                                    result = self.get_url_info(a["href"], date)
                                    while not result:
                                        self.terminated_amount += 1
                                        if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                            # save URLs that could never be crawled
                                            with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                file.write("{}\n".format(a["href"]))
                                            logging.info("rejected by remote server longer than {} minutes, "
                                                         "and the failed url has been written in path {}"
                                                         .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                                 config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                            break
                                        logging.info("rejected by remote server, request {} again after "
                                                     "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                        time.sleep(60 * self.terminated_amount)
                                        result = self.get_url_info(a["href"], date)
                                    if not result:
                                        # the crawl failed
                                        logging.info("[FAILED] {} {}".format(a.string, a["href"]))
                                    else:
                                        # got a result, but the article may be empty
                                        article_specific_date, article = result
                                        while article == "" and self.is_article_prob >= .1:
                                            self.is_article_prob -= .1
                                            result = self.get_url_info(a["href"], date)
                                            while not result:
                                                self.terminated_amount += 1
                                                if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                                    # save URLs that could never be crawled
                                                    with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                        file.write("{}\n".format(a["href"]))
                                                    logging.info("rejected by remote server longer than {} minutes, "
                                                                 "and the failed url has been written in path {}"
                                                                 .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                                         config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                                    break
                                                logging.info("rejected by remote server, request {} again after "
                                                             "{} seconds...".format(a["href"],
                                                                                    60 * self.terminated_amount))
                                                time.sleep(60 * self.terminated_amount)
                                                result = self.get_url_info(a["href"], date)
                                            article_specific_date, article = result
                                        self.is_article_prob = .5
                                        if article != "":
                                            related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                                                article, name_code_dict)
                                            data = {"Date": article_specific_date,
                                                    "Url": a["href"],
                                                    "Title": a.string,
                                                    "Article": article,
                                                    "RelatedStockCodes": " ".join(related_stock_codes_list)}
                                            # self.col.insert_one(data)
                                            self.db_obj.insert_data(self.db_name, self.col_name, data)
                                            logging.info("[SUCCESS] {} {} {}".format(
                                                article_specific_date, a.string, a["href"]))
                                    self.terminated_amount = 0  # reset the retry counter once this link has been handled
                                else:
                                    logging.info("[QUIT] {}".format(a.string))

  def get_realtime_news(self, interval=60):
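      """Poll today's JRJ listing pages every `interval` seconds and store newly published articles."""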
     name_code_df = self.db_obj.get_data(config.STOCK_DATABASE_NAME,
                                         config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                         keys=["name", "code"])
     name_code_dict = dict(name_code_df.values)
     crawled_urls_list = []
     is_change_date = False
     last_date = datetime.datetime.now().strftime("%Y-%m-%d")
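      # crawl in an endless loop; when the calendar date rolls over, clear the per-day URL cache
      # so the new day's listing pages are crawled from scratch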
     while True:
         today_date = datetime.datetime.now().strftime("%Y-%m-%d")
         if today_date != last_date:
             is_change_date = True
             last_date = today_date
         if is_change_date:
             crawled_urls_list = []
             is_change_date = False
         _url = "{}/{}/{}_1.shtml".format(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ,
                                          today_date.replace("-", "")[0:6],
                                          today_date.replace("-", ""))
         max_pages_num = utils.search_max_pages_num(_url, today_date)
         for num in range(1, max_pages_num + 1):
             _url = "{}/{}/{}_{}.shtml".format(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ,
                                               today_date.replace("-", "")[0:6],
                                               today_date.replace("-", ""),
                                               str(num))
             bs = utils.html_parser(_url)
             a_list = bs.find_all("a")
             for a in a_list:
                 if "href" in a.attrs and a.string and \
                         a["href"].find("/{}/{}/".format(today_date.replace("-", "")[:4],
                                                         today_date.replace("-", "")[4:6])) != -1:
                     if a["href"] not in crawled_urls_list:
                          # titles containing phrases such as "收盘" (market close) or "报于" are mostly
                          # auto-generated market reports, so only store articles whose title contains none of them
                         if a.string.find("收盘") == -1 and a.string.find("报于") == -1 and \
                                 a.string.find("新三板挂牌上市") == -1:
                             result = self.get_url_info(a["href"], today_date)
                             while not result:
                                 self.terminated_amount += 1
                                 if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                      # record URLs that could never be crawled
                                     with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                         file.write("{}\n".format(a["href"]))
                                     logging.info("rejected by remote server longer than {} minutes, "
                                                  "and the failed url has been written in path {}"
                                                  .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                          config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                     break
                                 logging.info("rejected by remote server, request {} again after "
                                              "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                 time.sleep(60 * self.terminated_amount)
                                 result = self.get_url_info(a["href"], today_date)
                             if not result:
                                  # the crawl failed
                                 logging.info("[FAILED] {} {}".format(a.string, a["href"]))
                             else:
                                  # a result came back but the article text is empty
                                 article_specific_date, article = result
                                 while article == "" and self.is_article_prob >= .1:
                                     self.is_article_prob -= .1
                                     result = self.get_url_info(a["href"], today_date)
                                     while not result:
                                         self.terminated_amount += 1
                                         if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                              # record URLs that could never be crawled
                                             with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                 file.write("{}\n".format(a["href"]))
                                             logging.info("rejected by remote server longer than {} minutes, "
                                                          "and the failed url has been written in path {}"
                                                          .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                                  config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                             break
                                         logging.info("rejected by remote server, request {} again after "
                                                      "{} seconds...".format(a["href"],
                                                                             60 * self.terminated_amount))
                                         time.sleep(60 * self.terminated_amount)
                                         result = self.get_url_info(a["href"], today_date)
                                     article_specific_date, article = result
                                 self.is_article_prob = .5
                                 if article != "":
                                     related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                                       name_code_dict)
                                     data = {"Date": article_specific_date,
                                             "Url": a["href"],
                                             "Title": a.string,
                                             "Article": article,
                                             "RelatedStockCodes": " ".join(related_stock_codes_list)}
                                     # self.col.insert_one(data)
                                     self.db_obj.insert_data(self.db_name, self.col_name, data)
                                     logging.info("[SUCCESS] {} {} {}".format(article_specific_date,
                                                                              a.string,
                                                                              a["href"]))
                              self.terminated_amount = 0  # reset the retry counter once this link has been handled
                         else:
                             logging.info("[QUIT] {}".format(a.string))
                         crawled_urls_list.append(a["href"])
         # logging.info("sleep {} secs then request again ... ".format(interval))
         time.sleep(interval)

  def get_realtime_news(self, url, category_chn=None, interval=60):
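      """Poll the given listing page every `interval` seconds and store articles that are not yet in the database."""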
     logging.info("start real-time crawling of URL -> {} ... ".format(url))
     assert category_chn is not None
     # today_date = time.strftime("%Y-%m-%d", time.localtime(time.time()))
     # last_date = utils.get_date_before(1)
     # last_2_date = utils.get_date_before(2)
     # latest_3_days_crawled_href = self.db_obj.get_data(self.db_name,
     #                                                   self.col_name,
     #                                                   query={"Date": {"$regex": today_date}},
     #                                                   keys=["Url"])["Url"].to_list()
     # latest_3_days_crawled_href.extend(self.db_obj.get_data(self.db_name,
     #                                                        self.col_name,
     #                                                        query={"Date": {"$regex": last_date}},
     #                                                        keys=["Url"])["Url"].to_list())
     # latest_3_days_crawled_href.extend(self.db_obj.get_data(self.db_name,
     #                                                        self.col_name,
     #                                                        query={"Date": {"$regex": last_2_date}},
     #                                                        keys=["Url"])["Url"].to_list())
     crawled_urls = self.db_obj.get_data(self.db_name,
                                         self.col_name,
                                         keys=["Url"])["Url"].to_list()
     while True:
          # poll this page at a fixed interval
         bs = utils.html_parser(url)
         for li in bs.find_all("li", attrs={"class": ["newslist"]}):
             a = li.find_all("h2")[0].find("a")
             if a["href"] not in crawled_urls:  # latest_3_days_crawled_href
                 result = self.get_url_info(a["href"])
                 while not result:
                     self.terminated_amount += 1
                     if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                          # record URLs that could never be crawled
                         with open(config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                             file.write("{}\n".format(a["href"]))
                         logging.info("rejected by remote server longer than {} minutes, "
                                      "and the failed url has been written in path {}"
                                      .format(config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                              config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                         break
                     logging.info("rejected by remote server, request {} again after "
                                  "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                     time.sleep(60 * self.terminated_amount)
                     result = self.get_url_info(a["href"])
                 if not result:
                      # the crawl failed
                     logging.info("[FAILED] {} {}".format(a["title"], a["href"]))
                 else:
                      # a result came back but the article text is empty
                     date, article = result
                     while article == "" and self.is_article_prob >= .1:
                         self.is_article_prob -= .1
                         result = self.get_url_info(a["href"])
                         while not result:
                             self.terminated_amount += 1
                             if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                                  # record URLs that could never be crawled
                                 with open(config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                     file.write("{}\n".format(a["href"]))
                                 logging.info("rejected by remote server longer than {} minutes, "
                                              "and the failed url has been written in path {}"
                                              .format(config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                                      config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                                 break
                             logging.info("rejected by remote server, request {} again after "
                                          "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                             time.sleep(60 * self.terminated_amount)
                             result = self.get_url_info(a["href"])
                         date, article = result
                     self.is_article_prob = .5
                     if article != "":
                         data = {"Date": date,
                                 "Category": category_chn,
                                 "Url": a["href"],
                                 "Title": a["title"],
                                 "Article": article}
                         # self.col.insert_one(data)
                         self.db_obj.insert_data(self.db_name, self.col_name, data)
                         logging.info("[SUCCESS] {} {} {}".format(date, a["title"], a["href"]))
                         crawled_urls.append(a["href"])
         logging.info("sleep {} secs then request {} again ... ".format(interval, url))
         time.sleep(interval)