def get_url_info(self, url):
    """Fetch an NBD news page and extract its publish date and article text.

    :param url: URL of the news article page.
    :return: ``[date, article]`` on success (either element may be an empty
        string when extraction finds nothing), or ``False`` when the page
        could not be fetched/parsed at all.
    """
    try:
        bs = utils.html_parser(url)
    except Exception:
        # Fetch/parse failure — signal the caller so it can retry/back off.
        return False
    span_list = bs.find_all("span")
    part = bs.find_all("p")
    article = ""
    date = ""
    # The publish date lives in a <span class="time"> whose text holds a
    # "YYYY-MM-DD"-like token (contains "-") and a time token (contains ":").
    for span in span_list:
        if "class" in span.attrs and span.text and span["class"] == [
                "time"
        ]:
            for dt in span.text.split():
                if dt.find("-") != -1:
                    date += dt + " "
                elif dt.find(":") != -1:
                    date += dt
            break
    # Keep only paragraphs whose Chinese-character ratio (as reported by
    # utils.count_chn) exceeds the threshold — a heuristic separating real
    # article text from boilerplate. Collect then join (avoids quadratic +=).
    kept_paragraphs = []
    for paragraph in part:
        chn_status = utils.count_chn(str(paragraph))
        possible = chn_status[1]
        if possible > self.is_article_prob:
            kept_paragraphs.append(str(paragraph))
    article = "".join(kept_paragraphs)
    # Strip HTML tags with a single regex pass. The previous scan-and-replace
    # loop (find "<" .. find ">") could loop forever when a bare ">" occurred
    # before the first "<": the slice became empty and replace("", "") left
    # the string unchanged while the loop condition stayed true.
    article = re.sub(r"<[^>]*>", "", article)
    # Remove full-width (ideographic) spaces; str.replace already removes
    # every occurrence, so no loop is needed.
    article = article.replace("\u3000", "")
    article = " ".join(re.split(" +|\n+", article)).strip()
    return [date, article]
def get_url_info(self, url, specific_date):
    """Fetch a JRJ news page and extract its publish date and article text.

    :param url: URL of the news article page.
    :param specific_date: fallback date string used when the page does not
        carry its own "jrj_final_date_start" marker.
    :return: ``[date, article]`` on success (``article`` may be an empty
        string), or ``False`` when the page could not be fetched/parsed.
    """
    try:
        bs = utils.html_parser(url)
    except Exception:
        # Fetch/parse failure — signal the caller so it can retry/back off.
        return False
    date = ""
    for span in bs.find_all("span"):
        # Guard against empty <span></span> elements: span.contents is []
        # there and the unguarded span.contents[0] raised IndexError, which
        # escaped (the try above only wraps html_parser) and killed the crawl.
        if span.contents and span.contents[0] == "jrj_final_date_start":
            date = span.text.replace("\r", "").replace("\n", "")
            break
    if date == "":
        date = specific_date
    article = ""
    # Accept only "plain" paragraphs: no attributes and none of the nested
    # elements that mark navigation bars, inputs, red links, icons or spans.
    for p in bs.find_all("p"):
        if not p.find_all("jrj_final_daohang_start") and p.attrs == {} and \
                not p.find_all("input") and not p.find_all("a", attrs={"class": "red"}) and \
                not p.find_all("i") and not p.find_all("span"):
            article += p.text.replace("\r", "").replace("\n", "").replace("\u3000", "")
    return [date, article]
def get_historical_news(self, start_page=684):
    """Crawl historical NBD news into the database.

    Two modes, chosen by whether the collection already holds data:

    * empty collection  -> full backfill: crawl every listing page from
      ``start_page`` down to page 1;
    * non-empty         -> incremental: crawl forward from page 1 and stop
      as soon as an article dated no later than the newest stored date
      appears.

    :param start_page: highest listing-page id used for a full backfill.
    """
    date_list = self.db_obj.get_data(self.db_name,
                                     self.col_name,
                                     keys=["Date"])["Date"].to_list()
    # name -> code mapping used to tag each article with related stocks.
    name_code_df = self.db_obj.get_data(
        config.STOCK_DATABASE_NAME,
        config.COLLECTION_NAME_STOCK_BASIC_INFO,
        keys=["name", "code"])
    name_code_dict = dict(name_code_df.values)
    if len(date_list) == 0:
        # No historical data yet -> crawl everything from scratch.
        crawled_urls_list = []
        page_urls = [
            "{}/{}".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD, page_id)
            for page_id in range(start_page, 0, -1)
        ]
        for page_url in page_urls:
            bs = utils.html_parser(page_url)
            a_list = bs.find_all("a")
            for a in a_list:
                # Keep only genuine article links.
                if "click-statistic" in a.attrs and a.string \
                        and a["click-statistic"].find("Article_") != -1 \
                        and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                    if a["href"] not in crawled_urls_list:
                        result = self.get_url_info(a["href"])
                        while not result:
                            # Linear back-off; give up after
                            # NBD_MAX_REJECTED_AMOUNTS attempts.
                            self.terminated_amount += 1
                            if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                # Persist URLs that could never be crawled.
                                with open(
                                        config.
                                        RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                        "a+") as file:
                                    file.write("{}\n".format(a["href"]))
                                logging.info(
                                    "rejected by remote server longer than {} minutes, "
                                    "and the failed url has been written in path {}"
                                    .format(
                                        config.NBD_MAX_REJECTED_AMOUNTS,
                                        config.
                                        RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                    ))
                                break
                            logging.info(
                                "rejected by remote server, request {} again after "
                                "{} seconds...".format(
                                    a["href"],
                                    60 * self.terminated_amount))
                            time.sleep(60 * self.terminated_amount)
                            result = self.get_url_info(a["href"])
                        if not result:
                            # Crawl failed permanently.
                            logging.info("[FAILED] {} {}".format(
                                a.string, a["href"]))
                        else:
                            # Got a result, but the article may still be empty.
                            date, article = result
                            # Lower the article-probability threshold step by
                            # step until some article text is extracted.
                            while article == "" and self.is_article_prob >= .1:
                                self.is_article_prob -= .1
                                result = self.get_url_info(a["href"])
                                while not result:
                                    self.terminated_amount += 1
                                    if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                        # Persist URLs that could never be crawled.
                                        with open(
                                                config.
                                                RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                                "a+") as file:
                                            file.write("{}\n".format(
                                                a["href"]))
                                        logging.info(
                                            "rejected by remote server longer than {} minutes, "
                                            "and the failed url has been written in path {}"
                                            .format(
                                                config.
                                                NBD_MAX_REJECTED_AMOUNTS,
                                                config.
                                                RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                            ))
                                        break
                                    logging.info(
                                        "rejected by remote server, request {} again after "
                                        "{} seconds...".format(
                                            a["href"],
                                            60 * self.terminated_amount))
                                    time.sleep(60 * self.terminated_amount)
                                    result = self.get_url_info(a["href"])
                                date, article = result
                            self.is_article_prob = .5  # restore default threshold
                            if article != "":
                                related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                                    article, name_code_dict)
                                data = {
                                    "Date": date,
                                    # "PageId": page_url.split("/")[-1],
                                    "Url": a["href"],
                                    "Title": a.string,
                                    "Article": article,
                                    "RelatedStockCodes":
                                    " ".join(related_stock_codes_list)
                                }
                                # self.col.insert_one(data)
                                self.db_obj.insert_data(
                                    self.db_name, self.col_name, data)
                                logging.info("[SUCCESS] {} {} {}".format(
                                    date, a.string, a["href"]))
    else:
        # Incremental mode: walk forward from page 1 and stop at the first
        # article dated on/before the newest stored date.
        is_stop = False
        start_date = max(date_list)
        page_start_id = 1
        while not is_stop:
            page_url = "{}/{}".format(
                config.WEBSITES_LIST_TO_BE_CRAWLED_NBD, page_start_id)
            bs = utils.html_parser(page_url)
            a_list = bs.find_all("a")
            for a in a_list:
                if "click-statistic" in a.attrs and a.string \
                        and a["click-statistic"].find("Article_") != -1 \
                        and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                    result = self.get_url_info(a["href"])
                    while not result:
                        self.terminated_amount += 1
                        if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                            # Persist URLs that could never be crawled.
                            with open(
                                    config.
                                    RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                    "a+") as file:
                                file.write("{}\n".format(a["href"]))
                            logging.info(
                                "rejected by remote server longer than {} minutes, "
                                "and the failed url has been written in path {}"
                                .format(
                                    config.NBD_MAX_REJECTED_AMOUNTS,
                                    config.
                                    RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                            break
                        logging.info(
                            "rejected by remote server, request {} again after "
                            "{} seconds...".format(
                                a["href"], 60 * self.terminated_amount))
                        time.sleep(60 * self.terminated_amount)
                        result = self.get_url_info(a["href"])
                    if not result:
                        # Crawl failed permanently.
                        logging.info("[FAILED] {} {}".format(
                            a.string, a["href"]))
                    else:
                        # Got a result, but the article may still be empty.
                        date, article = result
                        if date > start_date:
                            while article == "" and self.is_article_prob >= .1:
                                self.is_article_prob -= .1
                                result = self.get_url_info(a["href"])
                                while not result:
                                    self.terminated_amount += 1
                                    if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                        # Persist URLs that could never be crawled.
                                        with open(
                                                config.
                                                RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                                "a+") as file:
                                            file.write("{}\n".format(
                                                a["href"]))
                                        logging.info(
                                            "rejected by remote server longer than {} minutes, "
                                            "and the failed url has been written in path {}"
                                            .format(
                                                config.
                                                NBD_MAX_REJECTED_AMOUNTS,
                                                config.
                                                RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                            ))
                                        break
                                    logging.info(
                                        "rejected by remote server, request {} again after "
                                        "{} seconds...".format(
                                            a["href"],
                                            60 * self.terminated_amount))
                                    time.sleep(60 * self.terminated_amount)
                                    result = self.get_url_info(a["href"])
                                date, article = result
                            self.is_article_prob = .5
                            if article != "":
                                related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                                    article, name_code_dict)
                                data = {
                                    "Date": date,
                                    "Url": a["href"],
                                    "Title": a.string,
                                    "Article": article,
                                    "RelatedStockCodes":
                                    " ".join(related_stock_codes_list)
                                }
                                self.db_obj.insert_data(
                                    self.db_name, self.col_name, data)
                                logging.info("[SUCCESS] {} {} {}".format(
                                    date, a.string, a["href"]))
                        else:
                            # Reached already-stored news: stop crawling.
                            is_stop = True
                            break
            if not is_stop:
                page_start_id += 1
def get_realtime_news(self, interval=60):
    """Poll the first NBD listing page forever; store articles dated after
    the newest date already in the database.

    :param interval: seconds to sleep between polls of the listing page.
    """
    page_url = "{}/1".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD)
    logging.info(
        "start real-time crawling of URL -> {}, request every {} secs ... "
        .format(page_url, interval))
    name_code_df = self.db_obj.get_data(
        config.STOCK_DATABASE_NAME,
        config.COLLECTION_NAME_STOCK_BASIC_INFO,
        keys=["name", "code"])
    name_code_dict = dict(name_code_df.values)
    crawled_urls = []
    date_list = self.db_obj.get_data(self.db_name,
                                     self.col_name,
                                     keys=["Date"])["Date"].to_list()
    latest_date = max(date_list)
    while True:
        # Poll the listing page every `interval` seconds.
        if len(crawled_urls) > 100:
            # Cap the in-memory dedup list at ~100 entries to bound memory.
            crawled_urls.pop(0)
        bs = utils.html_parser(page_url)
        a_list = bs.find_all("a")
        for a in a_list:
            # Keep only genuine article links.
            if "click-statistic" in a.attrs and a.string \
                    and a["click-statistic"].find("Article_") != -1 \
                    and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                if a["href"] not in crawled_urls:
                    result = self.get_url_info(a["href"])
                    while not result:
                        # Linear back-off; give up after
                        # NBD_MAX_REJECTED_AMOUNTS attempts.
                        self.terminated_amount += 1
                        if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                            # Persist URLs that could never be crawled.
                            with open(
                                    config.
                                    RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                    "a+") as file:
                                file.write("{}\n".format(a["href"]))
                            logging.info(
                                "rejected by remote server longer than {} minutes, "
                                "and the failed url has been written in path {}"
                                .format(
                                    config.NBD_MAX_REJECTED_AMOUNTS,
                                    config.
                                    RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                            break
                        logging.info(
                            "rejected by remote server, request {} again after "
                            "{} seconds...".format(
                                a["href"], 60 * self.terminated_amount))
                        time.sleep(60 * self.terminated_amount)
                        result = self.get_url_info(a["href"])
                    if not result:
                        # Crawl failed permanently.
                        logging.info("[FAILED] {} {}".format(
                            a.string, a["href"]))
                    else:
                        # Got a result, but the article may still be empty.
                        date, article = result
                        if date > latest_date:
                            # Lower the article-probability threshold until
                            # some article text is extracted.
                            while article == "" and self.is_article_prob >= .1:
                                self.is_article_prob -= .1
                                result = self.get_url_info(a["href"])
                                while not result:
                                    self.terminated_amount += 1
                                    if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                        # Persist URLs that could never be crawled.
                                        with open(
                                                config.
                                                RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                                "a+") as file:
                                            file.write("{}\n".format(
                                                a["href"]))
                                        logging.info(
                                            "rejected by remote server longer than {} minutes, "
                                            "and the failed url has been written in path {}"
                                            .format(
                                                config.
                                                NBD_MAX_REJECTED_AMOUNTS,
                                                config.
                                                RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                            ))
                                        break
                                    logging.info(
                                        "rejected by remote server, request {} again after "
                                        "{} seconds...".format(
                                            a["href"],
                                            60 * self.terminated_amount))
                                    time.sleep(60 * self.terminated_amount)
                                    result = self.get_url_info(a["href"])
                                date, article = result
                            self.is_article_prob = .5  # restore default threshold
                            if article != "":
                                related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                                    article, name_code_dict)
                                data = {
                                    "Date": date,
                                    # "PageId": page_url.split("/")[-1],
                                    "Url": a["href"],
                                    "Title": a.string,
                                    "Article": article,
                                    "RelatedStockCodes":
                                    " ".join(related_stock_codes_list)
                                }
                                # self.col.insert_one(data)
                                self.db_obj.insert_data(
                                    self.db_name, self.col_name, data)
                                crawled_urls.append(a["href"])
                                logging.info("[SUCCESS] {} {} {}".format(
                                    date, a.string, a["href"]))
        # logging.info("sleep {} secs then request again ... ".format(interval))
        time.sleep(interval)
def get_realtime_news(self, interval=60):
    """Poll the first NBD listing page forever; store new articles in the
    database and push them to the Redis cache list for downstream consumers.

    :param interval: seconds to sleep between polls of the listing page.
    """
    page_url = "{}/1".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD)
    logging.info(
        "start real-time crawling of URL -> {}, request every {} secs ... "
        .format(page_url, interval))
    name_code_df = self.db_obj.get_data(
        config.STOCK_DATABASE_NAME,
        config.COLLECTION_NAME_STOCK_BASIC_INFO,
        keys=["name", "code"])
    name_code_dict = dict(name_code_df.values)
    crawled_urls = []
    date_list = self.db_obj.get_data(self.db_name,
                                     self.col_name,
                                     keys=["Date"])["Date"].to_list()
    latest_date = max(date_list)
    while True:
        # Poll the listing page every `interval` seconds.
        if len(crawled_urls) > 100:
            # Cap the in-memory dedup list at ~100 entries to bound memory.
            crawled_urls.pop(0)
        bs = utils.html_parser(page_url)
        a_list = bs.find_all("a")
        for a in a_list:
            # Keep only genuine article links.
            if "click-statistic" in a.attrs and a.string \
                    and a["click-statistic"].find("Article_") != -1 \
                    and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                if a["href"] not in crawled_urls:
                    result = self.get_url_info(a["href"])
                    while not result:
                        # Linear back-off; give up after
                        # NBD_MAX_REJECTED_AMOUNTS attempts.
                        self.terminated_amount += 1
                        if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                            # Persist URLs that could never be crawled.
                            with open(
                                    config.
                                    RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                    "a+") as file:
                                file.write("{}\n".format(a["href"]))
                            logging.info(
                                "rejected by remote server longer than {} minutes, "
                                "and the failed url has been written in path {}"
                                .format(
                                    config.NBD_MAX_REJECTED_AMOUNTS,
                                    config.
                                    RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                            break
                        logging.info(
                            "rejected by remote server, request {} again after "
                            "{} seconds...".format(
                                a["href"], 60 * self.terminated_amount))
                        time.sleep(60 * self.terminated_amount)
                        result = self.get_url_info(a["href"])
                    if not result:
                        # Crawl failed permanently.
                        logging.info("[FAILED] {} {}".format(
                            a.string, a["href"]))
                    else:
                        # Got a result, but the article may still be empty.
                        date, article = result
                        if date > latest_date:
                            # Lower the article-probability threshold until
                            # some article text is extracted.
                            while article == "" and self.is_article_prob >= .1:
                                self.is_article_prob -= .1
                                result = self.get_url_info(a["href"])
                                while not result:
                                    self.terminated_amount += 1
                                    if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                        # Persist URLs that could never be crawled.
                                        with open(
                                                config.
                                                RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                                "a+") as file:
                                            file.write("{}\n".format(
                                                a["href"]))
                                        logging.info(
                                            "rejected by remote server longer than {} minutes, "
                                            "and the failed url has been written in path {}"
                                            .format(
                                                config.
                                                NBD_MAX_REJECTED_AMOUNTS,
                                                config.
                                                RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                            ))
                                        break
                                    logging.info(
                                        "rejected by remote server, request {} again after "
                                        "{} seconds...".format(
                                            a["href"],
                                            60 * self.terminated_amount))
                                    time.sleep(60 * self.terminated_amount)
                                    result = self.get_url_info(a["href"])
                                date, article = result
                            self.is_article_prob = .5  # restore default threshold
                            if article != "":
                                related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                                    article, name_code_dict)
                                self.db_obj.insert_data(
                                    self.db_name, self.col_name, {
                                        "Date": date,
                                        # "PageId": page_url.split("/")[-1],
                                        "Url": a["href"],
                                        "Title": a.string,
                                        "Article": article,
                                        "RelatedStockCodes":
                                        " ".join(related_stock_codes_list)
                                    })
                                # Push to the Redis cache so downstream
                                # consumers see the news immediately.
                                self.redis_client.lpush(
                                    config.CACHE_NEWS_LIST_NAME,
                                    json.dumps({
                                        "Date": date,
                                        # "PageId": page_url.split("/")[-1],
                                        "Url": a["href"],
                                        "Title": a.string,
                                        "Article": article,
                                        "RelatedStockCodes":
                                        " ".join(related_stock_codes_list),
                                        "OriDB": config.DATABASE_NAME,
                                        "OriCOL": config.COLLECTION_NAME_NBD
                                    }))
                                crawled_urls.append(a["href"])
                                logging.info("[SUCCESS] {} {} {}".format(
                                    date, a.string, a["href"]))
        # logging.info("sleep {} secs then request again ... ".format(interval))
        time.sleep(interval)


# """
# Example-1:
# Crawl historical news data
# """
# if __name__ == "__main__":
#     nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
#     nbd_spyder.get_historical_news(start_page=684)
#
#     Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
#     DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()

# """
# Example-2:
# Crawl real-time news data
# """
# if __name__ == '__main__':
#     from Kite import config
#
#     from Killua.denull import DeNull
#     from Killua.deduplication import Deduplication
#
#     from Gon.nbdspyder import NbdSpyder
#
#     # If there is no historical data, crawl from scratch; otherwise resume
#     # from the latest crawled timestamp, e.g. if the newest stored news
#     # time is "2020-12-09 20:37:10", crawling continues from that time.
#     nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
#     nbd_spyder.get_historical_news()
#
#     Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
#     DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
#
#     nbd_spyder.get_realtime_news()
def get_historical_news(self, url, start_date, end_date):
    """Crawl JRJ news listing pages for every date in [start_date, end_date].

    For each date, walks every paginated listing page, follows article
    links for that date, and stores extracted articles in the database.

    :param url: base listing URL of the JRJ news section.
    :param start_date: first date to crawl, "YYYY-MM-DD".
    :param end_date: last date to crawl, "YYYY-MM-DD".
    """
    # # Extract already-crawled news between start_date and latest_date_str
    # # from the database to avoid duplicates. E.g. if crawling previously
    # # stopped at 2016-10-10 15:00:00, re-running from 2015-01-01 without
    # # adjusting parameters would insert lots of duplicates, hence this
    # # light dedup. Resuming straight from the newest timestamp is always
    # # safe, but re-running from 2015-01-01 also retries URLs that failed
    # # the first time.
    # extracted_data_list = self.extract_data(["Date"])[0]
    # if len(extracted_data_list) != 0:
    #     latest_date_str = max(extracted_data_list).split(" ")[0]
    # else:
    #     latest_date_str = start_date
    # logging.info("latest time in database is {} ... ".format(latest_date_str))
    # crawled_urls_list = list()
    # for _date in utils.get_date_list_from_range(start_date, latest_date_str):
    #     query_results = self.query_news("Date", _date)
    #     for qr in query_results:
    #         crawled_urls_list.append(qr["Url"])
    # # crawled_urls_list = self.extract_data(["Url"])[0]  # abandoned
    # logging.info("the length of crawled data from {} to {} is {} ... ".format(start_date,
    #                                                                           latest_date_str,
    #                                                                           len(crawled_urls_list)))
    crawled_urls_list = []
    dates_list = utils.get_date_list_from_range(start_date, end_date)
    dates_separated_into_ranges_list = utils.gen_dates_list(
        dates_list, config.JRJ_DATE_RANGE)
    for dates_range in dates_separated_into_ranges_list:
        for date in dates_range:
            # Listing URLs look like <url>/YYYYMM/YYYYMMDD_<page>.shtml.
            first_url = "{}/{}/{}_1.shtml".format(
                url, date.replace("-", "")[0:6], date.replace("-", ""))
            max_pages_num = utils.search_max_pages_num(first_url, date)
            for num in range(1, max_pages_num + 1):
                _url = "{}/{}/{}_{}.shtml".format(
                    url, date.replace("-", "")[0:6], date.replace("-", ""),
                    str(num))
                bs = utils.html_parser(_url)
                a_list = bs.find_all("a")
                for a in a_list:
                    # Only links that embed this date's /YYYY/MM/ path.
                    if "href" in a.attrs and a.string and \
                            a["href"].find("/{}/{}/".format(date.replace("-", "")[:4],
                                                            date.replace("-", "")[4:6])) != -1:
                        if a["href"] not in crawled_urls_list:
                            # Skip titles containing market-close boilerplate
                            # phrases — such news is mostly machine generated.
                            if a.string.find("收盘") == -1 and a.string.find("报于") == -1 and \
                                    a.string.find("新三板挂牌上市") == -1:
                                result = self.get_url_info(a["href"], date)
                                while not result:
                                    # Linear back-off; give up after
                                    # JRJ_MAX_REJECTED_AMOUNTS attempts.
                                    self.terminated_amount += 1
                                    if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                        # Persist URLs that could never be crawled.
                                        with open(
                                                config.
                                                RECORD_JRJ_FAILED_URL_TXT_FILE_PATH,
                                                "a+") as file:
                                            file.write("{}\n".format(
                                                a["href"]))
                                        logging.info(
                                            "rejected by remote server longer than {} minutes, "
                                            "and the failed url has been written in path {}"
                                            .format(
                                                config.
                                                JRJ_MAX_REJECTED_AMOUNTS,
                                                config.
                                                RECORD_JRJ_FAILED_URL_TXT_FILE_PATH
                                            ))
                                        break
                                    logging.info(
                                        "rejected by remote server, request {} again after "
                                        "{} seconds...".format(
                                            a["href"],
                                            60 * self.terminated_amount))
                                    time.sleep(60 * self.terminated_amount)
                                    result = self.get_url_info(
                                        a["href"], date)
                                if not result:
                                    # Crawl failed permanently.
                                    logging.info("[FAILED] {} {}".format(
                                        a.string, a["href"]))
                                else:
                                    # Got a result, but the article may
                                    # still be empty.
                                    article_specific_date, article = result
                                    # Lower the article-probability threshold
                                    # until some article text is extracted.
                                    while article == "" and self.is_article_prob >= .1:
                                        self.is_article_prob -= .1
                                        result = self.get_url_info(
                                            a["href"], date)
                                        while not result:
                                            self.terminated_amount += 1
                                            if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                                # Persist URLs that could never be crawled.
                                                with open(
                                                        config.
                                                        RECORD_JRJ_FAILED_URL_TXT_FILE_PATH,
                                                        "a+") as file:
                                                    file.write(
                                                        "{}\n".format(
                                                            a["href"]))
                                                logging.info(
                                                    "rejected by remote server longer than {} minutes, "
                                                    "and the failed url has been written in path {}"
                                                    .format(
                                                        config.
                                                        JRJ_MAX_REJECTED_AMOUNTS,
                                                        config.
                                                        RECORD_JRJ_FAILED_URL_TXT_FILE_PATH
                                                    ))
                                                break
                                            logging.info(
                                                "rejected by remote server, request {} again after "
                                                "{} seconds...".format(
                                                    a["href"],
                                                    60 * self.terminated_amount)
                                            )
                                            time.sleep(
                                                60 * self.terminated_amount)
                                            result = self.get_url_info(
                                                a["href"], date)
                                        article_specific_date, article = result
                                    self.is_article_prob = .5  # restore default threshold
                                    if article != "":
                                        data = {
                                            "Date": article_specific_date,
                                            "Url": a["href"],
                                            "Title": a.string,
                                            "Article": article
                                        }
                                        # self.col.insert_one(data)
                                        self.db_obj.insert_data(
                                            self.db_name, self.col_name,
                                            data)
                                        logging.info(
                                            "[SUCCESS] {} {} {}".format(
                                                article_specific_date,
                                                a.string, a["href"]))
                                # Reset the retry counter after this URL is
                                # done. NOTE(review): placement reconstructed
                                # from collapsed source — confirm indent level.
                                self.terminated_amount = 0
                            else:
                                logging.info("[QUIT] {}".format(a.string))
def get_historical_news(self, start_page):
    """Backfill NBD news from listing page ``start_page`` down to page 1,
    skipping URLs whose PageId range was already crawled.

    :param start_page: highest listing-page id to crawl (counts down to 1).
    """
    extracted_data_list = self.extract_data(["PageId"])[0]
    if len(extracted_data_list) != 0:
        latest_page_id = min(extracted_data_list)
    else:
        latest_page_id = start_page
    crawled_urls_list = list()
    # Collect URLs already stored for pages [latest_page_id, start_page]
    # so they are not crawled twice.
    for page_id in range(start_page, int(latest_page_id) - 1, -1):
        query_results = self.query_news("PageId", page_id)
        for qr in query_results:
            crawled_urls_list.append(qr["Url"])
    # crawled_urls_list = self.extract_data(["Url"])[0]  # abandoned
    logging.info(
        "the length of crawled data from page {} to page {} is {} ... ".
        format(start_page, latest_page_id, len(crawled_urls_list)))
    page_urls = [
        "{}/{}".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD, page_id)
        for page_id in range(start_page, 0, -1)
    ]
    for page_url in page_urls:
        bs = utils.html_parser(page_url)
        a_list = bs.find_all("a")
        for a in a_list:
            # Keep only genuine article links.
            if "click-statistic" in a.attrs and a.string \
                    and a["click-statistic"].find("Article_") != -1 \
                    and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                if a["href"] not in crawled_urls_list:
                    result = self.get_url_info(a["href"])
                    while not result:
                        # Linear back-off; give up after
                        # NBD_MAX_REJECTED_AMOUNTS attempts.
                        self.terminated_amount += 1
                        if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                            # Persist URLs that could never be crawled.
                            with open(
                                    config.
                                    RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                    "a+") as file:
                                file.write("{}\n".format(a["href"]))
                            logging.info(
                                "rejected by remote server longer than {} minutes, "
                                "and the failed url has been written in path {}"
                                .format(
                                    config.NBD_MAX_REJECTED_AMOUNTS,
                                    config.
                                    RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                            break
                        logging.info(
                            "rejected by remote server, request {} again after "
                            "{} seconds...".format(
                                a["href"], 60 * self.terminated_amount))
                        time.sleep(60 * self.terminated_amount)
                        result = self.get_url_info(a["href"])
                    if not result:
                        # Crawl failed permanently.
                        logging.info("[FAILED] {} {}".format(
                            a.string, a["href"]))
                    else:
                        # Got a result, but the article may still be empty.
                        date, article = result
                        # Lower the article-probability threshold until
                        # some article text is extracted.
                        while article == "" and self.is_article_prob >= .1:
                            self.is_article_prob -= .1
                            result = self.get_url_info(a["href"])
                            while not result:
                                self.terminated_amount += 1
                                if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                    # Persist URLs that could never be crawled.
                                    with open(
                                            config.
                                            RECORD_NBD_FAILED_URL_TXT_FILE_PATH,
                                            "a+") as file:
                                        file.write("{}\n".format(
                                            a["href"]))
                                    logging.info(
                                        "rejected by remote server longer than {} minutes, "
                                        "and the failed url has been written in path {}"
                                        .format(
                                            config.
                                            NBD_MAX_REJECTED_AMOUNTS,
                                            config.
                                            RECORD_NBD_FAILED_URL_TXT_FILE_PATH
                                        ))
                                    break
                                logging.info(
                                    "rejected by remote server, request {} again after "
                                    "{} seconds...".format(
                                        a["href"],
                                        60 * self.terminated_amount))
                                time.sleep(60 * self.terminated_amount)
                                result = self.get_url_info(a["href"])
                            date, article = result
                        self.is_article_prob = .5  # restore default threshold
                        if article != "":
                            data = {
                                "Date": date,
                                "PageId": page_url.split("/")[-1],
                                "Url": a["href"],
                                "Title": a.string,
                                "Article": article
                            }
                            self.col.insert_one(data)
                            logging.info("[SUCCESS] {} {} {}".format(
                                date, a.string, a["href"]))
def get_realtime_news(self, url, category_chn=None, interval=60):
    """Poll one cnstock column page forever; store new articles in the
    database and push them to the Redis cache list.

    :param url: column listing page to poll.
    :param category_chn: Chinese category label stored with each article
        (must not be None).
    :param interval: seconds to sleep between polls.
    """
    logging.info(
        "start real-time crawling of URL -> {}, request every {} secs ... "
        .format(url, interval))
    assert category_chn is not None
    # TODO: cnstock volume is small, so for now dedup against the whole
    # history; the dedup strategy will be refined later.
    name_code_df = self.db_obj.get_data(
        config.STOCK_DATABASE_NAME,
        config.COLLECTION_NAME_STOCK_BASIC_INFO,
        keys=["name", "code"])
    name_code_dict = dict(name_code_df.values)
    crawled_urls = self.db_obj.get_data(self.db_name,
                                        self.col_name,
                                        keys=["Url"])["Url"].to_list()
    while True:
        # Poll the column page every `interval` seconds.
        bs = utils.html_parser(url)
        for li in bs.find_all("li", attrs={"class": ["newslist"]}):
            a = li.find_all("h2")[0].find("a")
            if a["href"] not in crawled_urls:  # latest_3_days_crawled_href
                result = self.get_url_info(a["href"])
                while not result:
                    # Linear back-off; give up after
                    # CNSTOCK_MAX_REJECTED_AMOUNTS attempts.
                    self.terminated_amount += 1
                    if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                        # Persist URLs that could never be crawled.
                        with open(
                                config.
                                RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH,
                                "a+") as file:
                            file.write("{}\n".format(a["href"]))
                        logging.info(
                            "rejected by remote server longer than {} minutes, "
                            "and the failed url has been written in path {}"
                            .format(
                                config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                config.
                                RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                        break
                    logging.info(
                        "rejected by remote server, request {} again after "
                        "{} seconds...".format(a["href"],
                                               60 * self.terminated_amount))
                    time.sleep(60 * self.terminated_amount)
                    result = self.get_url_info(a["href"])
                if not result:
                    # Crawl failed permanently.
                    logging.info("[FAILED] {} {}".format(
                        a["title"], a["href"]))
                else:
                    # Got a result, but the article may still be empty.
                    date, article = result
                    # Lower the article-probability threshold until some
                    # article text is extracted.
                    while article == "" and self.is_article_prob >= .1:
                        self.is_article_prob -= .1
                        result = self.get_url_info(a["href"])
                        while not result:
                            self.terminated_amount += 1
                            if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                                # Persist URLs that could never be crawled.
                                with open(
                                        config.
                                        RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH,
                                        "a+") as file:
                                    file.write("{}\n".format(a["href"]))
                                logging.info(
                                    "rejected by remote server longer than {} minutes, "
                                    "and the failed url has been written in path {}"
                                    .format(
                                        config.
                                        CNSTOCK_MAX_REJECTED_AMOUNTS,
                                        config.
                                        RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH
                                    ))
                                break
                            logging.info(
                                "rejected by remote server, request {} again after "
                                "{} seconds...".format(
                                    a["href"],
                                    60 * self.terminated_amount))
                            time.sleep(60 * self.terminated_amount)
                            result = self.get_url_info(a["href"])
                        date, article = result
                    self.is_article_prob = .5  # restore default threshold
                    if article != "":
                        related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(
                            article, name_code_dict)
                        self.db_obj.insert_data(
                            self.db_name, self.col_name, {
                                "Date": date,
                                "Category": category_chn,
                                "Url": a["href"],
                                "Title": a["title"],
                                "Article": article,
                                "RelatedStockCodes":
                                " ".join(related_stock_codes_list)
                            })
                        # Push to the Redis cache so downstream consumers
                        # see the news immediately.
                        self.redis_client.lpush(
                            config.CACHE_NEWS_LIST_NAME,
                            json.dumps({
                                "Date": date,
                                "Category": category_chn,
                                "Url": a["href"],
                                "Title": a["title"],
                                "Article": article,
                                "RelatedStockCodes":
                                " ".join(related_stock_codes_list),
                                "OriDB": config.DATABASE_NAME,
                                "OriCOL": config.COLLECTION_NAME_CNSTOCK
                            }))
                        logging.info("[SUCCESS] {} {} {}".format(
                            date, a["title"], a["href"]))
                        crawled_urls.append(a["href"])
        # logging.info("sleep {} secs then request {} again ... ".format(interval, url))
        time.sleep(interval)
def get_historical_news(self, url, start_date=None, end_date=None):
    """Crawl JRJ news for every date in [start_date, end_date] and store
    extracted articles (tagged with related stock codes) in the database.

    :param url: base listing URL of the JRJ news section.
    :param start_date: first date to crawl, "YYYY-MM-DD"; when None, resume
        from the day after the latest date already in the database (or from
        config.JRJ_REQUEST_DEFAULT_DATE if the database is empty).
    :param end_date: last date to crawl, "YYYY-MM-DD"; defaults to today.
    """
    name_code_df = self.db_obj.get_data(config.STOCK_DATABASE_NAME,
                                        config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                        keys=["name", "code"])
    name_code_dict = dict(name_code_df.values)
    crawled_urls_list = []
    if end_date is None:
        end_date = datetime.datetime.now().strftime("%Y-%m-%d")
    if start_date is None:
        # If start_date is None, resume from the day after the latest date
        # already stored in the database.
        # e.g. history_latest_date_str -> "2020-12-08"
        #      history_latest_date_dt -> datetime.date(2020, 12, 08)
        #      start_date -> "2020-12-09"
        history_latest_date_list = self.db_obj.get_data(self.db_name,
                                                        self.col_name,
                                                        keys=["Date"])["Date"].to_list()
        if len(history_latest_date_list) != 0:
            history_latest_date_str = max(history_latest_date_list).split(" ")[0]
            history_latest_date_dt = datetime.datetime.strptime(history_latest_date_str,
                                                                "%Y-%m-%d").date()
            offset = datetime.timedelta(days=1)
            start_date = (history_latest_date_dt + offset).strftime('%Y-%m-%d')
        else:
            start_date = config.JRJ_REQUEST_DEFAULT_DATE
    dates_list = utils.get_date_list_from_range(start_date, end_date)
    dates_separated_into_ranges_list = utils.gen_dates_list(dates_list,
                                                            config.JRJ_DATE_RANGE)
    for dates_range in dates_separated_into_ranges_list:
        for date in dates_range:
            # Listing URLs look like <url>/YYYYMM/YYYYMMDD_<page>.shtml.
            first_url = "{}/{}/{}_1.shtml".format(url,
                                                  date.replace("-", "")[0:6],
                                                  date.replace("-", ""))
            max_pages_num = utils.search_max_pages_num(first_url, date)
            for num in range(1, max_pages_num + 1):
                _url = "{}/{}/{}_{}.shtml".format(url,
                                                  date.replace("-", "")[0:6],
                                                  date.replace("-", ""),
                                                  str(num))
                bs = utils.html_parser(_url)
                a_list = bs.find_all("a")
                for a in a_list:
                    # Only links that embed this date's /YYYY/MM/ path.
                    if "href" in a.attrs and a.string and \
                            a["href"].find("/{}/{}/".format(date.replace("-", "")[:4],
                                                            date.replace("-", "")[4:6])) != -1:
                        if a["href"] not in crawled_urls_list:
                            # Skip titles containing market-close boilerplate
                            # phrases — such news is mostly machine generated.
                            if a.string.find("收盘") == -1 and a.string.find("报于") == -1 and \
                                    a.string.find("新三板挂牌上市") == -1:
                                result = self.get_url_info(a["href"], date)
                                while not result:
                                    # Linear back-off; give up after
                                    # JRJ_MAX_REJECTED_AMOUNTS attempts.
                                    self.terminated_amount += 1
                                    if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                        # Persist URLs that could never be crawled.
                                        with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                            file.write("{}\n".format(a["href"]))
                                        logging.info("rejected by remote server longer than {} minutes, "
                                                     "and the failed url has been written in path {}"
                                                     .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                             config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                        break
                                    logging.info("rejected by remote server, request {} again after "
                                                 "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                    time.sleep(60 * self.terminated_amount)
                                    result = self.get_url_info(a["href"], date)
                                if not result:
                                    # Crawl failed permanently.
                                    logging.info("[FAILED] {} {}".format(a.string, a["href"]))
                                else:
                                    # Got a result, but the article may still be empty.
                                    article_specific_date, article = result
                                    # Lower the article-probability threshold
                                    # until some article text is extracted.
                                    while article == "" and self.is_article_prob >= .1:
                                        self.is_article_prob -= .1
                                        result = self.get_url_info(a["href"], date)
                                        while not result:
                                            self.terminated_amount += 1
                                            if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                                # Persist URLs that could never be crawled.
                                                with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                    file.write("{}\n".format(a["href"]))
                                                logging.info("rejected by remote server longer than {} minutes, "
                                                             "and the failed url has been written in path {}"
                                                             .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                                     config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                                break
                                            logging.info("rejected by remote server, request {} again after "
                                                         "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                            time.sleep(60 * self.terminated_amount)
                                            result = self.get_url_info(a["href"], date)
                                        article_specific_date, article = result
                                    self.is_article_prob = .5  # restore default threshold
                                    if article != "":
                                        related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                                          name_code_dict)
                                        data = {"Date": article_specific_date,
                                                "Url": a["href"],
                                                "Title": a.string,
                                                "Article": article,
                                                "RelatedStockCodes": " ".join(related_stock_codes_list)}
                                        # self.col.insert_one(data)
                                        self.db_obj.insert_data(self.db_name, self.col_name, data)
                                        logging.info("[SUCCESS] {} {} {}".format(article_specific_date,
                                                                                 a.string,
                                                                                 a["href"]))
                                # Reset the retry counter after this URL is done.
                                self.terminated_amount = 0
                            else:
                                logging.info("[QUIT] {}".format(a.string))
def get_realtime_news(self, interval=60):
    """Poll today's JRJ listing pages forever and store newly published
    articles; the per-day dedup list is reset when the date rolls over.

    :param interval: seconds to sleep between polling rounds.
    """
    name_code_df = self.db_obj.get_data(config.STOCK_DATABASE_NAME,
                                        config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                        keys=["name", "code"])
    name_code_dict = dict(name_code_df.values)
    crawled_urls_list = []
    is_change_date = False
    last_date = datetime.datetime.now().strftime("%Y-%m-%d")
    while True:
        today_date = datetime.datetime.now().strftime("%Y-%m-%d")
        if today_date != last_date:
            is_change_date = True
            last_date = today_date
        if is_change_date:
            # A new day started — the dedup list only covers one day's URLs.
            crawled_urls_list = []
            is_change_date = False
        _url = "{}/{}/{}_1.shtml".format(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ,
                                         today_date.replace("-", "")[0:6],
                                         today_date.replace("-", ""))
        max_pages_num = utils.search_max_pages_num(_url, today_date)
        for num in range(1, max_pages_num + 1):
            _url = "{}/{}/{}_{}.shtml".format(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ,
                                              today_date.replace("-", "")[0:6],
                                              today_date.replace("-", ""),
                                              str(num))
            bs = utils.html_parser(_url)
            a_list = bs.find_all("a")
            for a in a_list:
                # Only links that embed today's /YYYY/MM/ path.
                if "href" in a.attrs and a.string and \
                        a["href"].find("/{}/{}/".format(today_date.replace("-", "")[:4],
                                                        today_date.replace("-", "")[4:6])) != -1:
                    if a["href"] not in crawled_urls_list:
                        # Skip titles containing market-close boilerplate
                        # phrases — such news is mostly machine generated.
                        if a.string.find("收盘") == -1 and a.string.find("报于") == -1 and \
                                a.string.find("新三板挂牌上市") == -1:
                            result = self.get_url_info(a["href"], today_date)
                            while not result:
                                # Linear back-off; give up after
                                # JRJ_MAX_REJECTED_AMOUNTS attempts.
                                self.terminated_amount += 1
                                if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                    # Persist URLs that could never be crawled.
                                    with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                        file.write("{}\n".format(a["href"]))
                                    logging.info("rejected by remote server longer than {} minutes, "
                                                 "and the failed url has been written in path {}"
                                                 .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                         config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                    break
                                logging.info("rejected by remote server, request {} again after "
                                             "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                time.sleep(60 * self.terminated_amount)
                                result = self.get_url_info(a["href"], today_date)
                            if not result:
                                # Crawl failed permanently.
                                logging.info("[FAILED] {} {}".format(a.string, a["href"]))
                            else:
                                # Got a result, but the article may still be empty.
                                article_specific_date, article = result
                                # Lower the article-probability threshold
                                # until some article text is extracted.
                                while article == "" and self.is_article_prob >= .1:
                                    self.is_article_prob -= .1
                                    result = self.get_url_info(a["href"], today_date)
                                    while not result:
                                        self.terminated_amount += 1
                                        if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                            # Persist URLs that could never be crawled.
                                            with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                file.write("{}\n".format(a["href"]))
                                            logging.info("rejected by remote server longer than {} minutes, "
                                                         "and the failed url has been written in path {}"
                                                         .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                                 config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                            break
                                        logging.info("rejected by remote server, request {} again after "
                                                     "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                        time.sleep(60 * self.terminated_amount)
                                        result = self.get_url_info(a["href"], today_date)
                                    article_specific_date, article = result
                                self.is_article_prob = .5  # restore default threshold
                                if article != "":
                                    related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                                      name_code_dict)
                                    data = {"Date": article_specific_date,
                                            "Url": a["href"],
                                            "Title": a.string,
                                            "Article": article,
                                            "RelatedStockCodes": " ".join(related_stock_codes_list)}
                                    # self.col.insert_one(data)
                                    self.db_obj.insert_data(self.db_name, self.col_name, data)
                                    logging.info("[SUCCESS] {} {} {}".format(article_specific_date,
                                                                             a.string,
                                                                             a["href"]))
                            # Reset the retry counter after this URL is done.
                            # NOTE(review): placement reconstructed from
                            # collapsed source — confirm indent level.
                            self.terminated_amount = 0
                        else:
                            logging.info("[QUIT] {}".format(a.string))
                        # Remember the URL whether crawled or skipped, so it
                        # is not revisited this day.
                        crawled_urls_list.append(a["href"])
        # logging.info("sleep {} secs then request again ... ".format(interval))
        time.sleep(interval)
def get_realtime_news(self, url, category_chn=None, interval=60):
    """Poll one cnstock column page forever and store articles not yet in
    the database.

    :param url: column listing page to poll.
    :param category_chn: Chinese category label stored with each article
        (must not be None).
    :param interval: seconds to sleep between polls.
    """
    logging.info("start real-time crawling of URL -> {} ... ".format(url))
    assert category_chn is not None
    # today_date = time.strftime("%Y-%m-%d", time.localtime(time.time()))
    # last_date = utils.get_date_before(1)
    # last_2_date = utils.get_date_before(2)
    # latest_3_days_crawled_href = self.db_obj.get_data(self.db_name,
    #                                                   self.col_name,
    #                                                   query={"Date": {"$regex": today_date}},
    #                                                   keys=["Url"])["Url"].to_list()
    # latest_3_days_crawled_href.extend(self.db_obj.get_data(self.db_name,
    #                                                        self.col_name,
    #                                                        query={"Date": {"$regex": last_date}},
    #                                                        keys=["Url"])["Url"].to_list())
    # latest_3_days_crawled_href.extend(self.db_obj.get_data(self.db_name,
    #                                                        self.col_name,
    #                                                        query={"Date": {"$regex": last_2_date}},
    #                                                        keys=["Url"])["Url"].to_list())
    crawled_urls = self.db_obj.get_data(self.db_name,
                                        self.col_name,
                                        keys=["Url"])["Url"].to_list()
    while True:
        # Poll the column page every `interval` seconds.
        bs = utils.html_parser(url)
        for li in bs.find_all("li", attrs={"class": ["newslist"]}):
            a = li.find_all("h2")[0].find("a")
            if a["href"] not in crawled_urls:  # latest_3_days_crawled_href
                result = self.get_url_info(a["href"])
                while not result:
                    # Linear back-off; give up after
                    # CNSTOCK_MAX_REJECTED_AMOUNTS attempts.
                    self.terminated_amount += 1
                    if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                        # Persist URLs that could never be crawled.
                        with open(config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                            file.write("{}\n".format(a["href"]))
                        logging.info("rejected by remote server longer than {} minutes, "
                                     "and the failed url has been written in path {}"
                                     .format(config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                             config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                        break
                    logging.info("rejected by remote server, request {} again after "
                                 "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                    time.sleep(60 * self.terminated_amount)
                    result = self.get_url_info(a["href"])
                if not result:
                    # Crawl failed permanently.
                    logging.info("[FAILED] {} {}".format(a["title"], a["href"]))
                else:
                    # Got a result, but the article may still be empty.
                    date, article = result
                    # Lower the article-probability threshold until some
                    # article text is extracted.
                    while article == "" and self.is_article_prob >= .1:
                        self.is_article_prob -= .1
                        result = self.get_url_info(a["href"])
                        while not result:
                            self.terminated_amount += 1
                            if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                                # Persist URLs that could never be crawled.
                                with open(config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                    file.write("{}\n".format(a["href"]))
                                logging.info("rejected by remote server longer than {} minutes, "
                                             "and the failed url has been written in path {}"
                                             .format(config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                                     config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                                break
                            logging.info("rejected by remote server, request {} again after "
                                         "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                            time.sleep(60 * self.terminated_amount)
                            result = self.get_url_info(a["href"])
                        date, article = result
                    self.is_article_prob = .5  # restore default threshold
                    if article != "":
                        data = {"Date": date,
                                "Category": category_chn,
                                "Url": a["href"],
                                "Title": a["title"],
                                "Article": article}
                        # self.col.insert_one(data)
                        self.db_obj.insert_data(self.db_name, self.col_name, data)
                        logging.info("[SUCCESS] {} {} {}".format(date, a["title"], a["href"]))
                        crawled_urls.append(a["href"])
        logging.info("sleep {} secs then request {} again ... ".format(interval, url))
        time.sleep(interval)