# Imports used by the crawler functions below. `crawler_tool` is the project's
# local helper module (retried requests, JSON requests, HTML cleaning).
import datetime
import json
import random
import re
import time

import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

import crawler_tool


def chinatime_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    Title = []
    Publish_time = []
    Section = []
    Body = []
    Source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    # print(b_time, "\n", e_time)
    loop_flag = False
    for page in range(1, 11):
        chinatime_home = "https://www.chinatimes.com/money/total?page=" + str(page) + "&chdtv"
        try:
            print("start collecting ChinaTime page..%s" % page)
            r = crawler_tool.url_retry(chinatime_home)
            soup = BeautifulSoup(r, "lxml")
            time.sleep(5)
            for i in range(len(soup.select("h3.title a"))):
                chinatime_financial_url = "https://www.chinatimes.com" + soup.select("h3.title a")[i]["href"]
                try:
                    r2 = crawler_tool.url_retry(chinatime_financial_url)
                    soup2 = BeautifulSoup(r2, "lxml")
                    r_time = datetime.datetime.strptime(
                        soup2.find("meta", attrs={"name": "pubdate"})["content"],
                        "%Y-%m-%dT%H:%M:%S+08:00")
                    if r_time > b_time:
                        continue
                    if r_time < e_time:
                        print("Web Crawler has collected ChinaTime data from {b_time} to {e_time}".format(
                            b_time=b_time, e_time=e_time))
                        loop_flag = True
                        break
                    else:
                        Publish_time.append(r_time)
                        Title.append(re.sub(r"\s{1,}", "", soup2.find("h1").string))
                        Section.append(soup2.find("meta", attrs={"name": "section"})["content"])
                        Source.append(soup2.find("meta", attrs={"name": "source"})["content"])
                        body = soup2.select("div.article-body p")
                        Body.append(crawler_tool.clean_html("".join(str(x) for x in body)))
                        time.sleep(random.uniform(0, 2))
                        print("ChinaTime:", r_time)
                except rq.exceptions.RequestException as e:
                    print("in", e)
        except rq.exceptions.RequestException as e:
            print("home", e)
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": Title,
        "Time": Publish_time,
        "Section": Section,
        "Source": Source,
        "Body": Body
    }).sort_values(by=["Time"])
    file_name = "D:/User/Desktop/corpus/news/chinatime/" + decide_time_begin + "_" + decide_time_end + "_chinatime.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)


def tvbs_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    title = []
    publish_time = []
    body = []
    section = []
    source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    r = crawler_tool.url_retry("https://news.tvbs.com.tw/money")
    soup = BeautifulSoup(r, "lxml")
    for i in range(len(soup.select("div.content_center_contxt_box_news a"))):
        url = "https://news.tvbs.com.tw" + soup.select("div.content_center_contxt_box_news a")[i]["href"]
        r2 = crawler_tool.url_retry(url)
        soup2 = BeautifulSoup(r2, "lxml")
        try:
            # Article metadata (headline, body, section, publish time) comes from the ld+json script tag.
            j = json.loads(soup2.find("script", attrs={"type": "application/ld+json"}).string)
            r_time = datetime.datetime.strptime(j["datePublished"], "%Y/%m/%d %H:%M")
            if r_time > b_time:
                continue
            elif r_time < e_time:
                print("Web Crawler has collected TVBS from {b_time} to {e_time}".format(
                    b_time=b_time, e_time=e_time))
                break
            else:
                title.append(re.sub(r"\s{1,}", "", j["headline"].split("│")[0]))
                publish_time.append(r_time)
                body.append(j["articleBody"])
                section.append(j["articleSection"])
                source.append("TVBS新聞網")
                print("TVBS新聞網:", r_time)
                time.sleep(random.uniform(0.5, 1.5))
        except json.decoder.JSONDecodeError:
            print("json.decoder.JSONDecodeError")
    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/tvbs/" + decide_time_begin + "_" + decide_time_end + "_tvbs.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)


def anue_GET_NEWS_time(decide_time_begin, decide_time_end):
    # The cnyes API expects Unix-timestamp bounds: anchor on 2021-01-27 23:59:59
    # (epoch 1611763199), extend to the end of the current day, and query the
    # 11-day window ending there.
    dt = datetime.datetime.today() - datetime.datetime.fromtimestamp(1611763199)  # offset from 1/27
    dta = (dt.days + 1) * 86400 + 1611763199
    dtb = str(dta - 11 * 86400 + 1)
    dta = str(dta)
    title = []
    publish_time = []
    section = []
    body = []
    source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    loop_flag = False
    for page in range(1, 30):
        print("start collecting Anue page {page}".format(page=page))
        home_url = ("https://api.cnyes.com/media/api/v1/newslist/category/headline"
                    "?limit=30&startAt=" + dtb + "&endAt=" + dta + "&page=" + str(page))
        r = crawler_tool.url_retry_json(home_url)
        time.sleep(5)
        for i in range(len(r["items"]["data"])):
            content_url = "https://news.cnyes.com/news/id/" + str(r["items"]["data"][i]["newsId"])
            r2 = crawler_tool.url_retry(content_url)
            soup = BeautifulSoup(r2, "lxml")
            try:
                r_time = datetime.datetime.strptime(soup.find("time").string, "%Y/%m/%d %H:%M")
                if r_time > b_time:
                    continue
                elif r_time < e_time:
                    loop_flag = True
                    print("---Web Crawler has collected Anue data from {b_time} to {e_time}---".format(
                        b_time=b_time, e_time=e_time))
                    break
                else:
                    # og:title is parsed as "title | source - section".
                    og_title = soup.find("meta", attrs={"property": "og:title"})["content"]
                    section.append(og_title.split("|")[-1].split("-")[-1])
                    title.append(re.sub(r"\s{1,}", "", og_title.split("|")[0]))
                    source.append(og_title.split("|")[-1].split("-")[0])
                    publish_time.append(r_time)
                    body.append(crawler_tool.clean_html("".join(str(x) for x in soup.select("div._2E8y p"))))
                    print("Anue:", r_time)
                    time.sleep(random.uniform(0, 1.5))
            except Exception:
                # Skip articles whose page lacks the expected <time> or og:title tags.
                continue
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/temporarily/" + decide_time_begin + "_" + decide_time_end + "_Anue.csv"
    df.to_csv(file_name, encoding="utf-8")


def ctee_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    Title = []
    Publish_time = []
    Section = []
    Body = []
    Source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    loop_flag = False
    for page in range(1, 11):
        print("start ctee collecting page {page}".format(page=page))
        home_url = "https://m.ctee.com.tw/livenews/all/page/" + str(page)
        time.sleep(5)
        try:
            r = crawler_tool.url_retry(home_url)
            soup = BeautifulSoup(r, "lxml")
            for item in soup.select("p.now-title"):
                content_url = item.find_all("a")[-1]["href"]
                section = item.find("span").string
                # The listing shows only month/day and time, so prepend the year taken from b_time.
                r_time = datetime.datetime.strptime(
                    str(b_time.year) + "/" + crawler_tool.clean_html(str(item.find_all("a")[1]).split("|")[-1]),
                    "%Y/ %m/%d %H:%M ")
                if r_time > b_time:
                    continue
                elif r_time < e_time:
                    loop_flag = True
                    print("collected ctee news from {b_time} to {e_time}".format(b_time=b_time, e_time=e_time))
                    break
                else:
                    r2 = crawler_tool.url_retry(content_url)
                    soup2 = BeautifulSoup(r2, "lxml")
                    if section == "生活" or section == "政治":
                        # Skip lifestyle and politics stories; keep only the other sections.
                        time.sleep(random.uniform(0, 1.5))
                        continue
                    else:
                        Title.append(soup2.select("span.post-title")[0].string)
                        Section.append(section)
                        Source.append("工商時報")
                        Publish_time.append(r_time)
                        Body.append(crawler_tool.clean_html("".join(str(x) for x in soup2.select("div.entry-content p"))))
                        print("ctee:", r_time)
                        time.sleep(random.uniform(0, 1.5))
        except rq.exceptions.RequestException as e2:
            print("home", e2)
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": Title,
        "Time": Publish_time,
        "Section": Section,
        "Source": Source,
        "Body": Body
    }).sort_values(by=["Time"])
    file_name = "D:/User/Desktop/corpus/news/ctee/" + decide_time_begin + "_" + decide_time_end + "_ctee.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)


def rti_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    title = []
    publish_time = []
    section = []
    body = []
    source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    # print(b_time, "\n", e_time)
    loop_flag = False
    for page in range(1, 100):
        print("start collecting Rti page {page}".format(page=page))
        home_url = "https://www.rti.org.tw/news/list/categoryId/2/page/" + str(page)
        r = crawler_tool.url_retry(home_url)
        soup = BeautifulSoup(r, "lxml")
        time.sleep(5)
        for i in range(len(soup.select("div.main_wrapper ul a"))):
            content_url = "https://www.rti.org.tw" + soup.select("div.main_wrapper ul a")[i]["href"]
            r2 = crawler_tool.url_retry(content_url)
            soup2 = BeautifulSoup(r2, "lxml")
            r_time = datetime.datetime.strptime(
                re.sub("[^0-9]", "", soup2.find("li", attrs={"class": "date"}).string),
                "%Y%m%d%H%M")
            if r_time > b_time:
                continue
            elif r_time < e_time:
                loop_flag = True
                print("Web Crawler has collected Rti data from {b_time} to {e_time}".format(
                    b_time=b_time, e_time=e_time))
                break
            else:
                section.append("財經")
                title.append(re.sub(r"\s{1,}", "", soup2.find("title").string.split("-")[0]))
                source.append(soup2.find("title").string.split("-")[-1])
                publish_time.append(r_time)
                body.append(crawler_tool.clean_html("".join(str(x) for x in soup2.select("article p"))))
                print("Rti:", r_time)
                time.sleep(random.uniform(0, 2))
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/rti/" + decide_time_begin + "_" + decide_time_end + "_rti.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)


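# A minimal usage sketch (an assumption, not part of the original project): each
# *_threading crawler takes a Queue and pushes its DataFrame via q.put(df), so the
# sources can be collected in parallel and merged afterwards. The function name
# crawl_all and the example timestamps in the comment below are hypothetical.
import queue
import threading


def crawl_all(decide_time_begin, decide_time_end):
    q = queue.Queue()
    crawlers = [
        chinatime_GET_NEWS_time_threading,
        tvbs_GET_NEWS_time_threading,
        ctee_GET_NEWS_time_threading,
        rti_GET_NEWS_time_threading,
    ]
    threads = [
        threading.Thread(target=f, args=(decide_time_begin, decide_time_end, q))
        for f in crawlers
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # anue_GET_NEWS_time writes its own CSV and takes no queue, so it runs afterwards.
    anue_GET_NEWS_time(decide_time_begin, decide_time_end)
    # Drain the queue and merge the per-source DataFrames by publish time.
    frames = []
    while not q.empty():
        frames.append(q.get())
    return pd.concat(frames, ignore_index=True).sort_values(by=["Time"])


# Note that decide_time_begin is the newer bound and decide_time_end the older one,
# matching the r_time > b_time / r_time < e_time checks in the crawlers above, e.g.:
# combined_df = crawl_all("202102010000", "202101270000")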