def crawler_FRB():
    """Scrape the 2016 FRB event-list page and append each press event to csv_frb.csv.

    For every event row found, extracts date, link, title, type and full
    content (via get_content) and writes one CSV row. Rows whose markup does
    not match the expected structure are skipped with a logged message.
    """
    html = urlopen(url_frb_2016)
    bsObj = BeautifulSoup(html, "html.parser")
    events_list_obj = bsObj.find("div", {"class": "row eventlist"}).find(
        "div", {"class": "col-xs-12 col-sm-8 col-md-8"})
    event_rows_obj = events_list_obj.findAll("div", {"class": "row"})
    # newline="" is required by the csv module to avoid blank rows on Windows.
    # NOTE(review): "a" mode means the header row is re-written on every run;
    # confirm whether the header should be written only when the file is new.
    with open(base_dir + "csv_frb.csv", "a", newline="") as fw:
        csvwriter = csv.writer(fw)
        csvwriter.writerow(["title", "href", "date", "type", "content"])
        for event_row_obj in event_rows_obj:
            try:
                news = News()
                date_obj = event_row_obj.find(
                    "div", {"class": "col-xs-3 col-md-2 eventlist__time"})
                news.date = date_obj.find("time").text
                event_obj = event_row_obj.find(
                    "div", {"class": "col-xs-9 col-md-10 eventlist__event"})
                news.href = url_domain_frb + event_obj.find("a").attrs['href']
                news.title = event_obj.find("p").find("a").find("em").text
                news.type = event_obj.find("p", {
                    "class": "eventlist__press"
                }).find("em").find("strong").text
                news.content = get_content(news.href)
                r = [news.title, news.href, news.date, news.type, news.content]
                csvwriter.writerow(r)
            # Best-effort scraping: a row with unexpected markup raises
            # AttributeError (find() returned None) or KeyError (missing
            # 'href'); skip it but log the cause instead of swallowing
            # everything with a bare except.
            except (AttributeError, KeyError) as e:
                print("except..", e)
def crawler_PBOC():
    """Scrape PBOC news-list index pages (URLs read from news_list_indexes_file)
    and write one CSV row (title, href, date, content) per news item to out_file.

    Items whose markup does not match the expected structure are skipped with
    a logged message.
    """
    with open(news_list_indexes_file, "r") as fr:
        # newline="" is required by the csv module to avoid blank rows on Windows.
        with open(out_file, "w", newline="") as fw:
            csvwriter = csv.writer(fw)
            csvwriter.writerow(["title", "href", "date", "content"])
            for index_url in fr.readlines():
                # readlines() keeps the trailing newline; urlopen rejects a
                # URL containing "\n", so strip it first.
                html = urlopen(index_url.strip())
                bsObj = BeautifulSoup(html, "lxml")
                news_objs = bsObj.find("div", {"class": "mainw950"})\
                    .find("div", {"opentype": "page"}).find("td", {"colspan": "2"})\
                    .find("div", {"id": "r_con"}).find("div", {"class": "portlet"})\
                    .find("div", {"style": "height:480px"}).find("table").find("td").findAll("table")
                for news_obj in news_objs:
                    try:
                        news = News()
                        # .text was missing: without it the bs4 Tag object
                        # (full markup) was written to the CSV instead of the
                        # date string. Matches crawler_FRB's date handling.
                        news.date = news_obj.find("span", {"class": "hui12"}).text
                        news.href = url_domain_pboc + news_obj.find(
                            "a").attrs['href']
                        news.title = news_obj.find("a").text
                        # NOTE(review): "getget_content" looks like a typo for
                        # the get_content helper used by crawler_FRB — confirm
                        # which helper actually exists and unify.
                        news.content = getget_content(news.href)
                        r = [news.title, news.href, news.date, news.content]
                        csvwriter.writerow(r)
                    # Best-effort scraping: unexpected markup raises
                    # AttributeError (find() returned None) or KeyError
                    # (missing 'href'); skip the item but log the cause
                    # instead of swallowing everything with a bare except.
                    except (AttributeError, KeyError) as e:
                        print("except..", e)