# Excerpted Scrapy spider callbacks. Imports they rely on (PhilippinesItem and
# Util are project-local: the item definition and a time-formatting helper).
import scrapy
from bs4 import BeautifulSoup as bs


def get_news_detail(self, response):
    """Parse a pna.gov.ph article page into a PhilippinesItem."""
    item = PhilippinesItem()
    soup = bs(response.text, 'html.parser')

    title = soup.find("h1").text if soup.find("h1") else None

    # The publication date is rendered like "January 13, 2021, 1:19 pm".
    pub_time_div = soup.find("div", class_="col-sm-6 cell-1")
    date_span = pub_time_div.find("span", class_="date") if pub_time_div else None
    pub_time = Util.format_time4(date_span.get_text()) if date_span else None

    # The lead image, when present, sits in a right-aligned <figure>.
    image_figure = soup.find("figure", class_="image align-right")
    image = image_figure.select_one("img").get("src") if image_figure else None

    # Skip the first <p>, which is not part of the article body on this site.
    body = ''
    for p in soup.find("div", class_="page-content").select("p")[1:]:
        body += p.text + '\n'

    item["title"] = title
    item["pub_time"] = pub_time
    item["images"] = image
    item["body"] = body
    item["abstract"] = ''
    item["category2"] = ''
    yield item
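# Util.format_time4 is project-local and not shown here. A minimal sketch of
# what it plausibly does, assuming it normalizes the site's display dates
# ("January 13, 2021, 1:19 pm") to "YYYY-MM-DD HH:MM:SS"; the name
# format_time4_sketch and the output format are assumptions, not taken from
# the source.
from datetime import datetime


def format_time4_sketch(raw):
    """Hypothetical stand-in for Util.format_time4."""
    dt = datetime.strptime(raw.strip(), "%B %d, %Y, %I:%M %p")
    return dt.strftime("%Y-%m-%d %H:%M:%S")

# format_time4_sketch("January 13, 2021, 1:19 pm") -> "2021-01-13 13:19:00"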
def get_news_detail(self, response):
    """Parse an article page from the tagDiv-themed site into a PhilippinesItem."""
    item = PhilippinesItem()
    soup = bs(response.text, 'html.parser')

    title_tag = soup.find("h1", class_="tdb-title-text")
    title = title_tag.text if title_tag else None

    time_tag = soup.find("time", class_="entry-date updated td-module-date")
    pub_time = Util.format_time4(time_tag.text) if time_tag else None

    # The last five <p> tags on this template are not article text, so drop them.
    body = ''
    for p in soup.find_all("p")[:-5]:
        body += p.text + '\n'

    item["title"] = title
    item["pub_time"] = pub_time
    item["images"] = ''
    item["body"] = body
    item["abstract"] = ''
    item["category2"] = ''
    yield item
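# PhilippinesItem is defined in the project's items.py and not shown here.
# A minimal sketch consistent with the six fields both detail parsers set;
# the field list is read off the assignments above, the class body otherwise
# assumed.
class PhilippinesItemSketch(scrapy.Item):
    title = scrapy.Field()
    pub_time = scrapy.Field()
    images = scrapy.Field()
    body = scrapy.Field()
    abstract = scrapy.Field()
    category2 = scrapy.Field()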
def get_news_url(self, response):
    """Collect article links from a tagDiv-themed list page, then paginate."""
    soup = bs(response.text, 'html.parser')

    div_list = soup.find_all(
        "div", class_="tdb_module_loop td_module_wrap td-animation-stack")
    for div in div_list:
        news_url = div.find(
            "h3", class_="entry-title td-module-title").select_one("a").get("href")
        yield scrapy.Request(news_url, callback=self.get_news_detail)

    # The pager's second-to-last link shows the last page number, rendered
    # with thousands separators (e.g. "1,234"), so strip the commas.
    div = soup.find("div", class_="page-nav td-pb-padding-side")
    a_list = div.select("a")
    last_page_num = int(a_list[-2].text.replace(",", ""))

    # Drop the trailing "/page/N/" from the pager link to get the base URL.
    first_page_url = a_list[-1].get("href").rsplit("/", 3)[0]

    # Keep paginating only while the newest article on this page is not older
    # than the cutoff.
    page_time = soup.find("time", class_="entry-date updated td-module-date")
    if self.time is None or Util.format_time3(
            Util.format_time4(page_time.get_text())) >= int(self.time):
        page_num = 1
        while page_num < last_page_num:
            page_num += 1
            next_url = first_page_url + "/page/" + str(page_num) + "/"
            yield scrapy.Request(next_url,
                                 meta=response.meta,
                                 callback=self.get_news_url)
    else:
        self.logger.info('time cutoff reached')
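# Util.format_time3 is also project-local. Both get_news_url callbacks only
# need it to map a formatted timestamp to something comparable with
# int(self.time); a plausible sketch, assuming self.time is a Unix timestamp
# (that encoding is an assumption, not confirmed by the source).
from datetime import datetime


def format_time3_sketch(formatted):
    """Hypothetical stand-in for Util.format_time3: timestamp string -> int."""
    return int(datetime.strptime(formatted, "%Y-%m-%d %H:%M:%S").timestamp())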
def get_news_url(self, response):
    """Collect article links from a pna.gov.ph list page, then follow the
    '>' link to the next page."""
    soup = bs(response.text, 'html.parser')

    div = soup.find("div", class_="articles")
    for d in div.find_all("div", class_="article media"):
        news_url = "http://www.pna.gov.ph" + d.select_one("a").get("href")
        yield scrapy.Request(news_url, callback=self.get_news_detail)

    # Keep paginating only while the newest article on this page is not older
    # than the cutoff.
    date_span = soup.find("span", class_="date")
    if self.time is None or Util.format_time3(
            Util.format_time4(date_span.get_text())) >= int(self.time):
        # The second-to-last <li> in the pager holds the '>' (next page) link.
        nav = soup.find("nav", class_="pagination-area")
        li_list = nav.select("ul > li")
        next_a = li_list[-2].select_one("a")
        next_url = ("http://www.pna.gov.ph" + next_a.get("href")
                    if next_a and next_a.get("href") else None)
        if next_url:
            yield scrapy.Request(next_url,
                                 meta=response.meta,
                                 callback=self.get_news_url)
    else:
        self.logger.info('time cutoff reached')
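# How the callbacks chain together: a minimal spider scaffold, assuming the
# class name, spider name, and start URL shown here (all hypothetical; only
# the callback names and the self.time cutoff attribute come from the code
# above, and the excerpted callbacks are attached to this class as methods).
# List pages go through get_news_url, which schedules article pages for
# get_news_detail and follows pagination until the time cutoff is hit.
class PnaSpiderSketch(scrapy.Spider):
    name = "pna_sketch"  # hypothetical spider name

    def __init__(self, time=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.time = time  # optional cutoff, compared via Util.format_time3

    def start_requests(self):
        # Example category URL; the real start pages are site-specific.
        yield scrapy.Request("http://www.pna.gov.ph/categories/national",
                             callback=self.get_news_url)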