def parse_engines_page(self, response: HtmlResponse):
    """Follow each engine link on the page to its details page.

    The final three buttons are navigation links, not engines, so
    they are sliced off before following.
    """
    engine_links = response.xpath(
        "//a[@class='btn btn--ghost border']/@href").getall()
    for request in response.follow_all(engine_links[:-3],
                                       callback=self.parse_details_page):
        yield request
def parse_generations_page(self, response: HtmlResponse):
    """Queue a request for every generation listed on the page.

    The trailing two buttons are the 'overview' and 'back' navigation
    links, so they are dropped.
    """
    button_selector = "//a[@class='btn btn--ghost border']/@href"
    generation_links = response.xpath(button_selector).getall()
    yield from response.follow_all(generation_links[:-2],
                                   callback=self.parse_engines_page)
def parse(self, response: HtmlResponse, **kwargs):
    """Parse one job-listing page.

    Follows every posting's details link, and — from page 1 only —
    fans out requests for all remaining listing pages so they are
    crawled in parallel.
    """
    # Which listing kind we are on (internship vs. campus hire),
    # taken from the request URL's query string.
    _job_type = search(r'type=([a-z]+)', response.url).group(1)

    # One request per job-details link on this page.
    detail_links: List[str] = response.xpath(
        '//a[@class="title ellipsis font"]/@href').getall()
    for link in detail_links:
        yield response.follow(url=link,
                              callback=self.parse_details,
                              meta={'job_type': _job_type})

    # Only page 1 is responsible for scheduling the other pages;
    # every other page stops here.
    page_now = search(pattern=r'page=(\d+)', string=response.url).group(1)
    if page_now != '1':
        return

    # Total page count, read from the sixth pagination entry.
    last_page = int(
        response.xpath('//li[@class="number"][6]/text()')[0].get())

    # Schedule pages 2..last_page all at once.
    yield from response.follow_all(
        urls=[f'interns?page={p}&type={_job_type}'
              for p in range(2, last_page + 1)],
        callback=self.parse)
def parse(self, response: HtmlResponse):
    """For each region header (``<h3>``), follow every link inside the
    ``<ul>`` that immediately follows it, passing the region's plain-text
    name to :meth:`parse_lyric` via ``cb_kwargs``.

    Fixes over the original: removed a leftover debug ``print(region)``
    and an unused local (``x``) that duplicated the link query.
    """
    for region_header in response.xpath("/html/body//h3"):
        # Plain-text region name with any inline markup stripped.
        # NOTE(review): BeautifulSoup is called without an explicit
        # parser, which emits GuessedAtParserWarning and lets behavior
        # depend on which parsers are installed — consider passing
        # "html.parser" explicitly; verify output is unchanged first.
        region = BeautifulSoup(region_header.extract()).get_text()
        yield from response.follow_all(
            region_header.xpath("./following-sibling::ul[1]//li//@href"),
            callback=self.parse_lyric,
            cb_kwargs={"region": region})
def parse(self, response: HtmlResponse, **kwargs):
    """Parse a Douban movie page.

    If the URL is a movie details page (contains "subject"), extract
    the movie's fields into a ``Movie`` item and yield it. Otherwise
    the page is a listing: follow every details link on it and, when
    present, the next listing page — both through a proxy.
    """
    # Movie details page
    if "subject" in response.url:
        movie = Movie()
        # Two anchor regions: the info panel and the rating panel.
        info = response.css("div#info")
        rating = response.css("div#interest_sectl")
        movie['title'] = response.css(
            "h1>span[property='v:itemreviewed']::text").get()
        movie['director'] = info.css(
            "a[rel='v:directedBy']::text").getall()
        # '编剧' means 'writer'; the value follows the label span.
        movie['writer'] = info.css(
            "span.pl:contains('编剧')+span>a::text").getall()
        movie['starring'] = info.css("a[rel='v:starring']::text").getall()
        movie['genre'] = info.css(
            "span[property='v:genre']::text").getall()
        # Country/region has no dedicated markup — pull the text
        # between the label span and the following <br> via regex,
        # then split the slash-separated list. None when absent.
        area = info.re('<span class="pl">制片国家/地区:</span>(.*?)<br>')
        movie['area'] = area[0].split("/") if area else None
        # Same pattern for languages ('语言').
        language = info.re('<span class="pl">语言:</span>(.*?)<br>')
        movie['language'] = language[0].split("/") if language else None
        movie['release_date'] = info.css(
            "span[property='v:initialReleaseDate']::text").getall()
        movie['runtime'] = info.css(
            "span[property='v:runtime']::text").getall()
        # Alternative titles ('又名'), slash-separated like area.
        alias = info.re('<span class="pl">又名:</span>(.*?)<br>')
        movie['alias'] = alias[0].split("/") if alias else None
        movie['rating'] = rating.css(
            "strong[property='v:average']::text").get()
        movie['comments_num'] = rating.css(
            "span[property='v:votes']::text").get()
        movie['summary'] = response.css(
            "div#link-report > span[property='v:summary']::text").getall()
        yield movie
    else:
        # Listing page: queue a request per movie details link.
        yield from response.follow_all(
            response.css("div.hd > a::attr(href)"),
            meta={'proxy': get_proxy()})
        # Queue the next listing page, if any.
        next_page = response.css("span.next > a::attr(href)")
        if next_page:
            yield response.follow(next_page[0], meta={'proxy': get_proxy()})
def parse_models_page(self, response: HtmlResponse):
    """Follow each model link to its generations page.

    The very last link points back to the 'overview' page, so it is
    excluded.
    """
    hrefs = response.xpath(
        "//a[@class='btn btn--ghost border']/@href").getall()
    model_links = hrefs[:-1]
    yield from response.follow_all(model_links,
                                   callback=self.parse_generations_page)