# コード例 #1 (Code example #1)
    def parse_engines_page(self, response: HtmlResponse):
        """Follow every engine link on this page to its details page.

        The last three buttons are navigation links rather than engines,
        so they are sliced off before following.
        """
        engine_links = response.xpath(
            "//a[@class='btn btn--ghost border']/@href").getall()[:-3]
        yield from response.follow_all(engine_links,
                                       callback=self.parse_details_page)
# コード例 #2 (Code example #2)
    def parse_generations_page(self, response: HtmlResponse):
        """Follow every generation link on this page to its engines page.

        The final two buttons are the 'overview' and 'back' navigation
        links, so they are dropped before following.
        """
        generation_links = response.xpath(
            "//a[@class='btn btn--ghost border']/@href").getall()[:-2]
        yield from response.follow_all(generation_links,
                                       callback=self.parse_engines_page)
# コード例 #3 (Code example #3)
    def parse(self, response: HtmlResponse, **kwargs):
        """Parse a listing page: queue detail pages, then fan out pagination."""
        # Which listing flavour this is (internship / campus recruitment),
        # recovered from the request URL.
        job_type = search(r'type=([a-z]+)', response.url).group(1)

        # Queue a request for every job-detail link on this page.
        for detail_link in response.xpath(
                '//a[@class="title ellipsis font"]/@href').getall():
            yield response.follow(url=detail_link,
                                  callback=self.parse_details,
                                  meta={'job_type': job_type})

        # Pagination fan-out only happens from page 1; every other page was
        # already scheduled by it, so those pages stop here.
        page_no = search(pattern=r'page=(\d+)', string=response.url).group(1)
        if page_no != '1':
            return

        # Total page count, read from the sixth pagination item.
        last_page = int(
            response.xpath('//li[@class="number"][6]/text()')[0].get())
        # Schedule pages 2..last_page all at once so they crawl in parallel.
        yield from response.follow_all(
            urls=[f'interns?page={p}&type={job_type}'
                  for p in range(2, last_page + 1)],
            callback=self.parse)
# コード例 #4 (Code example #4)
    def parse(self, response: HtmlResponse):
        """For each region header (<h3>), follow every lyric link in the
        list that immediately follows it, tagging each request with the
        plain-text region name via cb_kwargs.
        """
        regions = response.xpath("/html/body//h3")
        for region_header in regions:
            # Strip the markup from the header to get the region name.
            # NOTE(review): BeautifulSoup is called without an explicit
            # parser, which emits a GuessedAtParserWarning and may behave
            # differently across machines -- consider
            # BeautifulSoup(..., "html.parser").
            region = BeautifulSoup(region_header.extract()).get_text()
            print(region)  # progress/debug output

            # (Removed dead local: the original ran the same
            # following-sibling xpath into an unused variable.)
            yield from response.follow_all(
                region_header.xpath("./following-sibling::ul[1]//li//@href"),
                callback=self.parse_lyric,
                cb_kwargs={"region": region})
# コード例 #5 (Code example #5)
    def parse(self, response: HtmlResponse, **kwargs):
        """Handle both page kinds: a movie detail page yields a populated
        Movie item; a listing page yields follow-up requests.
        """
        # Detail pages live under /subject/...; everything else is a listing.
        if "subject" in response.url:
            item = Movie()
            info_box = response.css("div#info")
            rating_box = response.css("div#interest_sectl")

            item['title'] = response.css(
                "h1>span[property='v:itemreviewed']::text").get()
            item['director'] = info_box.css(
                "a[rel='v:directedBy']::text").getall()
            item['writer'] = info_box.css(
                "span.pl:contains('编剧')+span>a::text").getall()
            item['starring'] = info_box.css(
                "a[rel='v:starring']::text").getall()
            item['genre'] = info_box.css(
                "span[property='v:genre']::text").getall()

            # Country, language and alias only exist as plain text after a
            # label span, so they are pulled out with regexes and split on
            # "/" into lists (None when the label is absent).
            countries = info_box.re('<span class="pl">制片国家/地区:</span>(.*?)<br>')
            item['area'] = countries[0].split("/") if countries else None
            langs = info_box.re('<span class="pl">语言:</span>(.*?)<br>')
            item['language'] = langs[0].split("/") if langs else None

            item['release_date'] = info_box.css(
                "span[property='v:initialReleaseDate']::text").getall()
            item['runtime'] = info_box.css(
                "span[property='v:runtime']::text").getall()

            aka = info_box.re('<span class="pl">又名:</span>(.*?)<br>')
            item['alias'] = aka[0].split("/") if aka else None

            item['rating'] = rating_box.css(
                "strong[property='v:average']::text").get()
            item['comments_num'] = rating_box.css(
                "span[property='v:votes']::text").get()
            item['summary'] = response.css(
                "div#link-report > span[property='v:summary']::text").getall()
            yield item
        else:
            # Listing page: request every movie detail link on it...
            yield from response.follow_all(
                response.css("div.hd > a::attr(href)"),
                meta={'proxy': get_proxy()})

            # ...and the next listing page, when one exists.
            next_link = response.css("span.next > a::attr(href)")
            if next_link:
                yield response.follow(next_link[0],
                                      meta={'proxy': get_proxy()})
# コード例 #6 (Code example #6)
 def parse_models_page(self, response: HtmlResponse):
     """Follow each model link on this page to its generations page.

     The final button links back to the 'overview' page, so it is
     dropped before following.
     """
     model_links = response.xpath(
         "//a[@class='btn btn--ghost border']/@href").getall()[:-1]
     yield from response.follow_all(model_links,
                                    callback=self.parse_generations_page)