Example #1
    def parse_list(self, response: scrapy.http.response.Response):
        # Article links on the list page: one <a> per <h2 class="post-title">.
        art_links_selector = response.xpath('//*[@id="J_main-container"]//h2[@class="post-title"]/a')
        for art_link_selector in art_links_selector:
            link = art_link_selector.xpath('@href').get()
            title = art_link_selector.xpath('text()').get()
            yield {'title': title, 'link': link}

        # Button on the home page that leads to the second page of results
        second_page = response.xpath('//*[@id="J_main-container"]'
                                     '//a[contains(@class, "home-browser-more-btn")]/@href').get()
        if second_page:
            yield response.follow(second_page, callback=self.parse_list)

        # Regular pagination: follow the "Next" link while it is not disabled
        next_page = response.xpath('//*[@id="J_main-container"]//ul[@class="pagination"]'
                       '/li[not(contains(@class, "disabled"))]/a[@aria-label="Next"]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse_list)
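These callbacks assume a surrounding scrapy.Spider; a minimal skeleton under that assumption might look as follows (the class name, start URL, and delegation are placeholders, not part of the original example):

import scrapy


class BlogListSpider(scrapy.Spider):
    # Hypothetical wrapper for the parse_list callback above.
    name = 'blog_list'
    start_urls = ['https://example.com/']  # placeholder start page

    def parse(self, response: scrapy.http.response.Response):
        # Delegate to the list parser from Example #1.
        yield from self.parse_list(response)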
Example #2
    def GetRateInfo(response: scrapy.http.response.Response):
        """
        解析评分信息
        :param response: scrapy返回的response
        :return: 评分人数,评分信息
        """
        rateNumber = int(
            response.xpath("//a[@class='rating_people']/span/text()").
            extract_first(default=0))

        rateDetails_dict = dict()
        for star_num in range(1, 6):
            # Share of ratings at each star level (1-5), e.g. "37.5%" -> 0.375.
            rate = response.xpath(
                "//span[@class='stars{} starstop']/../span[@class='rating_per']/text()"
                .format(star_num)).extract_first(default="0")
            rateDetails_dict.update(
                {star_num: float(rate.strip('%')) / 100.0})
        return rateNumber, rateDetails_dict
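GetRateInfo takes no self, so it is presumably declared as a @staticmethod on the spider; a usage sketch under that assumption (the item keys are mine, not from the original):

    def parse(self, response: scrapy.http.response.Response):
        # Hypothetical callback; assumes GetRateInfo is a @staticmethod of this spider.
        rate_number, rate_details = self.GetRateInfo(response)
        yield {
            'url': response.url,
            'rate_number': rate_number,
            'rate_details': rate_details,
        }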
Example #3
    def GetMovieCountry(response: scrapy.http.response.Response):
        """
        Parse the production country.
        :param response: the response returned by scrapy
        :return: production country/region
        """
        return response.xpath(
            "//div[@id='info']/span[text()='制片国家/地区:'][1]/following-sibling::text()[1]"
        ).extract_first(default="").strip()
Example #4
    def parse(self, response: scrapy.http.response.Response):
        next_page = response.xpath(
            '//div[@class="navigation-wrapper"]/div/a[@class="next"]/@href'
        ).get()
        if next_page:
            print(next_page)
            self.count += 1
            if self.count < 20:
                yield response.follow(next_page, callback=self.parse)

        desc = response.xpath('//meta[@name="description"]/@content').get()
        tags = response.xpath('//span[@class="tag-links"]/a/text()').getall()
        res = self.extractor.extract(response.text)
        yield MeituanArticleSpiderItem(url=response.url,
                                       title=res['title'],
                                       content=res['content'],
                                       tags=tags,
                                       author=res['author'],
                                       publish_time=res['publish_time'])
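The self.extractor.extract(response.text) call matches the interface of GeneralNewsExtractor from the gne package, although that is an assumption; a minimal item class covering the fields yielded above could look like this sketch:

import scrapy


class MeituanArticleSpiderItem(scrapy.Item):
    # Fields inferred from the yield call in the parse() method above.
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    tags = scrapy.Field()
    author = scrapy.Field()
    publish_time = scrapy.Field()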
Example #5
    def _parse_role(response: scrapy.http.response.Response) -> Dict[str, str]:
        """
        Extract Sphinx role from a crawled page.

        Valid roles:
            - function
            - class
            - module

        Args:
            response: Response of the crawled documentation page.

        Returns:
            String containing the role.

        """
        url = response.url

        name_query = "//h1/text()"
        name = response.xpath(name_query).get()

        if url in (
                "https://www.tensorflow.org/api_docs/python/tf",
                "https://www.tensorflow.org/probability/api_docs/python/tfp",
        ):
            return {"name": name, "url": url, "role": "package"}

        section_query = "//h2/text()"
        sections = response.xpath(section_query).getall()

        if "Module" in name.split(": "):
            role = "module"
            name = name.split(": ")[-1]
        elif "Attributes" in sections or "Methods" in sections:
            role = "class"
        else:
            # If the object is not a Module or a Class then it is a function.
            role = "function"

        return {"name": name, "url": url, "role": role}
Example #6
    def parse(self, response: scrapy.http.response.Response) -> scrapy.Request:
        """
        Main scrapy parser

        :param response: scrapy response object
        :return: new scrapy request
        """
        for url in response.xpath(
                '//ul[@class="fl_titlelist"]/li/div[@class="fl_name"]/a/@href'
        ):
            url_val = url.extract()
            if url_val and url_val.strip('/') in self.already_harvested:
                continue
            else:
                yield scrapy.Request(
                    url=url_val,
                    callback=self.parse_item,
                    cb_kwargs={'on_netflix': '/netflix/' in response.url})
        next_url = response.xpath(
            '//li[@class="page-item"]/a[text() = "Next"]/@href').extract_first()
        if next_url:
            yield scrapy.Request(url=next_url, callback=self.parse)
Example #7
    def GetActorsInfo(response: scrapy.http.response.Response):
        """
        Parse the actor information.
        :param response: the response returned by scrapy
        :return: dict mapping actor name to profile link
        """
        try:
            actor_info_list = response.xpath("//span[@class='actor']//a")
            return {
                actor_info.xpath("text()").extract_first():
                actor_info.xpath("@href").extract_first()
                for actor_info in actor_info_list
            }
        except Exception:
            # Fall back to an empty dict if the page layout is unexpected.
            return dict()
Example #8
    def _parse_role(response: scrapy.http.response.Response) -> Dict[str, str]:
        """
        Extract Sphinx role from a crawled page.

        Valid roles:
            - function
            - class
            - module

        Args:
            response: Response of the crawled documentation page.

        Returns:
            String containing the role.

        """
        url = response.url

        name_query = "//h1/text()"
        name = response.xpath(name_query).get()

        if url == "https://www.tensorflow.org/api_docs/python/tf":
            # Return the same dict shape as the other branches.
            return {"name": name, "url": url, "role": "package"}

        class_selector = response.xpath("//h2/text()").get()

        if "Module" in name.split(": "):
            role = "module"
            name = name.split(": ")[-1]
        elif class_selector == "Class ":
            role = "class"
        else:
            # If the object is not a Module or a Class then it is a function.
            role = "function"

        return {"name": name, "url": url, "role": role}
Example #9
    def GetDirectorOrAuthorInfo(key, response: scrapy.http.response.Response):
        """
        Parse the director or screenwriter information.
        :param key: '导演' (director) or '编剧' (screenwriter)
        :param response: the response returned by scrapy
        :return: dict mapping director/screenwriter name to profile link
        """
        try:
            info_list = response.xpath(
                "//div[@id='info']//span[text()='{key}']/following-sibling::span[1]/a"
                .format(key=key))
            return {
                info.xpath("text()").extract_first():
                info.xpath("@href").extract_first()
                for info in info_list
            }
        except Exception:
            # Fall back to an empty dict if the page layout is unexpected.
            return dict()
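The douban-style helpers from Examples #2, #3, #7, and #9 could be combined in one detail-page callback; a sketch under the assumption that they are @staticmethods of the same spider (the item keys are mine):

    def parse_movie(self, response: scrapy.http.response.Response):
        # Hypothetical callback pulling the helper functions together.
        rate_number, rate_details = self.GetRateInfo(response)
        yield {
            'country': self.GetMovieCountry(response),
            'actors': self.GetActorsInfo(response),
            'directors': self.GetDirectorOrAuthorInfo('导演', response),
            'screenwriters': self.GetDirectorOrAuthorInfo('编剧', response),
            'rate_number': rate_number,
            'rate_details': rate_details,
        }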
Example #10
    def parse(self, response: scrapy.http.response.Response, **kwargs):
        data_list = response.xpath('//*[@id="main-container"]/div[2]/ol/li')
        for data in data_list:
            item = CcspiderItem()
            item['title'] = data.xpath('.//p[1]/text()')[2].get().strip()
            item['authors'] = data.xpath('.//p[2]/a/text()').extract()
            date = utils.merge_text(data.xpath('.//p[4]/text()[last()]').get())
            date = date.split(' ')
            item['month'] = utils.month_to_int(date[0])
            item['year'] = int(date[1][:4])
            item['subjects'] = utils.deduplicate(
                data.xpath('.//div/div/span/@data-tooltip').extract())
            item['abstract'] = utils.merge_text(
                data.xpath('.//p[3]/span[3]/text()').get())
            item['citation'] = 0
            yield item
        print('Crawled {} pages ({} records in total)'.format(self.page, self.page * self.size))
        self.page += 1
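A minimal CcspiderItem covering the fields assigned above might be declared as follows (a sketch; the real item class may define more fields):

import scrapy


class CcspiderItem(scrapy.Item):
    # Fields inferred from the parse() callback above.
    title = scrapy.Field()
    authors = scrapy.Field()
    month = scrapy.Field()
    year = scrapy.Field()
    subjects = scrapy.Field()
    abstract = scrapy.Field()
    citation = scrapy.Field()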
Example #11
    def parse_item(self, response: scrapy.http.response.Response,
                   on_netflix) -> CritickerMoviesItem:
        """
        Extract data from given item url

        :param response: scrapy response object
        :param on_netflix: whether the title was reached via a /netflix/ listing
        :return: Criticker Movies item object
        """
        movie_data = CritickerMoviesItem()
        movie_data['on_netflix'] = int(on_netflix)
        movie_data['url'] = response.url.strip('/')
        movie_data['uid'] = self.extract_uid_from_url(movie_data['url'])
        movie_data['type'] = response.xpath(
            '//*[@id="fi_info_type"]/text()').extract_first()
        movie_data['name'] = response.xpath(
            '//h1/span[@itemprop="name"]/text()').extract_first()
        movie_data['date_published'] = response.xpath(
            '//h1/span[@itemprop="datePublished"]/text()').extract_first()
        movie_data['start_date'] = response.xpath(
            '//h1/span[@itemprop="startDate"]/text()').extract_first()
        movie_data['end_date'] = response.xpath(
            '//h1/span[@itemprop="endDate"]/text()').extract_first()
        movie_data['poster_url'] = response.xpath(
            '//div[@id="poster"]/img/@src').extract_first()
        movie_data['description'] = ' '.join([
            _.extract().strip()
            for _ in response.xpath('//span[@itemprop="description"]//text()')
        ]).strip()

        if not movie_data['description']:
            movie_data['description'] = None

        more_info_elem = response.xpath('//div[@id="fi_moreinfo"]')

        more_info_paragraphs = more_info_elem.xpath('./p')

        for para in more_info_paragraphs:
            para_id = para.attrib['id']
            label = self.extract_label_from_id(para_id)
            if 'aka' in label:
                movie_data[label] = response.xpath(
                    '//p[@id="{}"]/text()'.format(para_id)).extract_first()
            else:
                movie_data[label] = self.extract_more_info(para)
        movie_data['trailer_url'] = response.xpath(
            '//div[@id="fi_trailer"]/iframe/@src').extract_first()
        if movie_data['trailer_url'] == 'http://www.youtube.com/watch?v=':
            movie_data['trailer_url'] = None
        movie_data['rss_feed_url'] = response.xpath(
            '//*[@id="fi_titlerss"]/a/@href').extract_first()
        movie_data['avg_percentile'] = response.xpath(
            '//span[@itemprop="ratingValue"]/text()').extract_first()
        movie_data['n_ratings'] = response.xpath(
            '//span[@itemprop="reviewCount"]/text()').extract_first()

        return movie_data
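For reference, a CritickerMoviesItem would need at least the fields assigned in parse_item above; this is only a sketch, and the dynamically labelled "more info" fields produced via extract_label_from_id cannot be enumerated from this snippet:

import scrapy


class CritickerMoviesItem(scrapy.Item):
    # Fields inferred from parse_item(); labels written through
    # extract_label_from_id() would also need Field declarations.
    on_netflix = scrapy.Field()
    url = scrapy.Field()
    uid = scrapy.Field()
    type = scrapy.Field()
    name = scrapy.Field()
    date_published = scrapy.Field()
    start_date = scrapy.Field()
    end_date = scrapy.Field()
    poster_url = scrapy.Field()
    description = scrapy.Field()
    trailer_url = scrapy.Field()
    rss_feed_url = scrapy.Field()
    avg_percentile = scrapy.Field()
    n_ratings = scrapy.Field()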