Esempio n. 1
0
 def parse_article(self, response):
     """Load a ``NewsCrawlerItem`` from one article response.

     Country, language and stopwords are fixed values; body, headline,
     publication time and category come from the page markup; URL and
     encoding are taken from the response itself.

     Returns:
         The item produced by ``ItemLoader.load_item()``.
     """
     timestamp_format = '%a, %d %B %Y %H:%M:%S %Z'

     def to_iso8601(raw):
         """Return ``raw`` reformatted as ISO 8601, or '' if it cannot be parsed."""
         # strptime() signals failure by raising ValueError (it never
         # returns None), so a missing or malformed date maps to ''.
         try:
             parsed = datetime.strptime(raw, timestamp_format)
         except ValueError:
             return ''
         return parsed.isoformat(sep='T')

     article = ItemLoader(item=NewsCrawlerItem(), response=response)
     article.add_value('country', 'middle east')
     article.add_value('language', 'english')
     article.add_value('stopwords', ['al', 'jazeera'])
     article.nested_css('div.article-body').add_xpath('body', './p//text()')
     article.nested_css('meta[property="og:title"]').add_xpath(
         'headline', './@content')
     # Join the extracted fragments, then convert the result to ISO 8601.
     published_time_in = Compose(Join(), to_iso8601)
     article.nested_css('meta[name="LastModifiedDate"]').add_xpath(
         'published_time',
         './@content',
         published_time_in,
     )
     # 'category' is fed from two places: the visible topic list first,
     # then the ContentType meta tag.
     article.nested_css('span.article-topics').add_xpath(
         'category', './/text()')
     article.nested_css('meta[property="ContentType"]').add_xpath(
         'category', './@content')
     article.add_value('url', response.url)
     article.add_value('encoding', response.encoding)
     return article.load_item()
    def parse_recipe(self, response):
        """Scrape one recipe page into a ``RecipesItem`` and yield it."""
        recipe = ItemLoader(item=RecipesItem(), selector=response)

        # (field, CSS query) pairs evaluated against the whole page, in
        # the order they are loaded.
        page_fields = (
            ('name', 'div.container h1.main-title span ::text'),
            ('image', 'div.product-area div.cuisine-page img::attr(src)'),
            ('proportion', 'div.product-area div.people div.number ::text'),
            ('time_prep',
             'div.product-area div.columns :nth-child(3) div.number ::text'),
            ('time_cook',
             'div.product-area div.columns :last-child div.number ::text'),
            ('category_name',
             'div.main-content li:nth-child(3) a span ::text'),
            ('desc', 'div.product-area div.description p::text'),
        )
        for field, query in page_fields:
            recipe.add_css(field, css=query)

        # (scope, field, CSS query) triples: each query is evaluated
        # relative to the elements matched by its scope selector.
        scoped_fields = (
            ('div.ingredient-area div.ingredient li', 'ingredients', '::text'),
            ('div.preparation li', 'preparation', 'div > p ::text'),
            ('div.name-area div.item', 'areas', 'div.title a ::text'),
        )
        for scope, field, query in scoped_fields:
            recipe.nested_css(scope).add_css(field, css=query)

        yield recipe.load_item()
Esempio n. 3
0
  def parse_single(response):
    """Yield a ``PageItem`` populated from a single detail page."""
    page = ItemLoader(item=PageItem(), response=response)
    page.add_value('url', response.url)

    # Fields selected directly from the page.
    for field, query in (
        ('title', 'div.article-title'),
        ('vintage', 'div.article-title'),
        ('rating', '#points'),
        ('description', 'p.description'),
    ):
      page.add_css(field, query)

    # Primary info list; rows are addressed by position.
    primary = page.nested_css('ul.primary-info')
    row_count = len(primary.selector.css('li.row'))

    primary.add_css('price', 'li.row:nth-child(1) div.info')
    # The designation row is only read when the list has exactly 5 rows.
    if row_count == 5:
      primary.add_css('designation', 'li.row:nth-child(2) div.info')
    primary.add_css('varietal', 'li.row:nth-last-child(3) div.info')

    # Appellation links are addressed from the end of the link list.
    appellation = primary.nested_css('li.row:nth-last-child(2) div.info')
    for field, query in (
        ('subsubregion', 'span a:nth-last-child(4)'),
        ('subregion', 'span a:nth-last-child(3)'),
        ('region', 'span a:nth-last-child(2)'),
        ('country', 'span a:nth-last-child(1)'),
    ):
      appellation.add_css(field, query)

    primary.add_css('winery', 'li.row:nth-last-child(1) div.info')

    # Secondary info list.
    secondary = page.nested_css('ul.secondary-info')
    secondary.add_css('alcohol', 'li.row:nth-child(1) div.info')
    secondary.add_css('category', 'li.row:nth-child(3) div.info')

    yield page.load_item()
 def parse(self, response):
     """Yield a ``RecipeItem`` with name, image, ingredients and steps."""
     recipe = ItemLoader(item=RecipeItem(), selector=response)

     # Fields selected against the whole page.
     recipe.add_css(
         'recipe_name', css='div.container h1.main-title span ::text')
     recipe.add_css(
         'recipe_image',
         css='div.container div.product-area div.image ::attr(src)')

     # Fields selected relative to each matching list item.
     for scope, field, query in (
         ('div.ingredient li', 'recipe_ingredients', '::text'),
         ('div.preparation li', 'recipe_prepa', 'div > p ::text'),
     ):
         recipe.nested_css(scope).add_css(field, css=query)

     yield recipe.load_item()
Esempio n. 5
0
    def parse_rent(self, response: HtmlResponse) -> Iterator[LyEstateRentItem]:
        """Yield a ``LyEstateRentItem`` built from a rental detail page.

        The title, description and images are selected from the whole page;
        the remaining fields come from rows of the ``.dtl_frinfo`` block,
        most of them filtered through ``self._INFO_PATTERN``.
        """
        loader = ItemLoader(item=LyEstateRentItem(), selector=response)

        loader.add_css('title', 'h1::text')

        info_pattern = self._INFO_PATTERN
        # (field, CSS query, regex) for each cell of the info table, in
        # load order. A regex of None means the text is used unfiltered,
        # which is add_css()'s own default.
        info_rows = (
            ('per_month_price', 'tr:nth-child(1) b::text', None),
            # Raw string: '\d' in a plain literal is an invalid escape.
            ('area', 'tr:nth-child(2) td:nth-child(1)::text', r'\d+'),
            ('style', 'tr:nth-child(2) td:nth-child(2)::text', info_pattern),
            ('direction', 'tr:nth-child(3) td:nth-child(1)::text',
             info_pattern),
            ('floor', 'tr:nth-child(3) td:nth-child(2)::text', info_pattern),
            ('house_type', 'tr:nth-child(4) td:nth-child(1) a::text', None),
            ('decoration', 'tr:nth-child(4) td:nth-child(2)::text',
             info_pattern),
            ('age', 'tr:nth-child(5) td::text', info_pattern),
            ('address', 'tr:nth-child(7) td::text', info_pattern),
        )
        info_loader = loader.nested_css('.dtl_frinfo')
        for field, query, pattern in info_rows:
            info_loader.add_css(field, query, re=pattern)

        loader.add_css('description', '.dtl_content *::text')
        loader.add_css('images', '.dtl_pics img::attr("src")')

        yield loader.load_item()
    def parse_forum_post(self, response: HtmlResponse):
        """Yield a user item and a comment item for every post on the page.

        Each ``#postlist div.postaaa`` block produces one
        ``LyCommunityUserItem`` followed by one ``LyCommunityCommentItem``.
        ``post_url`` and ``page`` are read from ``response.meta``.
        """
        for floor, post_block in enumerate(
                response.css('#postlist div.postaaa'), start=1):
            # =============== Parse user data =======================
            user_block = post_block.css('.favatar')
            username = user_block.css('.xw1::text').extract_first()
            user_loader = ItemLoader(item=LyCommunityUserItem(),
                                     selector=user_block,
                                     base_url='http://www.lysq.com/')
            user_loader.add_value('username', username)
            user_loader.add_css('avatar_url', '.avtm img::attr("src")')
            user_loader.add_css('medal_list', 'p.md_ctrl img::attr("alt")')
            # Raw string: '\d' in a plain literal is an invalid escape.
            user_loader.add_css('coin_count', 'dl.pil dd::text', re=r'\d+')
            user_loader.add_css(
                'user_group',
                'a[href^="home.php?mod=spacecp&ac=usergroup"]::text')
            user_loader.add_css('signature', 'p.xg1::text')

            user_data_loader = user_loader.nested_css('div.tns')
            user_data_loader.add_css('topic_count', 'th:nth-child(1) a::text')
            user_data_loader.add_css('post_count', 'th:nth-child(2) a::text')
            # credit_count is collected from two alternative locations.
            user_data_loader.add_css('credit_count', 'td span::attr("title")')
            user_data_loader.add_css('credit_count', 'td a::text')

            yield user_loader.load_item()

            # =============== Parse post comment =================
            comment_block = post_block.css('td.plc')
            content = comment_block.css('.pcb .t_fsz').extract_first()
            comment_loader = ItemLoader(item=LyCommunityCommentItem(),
                                        selector=comment_block,
                                        base_url='http://www.lysq.com/')
            comment_loader.add_value('post_url', response.meta['post_url'])
            comment_loader.add_value('author_username', username)
            comment_loader.add_value('content', content)
            # Image URLs are pulled out of the same raw content markup.
            comment_loader.add_value('image_urls', content, re=RE_IMG_SRC)

            comment_header_loader = comment_loader.nested_css('div.pi')
            comment_header_loader.add_css('floor', 'strong em::text')
            # A computed floor number is added after the scraped one; which
            # value ends up on the item presumably depends on the item's
            # output processor -- confirm against LyCommunityCommentItem.
            comment_header_loader.add_value('floor',
                                            response.meta['page'] * 10 + floor)
            comment_header_loader.add_css('created_time',
                                          'div.authi em::text',
                                          re=RE_DATETIME)
            yield comment_loader.load_item()
Esempio n. 7
0
    def parse_actress_detail_cn(self, response: HtmlResponse):
        """Load the avatar-box fields, then follow the ``/en/`` page.

        The partially loaded ``AvmooActressItem`` is handed to
        ``parse_actress_detail_en`` through the request's ``meta['item']``.
        """
        actress = ItemLoader(item=AvmooActressItem(), selector=response)

        box = actress.nested_css('.avatar-box')
        for field, query in (
            ('avatar', 'img::attr(src)'),
            ('name_cn', 'span::text'),
            ('info', 'p::text'),
        ):
            box.add_css(field, query)

        # Same URL with the language segment switched from /cn/ to /en/.
        english_url = response.url.replace('/cn/', '/en/')
        partial_item = actress.load_item()
        yield response.follow(english_url,
                              callback=self.parse_actress_detail_en,
                              meta={'item': partial_item})
Esempio n. 8
0
 def parse_article(self, response):
     """Load a ``NewsCrawlerItem`` from one article response.

     Country and language are fixed values; body, headline, publication
     time, category and keywords come from the page markup; URL and
     encoding are taken from the response itself.

     Returns:
         The item produced by ``ItemLoader.load_item()``.
     """
     def to_iso8601(raw):
         """Return ``raw`` reformatted as ISO 8601, or '' if unparseable."""
         # Parse once. Depending on the ciso8601 version, a bad timestamp
         # is reported either by returning None or by raising ValueError;
         # both map to ''.
         try:
             parsed = ciso8601.parse_datetime(raw)
         except ValueError:
             return ''
         return '' if parsed is None else parsed.isoformat(sep='T')

     article = ItemLoader(item=NewsCrawlerItem(), response=response)
     article.add_value("country", 'uk')
     article.add_value("language", 'english')
     article.nested_css("div.main-content-column").add_xpath(
         "body", './div/p//text()')
     article.add_xpath(
         "headline", '//head/meta[@property="og:title"]/@content')
     # Join the extracted fragments, then convert the result to ISO 8601.
     time_in = Compose(Join(), to_iso8601)
     article.nested_css('meta[property="article:published_time"]').add_xpath(
         'published_time',
         './@content',
         time_in,
     )
     article.add_xpath(
         "category", '//head/meta[@property="article:section"]/@content')
     article.add_xpath("keywords", '//head/meta[@name="keywords"]/@content')
     article.add_value("url", response.url)
     article.add_value("encoding", response.encoding)
     return article.load_item()
Esempio n. 9
0
    def parse_article(self, response):
        """Load a ``NewsCrawlerItem`` from one article response.

        Country is a fixed value; language, headline, publication time,
        category and body come from the page markup; URL and encoding are
        taken from the response itself.

        Returns:
            The item produced by ``ItemLoader.load_item()``.
        """
        def to_iso8601(raw):
            """Return ``raw`` reformatted as ISO 8601, or '' if unparseable."""
            # Parse once. Depending on the ciso8601 version, a bad
            # timestamp is reported either by returning None or by raising
            # ValueError; both map to ''.
            try:
                parsed = ciso8601.parse_datetime(raw)
            except ValueError:
                return ''
            return '' if parsed is None else parsed.isoformat(sep='T')

        article = ItemLoader(item=NewsCrawlerItem(), response=response)
        article.add_value('country', 'uk')
        article.add_xpath('language', '//html/@lang')
        article.add_xpath(
            'headline', '//head/meta[@property="og:title"]/@content')
        article.add_value('url', response.url)
        # Join the extracted fragments, then convert the result to ISO 8601.
        time_in = Compose(Join(), to_iso8601)
        article.nested_css('meta[property="article:published_time"]').add_xpath(
            'published_time',
            './@content',
            time_in,
        )
        # The first extracted keywords string is split on commas; nothing
        # is added when the tag is absent.
        article.add_xpath(
            'category',
            '//head/meta[@property="keywords"]/@content',
            lambda v: v[0].split(',') if v else None)
        article.add_value('encoding', response.encoding)
        article.nested_css('div[itemprop="articleBody"]').add_xpath(
            'body', './p//text()')

        return article.load_item()
Esempio n. 10
0
 def parse_article(self, response):
     """Load a ``NewsCrawlerItem`` from one article response.

     Country and language are fixed values; headline, publication time,
     category, keywords and body come from the page markup; URL and
     encoding are taken from the response itself.

     Returns:
         The item produced by ``ItemLoader.load_item()``.
     """
     def to_iso8601(raw):
         """Return ``raw`` reformatted as ISO 8601, or '' if unparseable."""
         # Parse once. Depending on the ciso8601 version, a bad timestamp
         # is reported either by returning None or by raising ValueError;
         # both map to ''.
         try:
             parsed = ciso8601.parse_datetime(raw)
         except ValueError:
             return ''
         return '' if parsed is None else parsed.isoformat(sep='T')

     article = ItemLoader(item=NewsCrawlerItem(), response=response)
     article.add_value('country', 'usa')
     article.add_value('language', 'english')
     article.nested_css('meta[property="og:title"]').add_xpath(
         'headline', './@content')
     article.add_value('url', response.url)
     # Join the extracted fragments, then convert the result to ISO 8601.
     time_in = Compose(Join(), to_iso8601)
     article.nested_css('meta[name="pubdate"]').add_xpath(
         'published_time',
         './@content',
         time_in,
     )
     article.add_xpath('category', '//head/meta[@name="section"]/@content')
     # Only the part of the keywords string before " -" is kept.
     article.add_xpath('keywords',
                       '//head/meta[@itemprop="keywords"]/@content',
                       re=r'(.*) -')
     article.add_value('encoding', response.encoding)
     # Body text is the direct text of "zn-*" divs inside the main rail.
     article.nested_css('div.pg-rail-tall__body').nested_css(
         'div.l-container').add_xpath(
             'body', './/div[re:test(@class, "zn-.*")]/text()')
     return article.load_item()
Esempio n. 11
0
    def parse_book_brief(self, response):
        """Yield the book summary item, then a request for its chapter list.

        The loaded ``QuanbenBookItem`` is also passed to
        ``parse_chapter_list`` through the request's ``meta['book_item']``.
        """
        book = ItemLoader(item=QuanbenBookItem(), selector=response)

        # Every summary field is selected inside the div.box container.
        summary = book.nested_css('div.box')
        for field, query in (
            ('cover', 'img::attr(src)'),
            ('name', 'h3 span::text'),
            ('author_name', 'span[itemprop="author"]::text'),
            ('category', 'span[itemprop="category"]::text'),
            ('status', 'p:last-child span::text'),
            ('brief', 'div.description *::text'),
        ):
            summary.add_css(field, query)

        book_item = book.load_item()
        yield book_item

        chapters_href = response.css(
            'div.box a.button.s1::attr(href)').extract_first()
        yield response.follow(chapters_href,
                              callback=self.parse_chapter_list,
                              headers=self._HEADERS,
                              meta={'book_item': book_item})
Esempio n. 12
0
 def parse_article(self, response):
     """Load a ``NewsCrawlerItem`` from one article response.

     Country and language are fixed values; headline, publication time,
     category and body come from the page markup; URL and encoding are
     taken from the response itself.

     Returns:
         The item produced by ``ItemLoader.load_item()``.
     """
     def to_iso8601(raw):
         """Return ``raw`` reformatted as ISO 8601, or '' if unparseable."""
         # Parse once. Depending on the ciso8601 version, a bad timestamp
         # is reported either by returning None or by raising ValueError;
         # both map to ''.
         try:
             parsed = ciso8601.parse_datetime(raw)
         except ValueError:
             return ''
         return '' if parsed is None else parsed.isoformat(sep='T')

     article = ItemLoader(item=NewsCrawlerItem(), response=response)
     article.add_value('country', 'uk')
     article.add_value('language', 'english')
     # The regex keeps only the text before the " - BBC" suffix.
     article.nested_css('meta[property="og:title"]').add_xpath(
         'headline', './@content', re=r'(.*) - BBC')
     article.add_value('url', response.url)
     # Join the regex matches, then convert the result to ISO 8601.
     time_in = Compose(Join(), to_iso8601)
     article.add_xpath(
         'published_time',
         '//*[@id="responsive-news"]/head/script[1]/text()',
         time_in,
         re=r'"datePublished": "(.*)"',
     )
     article.nested_css('meta[property="article:section"]').add_xpath(
         'category', './@content')
     article.add_value('encoding', response.encoding)
     article.nested_css('div.story-body__inner').add_xpath(
         'body', './p//text()')
     # NOTE(review): 'map-body' is a type selector (it matches <map-body>
     # elements); if a CSS class was intended this should be '.map-body'
     # -- confirm against the target markup.
     article.nested_css('map-body').add_xpath('body', './p//text()')
     return article.load_item()
Esempio n. 13
0
    def parse_person(self, response):
        """Load a ``Person`` item from a profile page.

        Input values have their HTML tags removed by default, and each field
        outputs its first value by default.

        Returns:
            The item produced by ``ItemLoader.load_item()``.
        """
        loader = ItemLoader(item=Person(), response=response)
        loader.default_input_processor = processors.MapCompose(
            w3lib.html.remove_tags)
        loader.default_output_processor = processors.TakeFirst()

        loader.add_xpath('name', '//*[@id="profile_header_heading"]/a/text()')
        loader.add_value('url', response.url)
        loader.add_xpath(
            'primary_role',
            '//*[@id="info-card-overview-content"]/div/dl/div/dd')

        # Fields expected: born, gender, location, website
        overview_labels = response.xpath(
            '//*[@id="info-card-overview-content"]/div/dl/dt/text()')
        overview_loader = loader.nested_xpath(
            '//*[@id="info-card-overview-content"]/div/dl')
        # XPath positions are 1-based, hence start=1. The n-th <dt> label
        # is paired with the n-th <dd> value.
        for position, label in enumerate(overview_labels, start=1):
            # Keep the text before the first ':'. partition() leaves a
            # label without a colon intact, whereas slicing with find()
            # would chop off its last character.
            key = label.extract().partition(':')[0].lower()
            try:
                overview_loader.add_xpath(
                    key, 'dd[{}]/text()'.format(position))
            except KeyError:
                # Ignore if key is not in the Item's field
                pass

        loader.add_xpath('facebook',
                         '(//a[contains(@class,"facebook")])[1]/@href')
        loader.add_xpath('twitter',
                         '(//a[contains(@class,"twitter")])[1]/@href')
        loader.add_xpath('linkedin',
                         '(//a[contains(@class,"linkedin")])[1]/@href')
        loader.add_xpath('description', '//*[@id="description"]/span/div')
        loader.add_css('current_jobs', '.current_job')
        loader.add_css('past_jobs', '.past_job')
        loader.nested_css('.advisory_roles').add_xpath('board_advisors',
                                                       './/ul/li')
        loader.nested_css('table.investors').add_xpath(
            'investments', './/tr[not(@class="thead")]')
        loader.nested_css('.education').add_xpath('education', './/ul/li')

        return loader.load_item()
Esempio n. 14
0
    def season_matches_parse(self, response):
        """Parse the match list of one tournament season.

        Handles URLs such as
        https://www.rugbypass.com/{tournament}/matches/{season}.
        For every match found on the page this yields the loaded ``Match``
        item and then a request for that match's stats page, carrying the
        item in ``meta['match']``.
        """
        tournament = response.meta['tournament']
        season = response.meta['season']

        print(
            f"	    Parsing match list for tournament '{tournament}' in season '{season}'"
        )
        self.logger.info(
            f"############### Parsing match list for tournament '{tournament}' in season '{season}' ###############"
        )

        # CSS selectors for the match details, evaluated relative to the
        # element carrying the match's data-id.
        field_selectors = {
            'match_id': ' ::attr(data-id)',
            'date': ' ::attr(data-date)',
            'home_team_id': " ::attr(data-home)",
            'away_team_id': " ::attr(data-away)",
            'status': " ::attr(data-status)",
            'start_time': " [itemprop='startDate']::attr(content)",
            'end_time': " [itemprop='endDate']::attr(content)",
            'performers': " [itemprop='performer']::attr(content)",
            'description': " [itemprop='description']::attr(content)",
            'rugbypass_price': " [itemprop='price']::attr(content)",
            'rugbypass_price_curr':
            " [itemprop='priceCurrency']::attr(content)",
            'title': "a.link-box::attr(href)",
        }

        listed_ids = response.css(
            "div[class=game-round] div[itemscope] ::attr(data-id)").getall()

        for match_id in listed_ids:
            match_scope = f"[data-id='{match_id}']"

            # One loader per match, narrowed to that match's own element.
            page_loader = ItemLoader(item=Match(), response=response)
            match_loader = page_loader.nested_css(match_scope)
            for field, selector in field_selectors.items():
                match_loader.add_css(field, selector)

            match_loader.add_value('tournament', tournament)
            match_loader.add_value('season', season)

            match = match_loader.load_item()
            yield match

            # Request the stats page derived from the match's own link.
            match_href = response.css(
                f"{match_scope} a.link-box::attr(href)").get()
            stats_url = urljoin(match_href, f"{season}/stats")
            yield response.follow(url=stats_url,
                                  callback=self.match_stats_page_parse,
                                  meta={'match': match})