Esempio n. 1
0
    def parse_estation(self, response):
        """Load an ``Estacion`` item from a station page and yield it.

        Fields populated: ``linea`` (taken from
        ``response.meta['clave_linea']``), ``nombre`` (header cell of the
        first table inside ``#mw-content-text``), ``url_page`` (path
        component of ``response.url``) and the latitude/longitude in both
        DMS and decimal form.

        Raises:
            KeyError: if ``response.meta`` carries no ``clave_linea`` entry.
        """
        estacion_item_loader = ItemLoader(item=Estacion(), response=response)

        # General information.
        estacion_item_loader.add_value('linea', response.meta['clave_linea'])
        estacion_item_loader.add_xpath(
            'nombre',
            '//*[@id="mw-content-text"]/div/table[1]/tr[1]/th/text()')
        estacion_item_loader.add_value('url_page',
                                       urlparse(response.url).path)

        # Coordinates, degrees/minutes/seconds form.
        estacion_item_loader.add_css('latitud_dms', '.geo-dms .latitude::text')
        estacion_item_loader.add_css('longitud_dms',
                                     '.geo-dms .longitude::text')

        # Coordinates, decimal form.
        estacion_item_loader.add_css('latitud_dec', '.geo-dec .latitude::text')
        estacion_item_loader.add_css('longitud_dec',
                                     '.geo-dec .longitude::text')

        # NOTE: the connection infobox ('.infobox table': image cell and the
        # up/down direction links) used to be parsed here, but none of the
        # extracted values were stored anywhere, and the unguarded indexing
        # ([0], td[3], td[5]) raised IndexError on pages without that layout,
        # which prevented the item from being yielded at all. Reintroduce it
        # with bounds checks once the results are actually loaded into the
        # item.

        yield estacion_item_loader.load_item()
Esempio n. 2
0
    def parse_learn_more(self, response):
        """Load a ``CPCCandidate`` item from a candidate page and yield it.

        ``riding`` comes from ``response.meta``; everything else is read
        from the page through XPath expressions.
        """
        loader = ItemLoader(item=ei.CPCCandidate(), selector=response)
        loader.add_value("riding", response.meta.get("riding"))

        # Title cell: expressions below are relative to this div.
        title_fields = (
            ("name", "./h1/text()"),
            ("nomination_dt", ".//p[@class='nomination_date']/text()"),
            ("cabinet_position", "./p[not(@class)]/text()"),
        )
        title_loader = loader.nested_xpath("//div[@class='cell text-center']")
        for field_name, xpath in title_fields:
            title_loader.add_xpath(field_name, xpath)

        # Social-share section: one link per data-type attribute.
        social_fields = (
            ("donate", ".//a[@data-type='donate']/@href"),
            ("website", ".//a[@data-type='website']/@href"),
            ("facebook", ".//a[@data-type='facebook']/@href"),
            ("twitter", ".//a[@data-type='twitter']/@href"),
            ("instagram", ".//a[@data-type='instagram']/@href"),
        )
        social_loader = loader.nested_xpath(
            "//section[@class='section section--social-share']")
        for field_name, xpath in social_fields:
            social_loader.add_xpath(field_name, xpath)

        # Page-level fields.
        loader.add_xpath("photo", "//img[@class='team-bio-image']/@src")
        loader.add_xpath(
            "bio", "//section[@class='section section--text-block']//text()")

        yield loader.load_item()
Esempio n. 3
0
    def fetchLatestRadio(self, response):
        """Load a ``Radio`` item from the response, log it and return it.

        The ``name`` field is read from the ``p`` children of the element(s)
        whose class contains ``album-details``. The loaded item is passed to
        ``self.showRadioDetails`` before being returned.
        """
        opening_banner = (
            "[%s]=====================Saavn Radio list ========================================"
            % self.loggerName)
        self.logger.debug(opening_banner)

        # Scope the lookup to the album-details block through a nested loader.
        details_loader = ItemLoader(
            item=Radio(), response=response,
        ).nested_xpath('//div[contains(@class, "album-details")]')
        details_loader.add_xpath('name', 'p/text()')

        radio = details_loader.load_item()
        self.showRadioDetails(radio)

        closing_banner = (
            "[%s]============================================================="
            % self.loggerName)
        self.logger.debug(closing_banner)
        return radio
Esempio n. 4
0
    def parse_product(self, response):
        """Fill the item passed in ``response.meta["item"]`` from a product page.

        The loader's selector starts at the ``section.row`` element and is
        then re-pointed at the accordion panels and at the review blocks, so
        each group of relative XPaths is evaluated in its own scope.

        Returns:
            The loaded item.
        """
        page = response.selector
        product_section = page.xpath('//section[@class="row"]')
        loader = ItemLoader(item=response.meta["item"],
                            selector=product_section)

        # Scope 1: the product section.
        for field_name, xpath in (
            ('detail_name', './/h1[@itemprop="name"]/text()'),
            ('brand', './/h5[@itemprop="brand"]/text()'),
            ('description',
             './/div[@class="col-xs-12 col-sm-12 col-md-12 col-lg-12"]/p/text()'),
        ):
            loader.add_xpath(field_name, xpath)

        # SKU <option> elements carry the price and the size label.
        sku_loader = loader.nested_xpath('//select[@id="__sku"]/option')
        sku_loader.add_xpath('price', './/@data-priceformat')
        sku_loader.add_xpath('size_format', './/text()')

        # Scope 2: the accordion panels.
        loader.selector = page.xpath(
            '//div[@id="accordion"]/div[@class="panel panel-default"]')
        for field_name, xpath in (
            ('detail_description',
             './/div[@id="collapseOne"]/div/descendant-or-self::*/text()'),
            ('detail_ingredients',
             './/div[@id="collapseTwo"]/div/descendant-or-self::*/text()'),
            ('nutritional_facts',
             './/div[@id="collapseThree"]/div/descendant-or-self::*/text()'),
            ('nutritional_facts_img_url',
             './/*[@id="collapseThree"]/div/p/img/@src'),
        ):
            loader.add_xpath(field_name, xpath)

        # Scope 3: the review blocks.
        loader.selector = page.xpath('//*[@id="review"]/div/div/div')
        loader.add_xpath('customer_review_header',
                         './/h3[@class="panel-title"]/text()')
        # One rating string per review block: its label texts joined together.
        loader.add_value(
            'customer_review_rating',
            [''.join(review.xpath('.//label/text()').getall())
             for review in loader.selector])
        loader.add_xpath(
            'customer_review',
            './/blockquote[@class="blockquote-reverse"]/p/text()')

        self.log(f'finished parsing product page {response.url}')
        return loader.load_item()
Esempio n. 5
0
    def parse_person(self, response):
        """Load a ``Person`` item from a profile page and return it.

        The loader strips HTML tags from every input value and keeps only the
        first value of each field on output. Overview entries are discovered
        dynamically: each ``<dt>`` label becomes a field name and the
        ``<dd>`` at the same position supplies its value.
        """
        loader = ItemLoader(item=Person(), response=response)
        loader.default_input_processor = processors.MapCompose(
            w3lib.html.remove_tags)
        loader.default_output_processor = processors.TakeFirst()

        loader.add_xpath('name', '//*[@id="profile_header_heading"]/a/text()')
        loader.add_value('url', response.url)
        loader.add_xpath(
            'primary_role',
            '//*[@id="info-card-overview-content"]/div/dl/div/dd')

        # Fields expected: born, gender, location, website
        overview_labels = response.xpath(
            '//*[@id="info-card-overview-content"]/div/dl/dt/text()'
        ).extract()
        overview_loader = loader.nested_xpath(
            '//*[@id="info-card-overview-content"]/div/dl')
        # XPath positions are 1-based, hence start=1.
        for position, label in enumerate(overview_labels, start=1):
            # Keep the text before the first colon. partition() leaves the
            # label intact when there is no colon at all, whereas slicing
            # with find() == -1 silently dropped the last character.
            # Surrounding whitespace is removed so the key can match a field.
            key = label.partition(':')[0].strip().lower()
            if not key:
                continue
            try:
                overview_loader.add_xpath(key,
                                          'dd[{}]/text()'.format(position))
            except KeyError:
                # Ignore if key is not in the Item's field
                pass

        loader.add_xpath('facebook',
                         '(//a[contains(@class,"facebook")])[1]/@href')
        loader.add_xpath('twitter',
                         '(//a[contains(@class,"twitter")])[1]/@href')
        loader.add_xpath('linkedin',
                         '(//a[contains(@class,"linkedin")])[1]/@href')
        loader.add_xpath('description', '//*[@id="description"]/span/div')
        loader.add_css('current_jobs', '.current_job')
        loader.add_css('past_jobs', '.past_job')
        loader.nested_css('.advisory_roles').add_xpath('board_advisors',
                                                       './/ul/li')
        loader.nested_css('table.investors').add_xpath(
            'investments', './/tr[not(@class="thead")]')
        loader.nested_css('.education').add_xpath('education', './/ul/li')

        return loader.load_item()
    def parse(self, response):
        """Load a ``DetailedTeamStatItem`` from a team page and yield it.

        Each field is read with one XPath expression. The expressions are
        grouped into tables and registered in table order, each group on the
        loader noted next to it: the page-level ``loader`` or the nested
        ``team_spacing_loader``, whose relative expressions are evaluated
        against the element(s) whose class contains ``team``.
        """
        loader = ItemLoader(DetailedTeamStatItem(), response=response)
        team_spacing_loader = loader.nested_xpath(
            ".//div[contains(@class, 'team')]")

        # NOTE: datetime.utcnow() returns a naive datetime and is deprecated
        # since Python 3.12; kept so the stored value's type does not change.
        loader.add_value('last_modified', datetime.utcnow())

        # GENERAL CLUB INFORMATION + GENERAL TEAM STATS (page-level loader)
        club_fields = (
            ('id', ".//div[@class='info']/h1/text()"),
            ('club_name', ".//div[@class='info']/h1/text()"),
            ('division',
             ".//div[contains(@class, 'meta')]//a[last()]/text()"),
            ('club_logo', ".//div[contains(@class, 'card')]/img/@data-src"),
            ('flag', ".//div[contains(@class, 'meta')]//img/@data-src"),
            ('overall',
             ".//div[contains(@class, 'stats')]/div/div[1]/span/text()"),
            ('attack',
             ".//div[contains(@class, 'stats')]/div/div[2]/span/text()"),
            ('midfield',
             ".//div[contains(@class, 'stats')]/div/div[3]/span/text()"),
            ('defence',
             ".//div[contains(@class, 'stats')]/div/div[4]/span/text()"),
        )

        # DETAILED TEAM STATS, part 1 (nested loader)
        team_info_fields = (
            # Note: this stat seems to be missing as of 06/17/2019
            ('home_stadium',
             "./ul/li/following::label[contains(., 'Home Stadium')]"
             "/following::text()[1]"),
            ('rival_team',
             "./ul/li/following::label[contains(., 'Rival Team')]"
             "/following::a[1]/@href"),
            ('international_prestige',
             "./ul/li/following::label[contains(., 'International Prestige')]"
             "/following::span[1]/text()"),
            ('domestic_prestige',
             "./ul/li/following::label[contains(., 'Domestic Prestige')]"
             "/following::span[1]/text()"),
            ('transfer_budget',
             "./ul/li/following::label[contains(., 'Domestic Prestige')]"
             "/following::label[contains(., 'Transfer Budget')]"
             "/following::text()[1]"),
            ('starting_xi_average_age',
             "./ul/li/following::label[contains(., 'Starting XI Average Age')]"
             "/following::text()[1]"),
            ('whole_team_average_age',
             "./ul/li/following::label[contains(., 'Whole Team Average Age')]"
             "/following::text()[1]"),
            ('captain',
             "./ul/li/following::label[contains(., 'Captain')]"
             "/following::a[1]/@href"),
        )

        # DETAILED TEAM STATS, part 2 (page-level loader). The labels overlap
        # ('Short Free Kick' is a substring of the left/right variants), so
        # each expression keeps only its first match.
        free_kick_fields = (
            ('short_free_kick',
             "(.//div[contains(@class, 'team')]/ul/li"
             "/following::label[contains(., 'Short Free Kick')]"
             "/following::a[1])[1]/@href"),
            ('long_free_kick',
             "(.//div[contains(@class, 'team')]/ul/li"
             "/following::label[contains(., 'Long Free Kick')]"
             "/following::a[1])[1]/@href"),
            ('left_short_free_kick',
             "(.//div[contains(@class, 'team')]/ul/li"
             "/following::label[contains(., 'Left Short Free Kick')]"
             "/following::a[1])[1]/@href"),
            ('right_short_free_kick',
             "(.//div[contains(@class, 'team')]/ul/li"
             "/following::label[contains(., 'Right Short Free Kick')]"
             "/following::a[1])[1]/@href"),
        )

        # DETAILED TEAM STATS, part 3 (nested loader)
        set_piece_fields = (
            ('penalties',
             "./ul/li/following::label[contains(., 'Penalties')]"
             "/following::a[1]/@href"),
            ('left_corner',
             "./ul/li/following::label[contains(., 'Left Corner')]"
             "/following::a[1]/@href"),
            ('right_corner',
             "./ul/li/following::label[contains(., 'Right Corner')]"
             "/following::a[1]/@href"),
            ('starting_xi', ".//div[contains(@class, 'lineup')]/div/a/@href"),
        )

        # TACTICS (page-level loader). 'Team Width', 'Passing' and
        # 'Positioning' are each looked up twice on the page; the trailing
        # [1] / [2] index selects which occurrence feeds which field.
        tactics_fields = (
            ('defence_defensive_style',
             ".//dl//span/preceding::dd[text()='Defensive Style']/span/span/"
             "text()"),
            ('defence_team_width',
             "(.//dl//span/preceding::span[text()='Team Width']"
             "/following::span[1]/span/text())[1]"),
            ('defence_depth',
             ".//dl//span/preceding::span[text()='Depth']/following::span[1]"
             "/span/text()"),
            ('offense_offensive_style',
             ".//dl//span/preceding::dd[text()='Offensive Style']/span/span/"
             "text()"),
            ('offense_width',
             ".//dl//span/preceding::span[text()='Width']/following::span[1]"
             "/span/text()"),
            ('offense_players_in_box',
             ".//dl//span/preceding::span[text()='Players in box']"
             "/following::span[1]/span/text()"),
            ('offense_corners',
             ".//dl//span/preceding::span[text()='Corners']/following::span[1]"
             "/span/text()"),
            ('offense_free_kicks',
             ".//dl//span/preceding::span[text()='Free Kicks']"
             "/following::span[1]/span/text()"),
            ('build_up_play_speed',
             ".//dl//span/preceding::span[text()='Speed']/following::span[1]"
             "/span/text()"),
            ('build_up_play_dribbling',
             ".//dl//span/preceding::dd[text()='Dribbling']/span/span/text()"),
            ('build_up_play_passing',
             "(.//dl//span/preceding::span[text()='Passing']"
             "/following::span[1]/span/text())[1]"),
            ('build_up_play_positioning',
             "(.//dl//span/preceding::span[text()='Positioning'])[1]"
             "/following::span[1]/text()"),
            ('chance_creation_passing',
             "(.//dl//span/preceding::span[text()='Passing']"
             "/following::span[1]/span/text())[2]"),
            ('chance_creation_crossing',
             ".//dl//span/preceding::span[text()='Crossing']"
             "/following::span[1]/span/text()"),
            ('chance_creation_shooting',
             ".//dl//span/preceding::span[text()='Shooting']"
             "/following::span[1]/span/text()"),
            ('chance_creation_positioning',
             "(.//dl//span/preceding::span[text()='Positioning'])[2]"
             "/following::span[1]/text()"),
            ('defence_extra_pressure',
             ".//dl//span/preceding::span[text()='Pressure']"
             "/following::span[1]/span/text()"),
            ('defence_extra_aggression',
             ".//dl//span/preceding::span[text()='Aggression']"
             "/following::span[1]/span/text()"),
            ('defence_extra_team_width',
             "(.//dl//span/preceding::span[text()='Team Width']"
             "/following::span[1]/span/text())[2]"),
            ('defence_extra_defender_line',
             ".//span[text()='Defender Line']/following::span/text()"),
        )

        # PLAYERS, MEDIA, COMMUNITY (page-level loader)
        page_fields = (
            ('squad',
             "(.//table)[1]/tbody/tr//a[contains(@href, '/player/')]/@href"),
            ('on_loan',
             "(.//table)[2]/tbody/tr//a[contains(@href, '/player/')]/@href"),
            ('kits', ".//div[@class='column col-sm-5 text-center']//img/@src"),
            ('likes',
             "(//div[contains(@class, 'operation spacing')]/a/span[2]/span"
             "/text())[1]"),
            ('dislikes',
             "(//div[contains(@class, 'operation spacing')]/a/span[2]/span"
             "/text())[2]"),
        )

        for target, fields in (
            (loader, club_fields),
            (team_spacing_loader, team_info_fields),
            (loader, free_kick_fields),
            (team_spacing_loader, set_piece_fields),
            (loader, tactics_fields),
            (loader, page_fields),
        ):
            for field_name, xpath in fields:
                target.add_xpath(field_name, xpath)

        # Report the User-Agent through the spider logger instead of print():
        # .get() yields None rather than raising when the header is absent.
        self.logger.debug('User-Agent: %s',
                          response.request.headers.get('User-Agent'))

        self.logger.info(f'Parse function called on {response.url}')

        yield loader.load_item()
    def parse(self, response):
        """Load a ``SofifaItem`` from a player page and yield it.

        Besides building the item, every call maintains two crawler stats:
        ``pages_to_visit`` is set to ``len(self.urls)`` and ``page_counter``
        is incremented by one, after which the progress is logged.

        Several fields are deliberately fed the same expression (e.g. the
        ``meta`` text for full name, age, date of birth, height and weight);
        presumably the item's processors pick the relevant part for each
        field -- confirm against the item definition.
        """
        stats = self.crawler.stats
        stats.set_value('pages_to_visit', len(self.urls))

        loader = ItemLoader(item=SofifaItem(), response=response)
        # Nested loader scoped to the 'column col-4 text-center' div(s).
        col_4_loader = loader.nested_xpath(
            ".//div[@class='column col-4 text-center']")

        def add_all(target, fields):
            # Register every (field, xpath) pair on ``target``, in order.
            for field_name, xpath in fields:
                target.add_xpath(field_name, xpath)

        # Expression families that differ only by the label they look up.
        label_text = "(.//label[text()='{}']/following::text())[1]".format
        label_span = "(.//label[text()='{}']/following::span/text())[1]".format
        span_stat = "(.//span[../span='{}']/text())[1]".format
        li_stat = ".//li[contains(text(), '{}')]/span/text()".format
        position_rating = "(.//div[../div='{}']/following::text())[1]".format
        community = ("(.//div[contains(@class, 'operation spacing')]"
                     "/a/span[2]/span/text())[{}]").format

        # NOTE: datetime.utcnow() returns a naive datetime and is deprecated
        # since Python 3.12; kept so the stored value's type does not change.
        loader.add_value('last_modified', datetime.utcnow())

        # GENERAL PLAYER INFORMATION + GENERAL PLAYER STATS
        heading = ".//div[@class='info']/h1/text()"
        meta_text = ".//div[contains(@class, 'meta')]/text()[1]"
        add_all(loader, (
            ('id', heading),
            ('name', heading),
            ('full_name', meta_text),
            ('age', meta_text),
            ('dob', meta_text),
            ('height', meta_text),
            ('weight', meta_text),
            ('nationality', ".//div[contains(@class, 'meta')]/a/@title"),
            ('preferred_foot', label_text('Preferred Foot')),
            ('international_reputation',
             label_text('International Reputation')),
            ('weak_foot', label_text('Weak Foot')),
            ('skill_moves', label_text('Skill Moves')),
            ('work_rate', label_span('Work Rate')),
            ('body_type', label_span('Body Type')),
            ('real_face', label_span('Real Face')),
        ))

        # CLUB/TEAM INFORMATION
        # These two expressions used to start with '/following::', i.e. at
        # the document root, which has no following nodes, so 'value' and
        # 'wage' were never populated. They are now anchored at the nested
        # loader's context node with './'.
        add_all(col_4_loader, (
            ('value',
             "./following::text()[contains(., 'Value')]"
             "/following::span[1]/text()"),
            ('wage',
             "./following::text()[contains(., 'Wage')]"
             "/following::span[1]/text()"),
        ))
        # Club entries take the first match, national-team entries the last.
        add_all(loader, (
            ('release_clause', label_span('Release Clause')),
            ('club_name', "(.//ul[contains(@class, 'pl')]//a/text())[1]"),
            ('club_url', "(.//ul[contains(@class, 'pl')]//a/@href)[1]"),
            ('club_rating',
             ".//div[contains(@class, 'column col-5')][1]"
             "//li[2]/span[1]/text()"),
            ('club_position',
             "(.//label[text()='Position']/following::text()[1])[1]"),
            ('club_jersey_number',
             "(.//label[text()='Jersey Number']/following::text()[1])[1]"),
            ('club_join_date',
             ".//label[text()='Joined']/following::text()[1]"),
            ('loaned_from',
             ".//label[text()='Loaned From']/following::a[1]/text()"),
            ('club_contract_end_date',
             ".//label[text()='Contract Valid Until']/following::text()[1]"),
            ('team_name',
             "(.//ul[contains(@class, 'pl')]//a/text())[last()]"),
            ('team_rating',
             ".//div[contains(@class, 'column col-5')][last()]"
             "//li[2]/span[1]/text()"),
            ('team_position',
             "(.//label[text()='Position']/following::text()[1])[last()]"),
            ('team_jersey_number',
             "(.//label[text()='Jersey Number']"
             "/following::text()[1])[last()]"),
        ))

        # PLAYER GAME STATS
        loader.add_xpath(
            'overall_rating',
            "//div[@class='column col-4 text-center'][1]/span/text()")
        col_4_loader.add_xpath(
            'potential_rating',
            "//div[@class='column col-4 text-center'][2]/span/text()")
        positions_xpath = ".//div[contains(@class, 'meta')]/span/text()"
        loader.add_xpath('positions', positions_xpath)
        loader.add_xpath('unique_attributes',
                         ".//div[contains(@class, 'mt-2')]/a/text()")

        # Goalkeepers and outfield players have different card attributes;
        # all six are fed from the same inline <script> text.
        if 'GK' in response.xpath(positions_xpath).getall():
            card_fields = ('DIV', 'HAN', 'KIC', 'REF', 'SPD', 'POS')
        else:
            card_fields = ('PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY')
        card_script = ("(.//div[@class='wrapper']"
                       "//script)[1][contains(text(), 'var')]/text()")
        add_all(loader, [(name, card_script) for name in card_fields])

        # PLAYER DETAILED STATS
        add_all(loader, (
            ('crossing', span_stat('Crossing')),
            ('finishing', span_stat('Finishing')),
            ('heading_accuracy', span_stat('Heading Accuracy')),
            ('short_passing', span_stat('Short Passing')),
            ('volleys', span_stat('Volleys')),
            ('aggression', span_stat('Aggression')),
            ('interceptions', span_stat('Interceptions')),
            ('positioning', span_stat('Positioning')),
            ('vision', span_stat('Vision')),
            ('penalties', span_stat('Penalties')),
            ('composure', li_stat('Composure')),
            ('dribbling', span_stat('Dribbling')),
            ('curve', span_stat('Curve')),
            ('fk_accuracy', span_stat('FK Accuracy')),
            ('long_passing', span_stat('Long Passing')),
            ('ball_control', span_stat('Ball Control')),
            ('marking', span_stat('Marking')),
            ('standing_tackle', span_stat('Standing Tackle')),
            ('sliding_tackle', span_stat('Sliding Tackle')),
            ('acceleration', span_stat('Acceleration')),
            ('sprint_speed', span_stat('Sprint Speed')),
            ('agility', span_stat('Agility')),
            ('reactions', span_stat('Reactions')),
            ('balance', span_stat('Balance')),
            ('gk_diving', li_stat('GK Diving')),
            ('gk_handling', li_stat('GK Handling')),
            ('gk_kicking', li_stat('GK Kicking')),
            ('gk_positioning', li_stat('GK Positioning')),
            ('gk_reflexes', li_stat('GK Reflexes')),
            ('shot_power', span_stat('Shot Power')),
            ('jumping', span_stat('Jumping')),
            ('stamina', span_stat('Stamina')),
            ('strength', span_stat('Strength')),
            ('long_shots', span_stat('Long Shots')),
            ('traits',
             ".//h5[text()='Traits']/following-sibling::ul/li/span/text()"),
        ))

        # PLAYER REAL OVERALL RATING (POSITIONAL STATS)
        # The field name doubles as the position code shown on the page.
        position_codes = (
            'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
            'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM',
            'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB',
        )
        add_all(loader,
                [(code, position_rating(code)) for code in position_codes])

        # COMMUNITY INFORMATION + MEDIA
        add_all(loader, (
            ('followers', community(3)),
            ('likes', community(1)),
            ('dislikes', community(2)),
            ('face_img', ".//div/div/article/div/img//@data-src"),
            ('flag_img', ".//div[contains(@class, 'meta')]/a/img/@data-src"),
            ('club_logo_img', "(.//div/ul/li/figure/img/@data-src)[1]"),
            ('team_logo_img', "(.//div/ul/li/figure/img/@data-src)[last()]"),
        ))

        self.logger.info(f'Parse function called on {response.url}')

        # TODO: enable continued logging of page_counter after a pause/resume.

        # Count this page before reporting progress; logging first made the
        # first page show up as "None out of N".
        stats.inc_value(key='page_counter', count=1, start=0)
        self.logger.info(
            f"Currently on page {stats.get_value('page_counter')} out of "
            f"{stats.get_value('pages_to_visit')}"
        )

        # Report the User-Agent through the spider logger instead of print():
        # .get() yields None rather than raising when the header is absent.
        self.logger.debug('User-Agent: %s',
                          response.request.headers.get('User-Agent'))

        yield loader.load_item()
Esempio n. 8
0
<footer>
<a class="social" href="http://facebook.com/whatever">Like Us</a>
<a class="social" href="http://twitter.com/whatever">Follow Us</a>
<a class="email" href="mailto:[email protected]">Email Us</a>
</footer>
'''

# Variant 1 -- no nested loader: every expression repeats the //footer prefix.
# NOTE(review): no selector/response is passed to ItemLoader in either
# variant; presumably the HTML snippet above is meant to be supplied as the
# selector -- confirm before running this as-is.
loader = ItemLoader(item=Item())
# (placeholder in the example: fields located outside the footer go here)
loader.add_xpath('social', '//footer/a[@class = "social"]/@href')
loader.add_xpath('email', '//footer/a[@class = "email"]/@href')
loader.load_item()

# Variant 2 -- nested loader: //footer is selected once and the field
# expressions are written relative to it.
loader = ItemLoader(item=Item())
# (placeholder in the example: fields located outside the footer go here)
footer_loader = loader.nested_xpath('//footer')
footer_loader.add_xpath('social', 'a[@class = "social"]/@href')
footer_loader.add_xpath('email', 'a[@class = "email"]/@href')
# no need to call footer_loader.load_item()
loader.load_item()

6. Reusing and extending item loaders
from scrapy.loader.processors import MapCompose
from myproject.ItemLoaders import ProductLoader
def strip_dashes(x):
    """Return *x* with all leading and trailing ``-`` characters removed."""
    without_leading = x.lstrip('-')
    return without_leading.rstrip('-')

class SiteSpecificLoader(ProductLoader):
    """ProductLoader variant that strips dashes from ``name`` values."""

    # Input processor for the ``name`` field: MapCompose chains strip_dashes
    # with the input processor already declared on ProductLoader, so the
    # parent's processing is extended rather than replaced.
    name_in = MapCompose(strip_dashes, ProductLoader.name_in)

from scrapy.loader.processors import MapCompose
Esempio n. 9
0
    def parse(self, response):
        """Fill a ``NationalTeamDetailedStats`` item from a team page.

        Each item field is bound to one XPath, registered either on the
        page-level loader or on one of its two nested loaders. Once all
        selectors are registered the loaded item is yielded.
        """
        item_loader = ItemLoader(NationalTeamDetailedStats(), response=response)
        community_loader = item_loader.nested_xpath(".//div[@class='operation mt-2']/a")
        details_loader = item_loader.nested_xpath(".//div[@class='column col-6']")

        def register(target, field_xpaths):
            # Hand every (field, xpath) pair to the given loader, in order.
            for field, xpath in field_xpaths:
                target.add_xpath(field, xpath)

        # Selector templates shared by several fields; only the label varies.
        rating_root = "(.//div[@class='column col-4 text-center']"
        taker_href = ".//following::label[text()='{}']/following::a[1]/@href"
        meter_value = "(.//dl//span/preceding::span[text()='{}']/following::div/meter)[1]/@value"
        next_span_text = "(.//dl//span/preceding::span[text()='{}']/following::span)[1]/span/text()"

        # General club information ('id' and 'team_name' share one selector).
        register(item_loader, [
            ('id', ".//div[@class='info']/h1/text()"),
            ('team_name', ".//div[@class='info']/h1/text()"),
            ('team_logo', ".//div[@class='card card-border player fixed-width']/img/@data-src"),
            ('flag', ".//div[@class='meta']//a[last()-1]//img/@data-src"),
        ])

        # General team stats.
        register(item_loader, [
            ('overall',
             rating_root + "/preceding::text()[contains(.,'Overall')])[2]/following::span[1]/text()"),
            ('attack',
             rating_root + "/preceding::text()[contains(.,'Attack')])[2]/following::span[1]/text()"),
            ('midfield',
             rating_root + "/preceding::text()[contains(.,'Midfield')])[2]/following::span[1]/text()"),
            ('defence',
             rating_root + "/following::text()[contains(.,'Defence')])[1]/following::span[1]/text()"),
        ])

        # Detailed team stats, resolved relative to the 'column col-6' blocks.
        register(details_loader, [
            ('home_stadium',
             ".//following::label[contains(., 'Home Stadium')]/following::text()[1]"),
            ('rival_team',
             ".//following::label[contains(., 'Rival Team')]/following::a[1]/text()"),
            ('international_prestige',
             ".//following::label[contains(., 'International Prestige')]/following::span[1]/text()"),
            ('starting_xi_average_age',
             ".//following::label[contains(., 'Starting XI Average Age')]/following::text()[1]"),
            ('whole_team_average_age',
             ".//following::label[contains(., 'Whole Team Average Age')]/following::text()[1]"),
            ('captain',
             ".//following::label[contains(., 'Captain')]/following::a[1]/@href"),
        ] + [
            (field, taker_href.format(label))
            for field, label in (
                ('short_free_kick', 'Short Free Kick'),
                ('long_free_kick', 'Long Free Kick'),
                ('left_short_free_kick', 'Left Short Free Kick'),
                ('right_short_free_kick', 'Right Short Free Kick'),
                ('penalties', 'Penalties'),
                ('left_corner', 'Left Corner'),
                ('right_corner', 'Right Corner'),
            )
        ])
        register(item_loader, [
            ('starting_xi', ".//div[@class='field-player']/a/@href"),
        ])

        # Tactics.
        # NOTE(review): 'chance_creation_passing' uses the same 'Shooting'
        # selector as 'chance_creation_shooting' -- looks like a copy-paste
        # slip; confirm the intended label against the page markup.
        register(item_loader, [
            ('defence_defensive_style',
             ".//dl//span/preceding::dd[text()='Defensive Style']/span/span/text()"),
            ('defence_team_width', meter_value.format('Team Width')),
            ('defence_depth', meter_value.format('Depth')),
            ('offense_offensive_style',
             ".//dl//span/preceding::dd[text()='Offensive Style']/span/span/text()"),
            ('offense_width', meter_value.format('Width')),
            ('offense_players_in_box', meter_value.format('Players in box')),
            ('offense_corners', meter_value.format('Corners')),
            ('offense_free_kicks', meter_value.format('Free Kicks')),
            ('build_up_play_speed',
             ".//dl//span/preceding::span[text()='Speed']/following::span/text()"),
            ('build_up_play_dribbling',
             "(.//dl//span/preceding::dd[text()='Dribbling']//span)[1]/span/text()"),
            ('build_up_play_passing', next_span_text.format('Passing')),
            ('build_up_play_positioning',
             "(.//dl//span/preceding::span[text()='Positioning'])[1]/following::span[1]/text()"),
            ('chance_creation_passing', next_span_text.format('Shooting')),
            ('chance_creation_crossing', next_span_text.format('Crossing')),
            ('chance_creation_shooting', next_span_text.format('Shooting')),
            ('chance_creation_positioning',
             "(.//dl//span/preceding::span[text()='Positioning'])[2]/following::span[1]/text()"),
            ('defence_extra_pressure', next_span_text.format('Pressure')),
            ('defence_extra_aggression', next_span_text.format('Aggression')),
            ('defence_extra_team_width',
             "(.//span[text()='Team Width'])[2]/following::span[1]/span/text()"),
            ('defence_extra_defender_line',
             ".//span[text()='Defender Line']/following::span/text()"),
        ])

        # Players (first table: squad, second table: on loan) and media.
        register(item_loader, [
            ('squad', "(.//table)[1]/tbody/tr//a[contains(@href, '/player/')]/@href"),
            ('on_loan', "(.//table)[2]/tbody/tr//a[contains(@href, '/player/')]/@href"),
            ('kits', ".//div[@class='column col-sm-5 text-center']//img/@src"),
        ])

        # Community counters, resolved relative to the 'operation mt-2' links.
        register(community_loader, [
            ('likes', "text()[contains(.,'Like')]/following::span[1]/text()"),
            ('dislikes', "text()[contains(.,'Dislike')]/following::span[1]/text()"),
        ])

        print(response.request.headers['User-Agent'])
        self.logger.info(f'Parse function called on {response.url}')

        yield item_loader.load_item()
Esempio n. 10
0
    def parse_property_page(self, response):
        """Scrape one property detail page into a ``TruliaItem``.

        The page is read section by section (overview, local information,
        price history, taxes, comparable sales, price trends, local reviews,
        similar/new listings, builder homes); each section is collected
        separately and then attached to the final item.

        Args:
            response: the Scrapy response for the property page.

        Returns:
            The populated ``TruliaItem``. Sections that are absent from the
            page end up empty rather than raising.
        """
        # --- overview -----------------------------------------------------
        overview_loader = ItemLoader(item=overview_item(), response=response)
        overview_loader.add_value('url', response.url)
        overview_node = overview_loader.nested_xpath(
            '//div[@data-testid="home-details-summary-container"]')
        overview_node.add_xpath(
            'address',
            './/span[@data-testid="home-details-summary-headline"]/text()')
        overview_node.add_xpath(
            'city_state',
            './/span[@data-testid="home-details-summary-city-state"]/text()')
        overview_node.add_xpath(
            'price',
            './/*[@data-testid="on-market-price-details"]//text()',
            re=r'\$([\d,]+)')
        # The summary list mixes area / beds / baths in one <li> run; each
        # field picks its own entry out of the same text nodes by regex.
        overview_node.add_xpath('area',
                                xpath='.//li//text()',
                                re=r'^([\d,]+)\s?sqft$')
        overview_node.add_xpath('bedrooms',
                                xpath='.//li//text()',
                                re=r'(\d+\.?\d?) (?:Beds|Bed|beds|bed)$')
        overview_node.add_xpath('bathrooms',
                                xpath='.//li//text()',
                                re=r'(\d+\.?\d?) (?:Baths|Bath|baths|bath)$')

        details = overview_loader.nested_xpath(
            '//div[@data-testid="features-container"]')
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        details.add_xpath('year_built',
                          xpath='.//li//text()',
                          re=r'Built in (\d+)')
        details.add_xpath('lot_size',
                          xpath='.//li//text()',
                          re=r'Lot Size: ([\d,.]+) (?:acres|sqft)$')
        details.add_xpath('lot_size_units',
                          xpath='.//li//text()',
                          re=r'Lot Size: [\d,.]+ (acres|sqft)$')
        details.add_xpath('price_per_square_foot',
                          xpath='.//li//text()',
                          re=r'\$([\d,.]+)/sqft$')
        details.add_xpath('days_on_Trulia',
                          xpath='.//li//text()',
                          re=r'([\d,]+)\+? Days on Trulia$')
        overview_dict = overview_loader.load_item()

        # --- local information --------------------------------------------
        local_info_list = response.xpath(
            '(//*[div="Local Information"]/parent::div)[2]/following-sibling::div/div/div//text()'
        ).extract()
        local_dict_values = '\n'.join(local_info_list)

        # --- price history --------------------------------------------------
        price_loader = ItemLoader(item=price_item(), response=response)
        table_xpath = '//div[contains(text(), "Price History for")]/../../following-sibling::table'
        price_loader.add_xpath('dates', table_xpath + '//tr[1]/td[1]//text()')
        price_loader.add_xpath('prices', table_xpath + '//tr[1]/td[2]//text()')
        price_loader.add_xpath('events', table_xpath + '//tr[1]/td[3]//text()')
        price_dict = price_loader.load_item()

        # --- tax info -------------------------------------------------------
        tax_loader = ItemLoader(item=taxes_item(), response=response)
        table_xpath = '//*[div="Property Taxes and Assessment"]/parent::div/following-sibling::table'
        tax_loader.add_xpath('property_tax_assessment_year',
                             table_xpath + '//tr[1]/td[1]//text()')
        tax_loader.add_xpath('property_tax',
                             table_xpath + '//tr[2]/td[1]//text()')
        tax_loader.add_xpath('property_tax_assessment_land',
                             table_xpath + '//tr[4]/td[1]//text()')
        tax_loader.add_xpath('property_tax_assessment_improvements',
                             table_xpath + '//tr[5]/td[1]//text()')
        tax_loader.add_xpath('property_tax_assessment_total',
                             table_xpath + '//tr[6]/td[1]//text()')
        tax_dict = tax_loader.load_item()

        # --- comparable sales -----------------------------------------------
        # Some pages have no "Comparable Sales" module; everything below then
        # degrades to empty lists.
        comparable_path = '//div[contains(text(), "Comparable Sales")]/../../following-sibling::div[3]'
        header = response.xpath(comparable_path + '//th//text()').extract()
        header.append('url')
        num_tr = len(response.xpath(comparable_path + '//tbody/tr'))
        rows = [
            response.xpath(
                (comparable_path + '//tbody/tr[{:d}]//text()').format(i)
            ).extract()
            for i in range(1, num_tr + 1)
        ]
        urls = response.xpath(comparable_path + '//tbody//a/@href').extract()
        urls = [get_rel_url(response.url, url) for url in urls]
        # zip() pairs rows with links positionally and simply stops at the
        # shorter side, so a row without a link no longer raises IndexError.
        for row, url in zip(rows, urls):
            row.append(url)
        comparable_list = [list(zip(header, row)) for row in rows]

        # --- price trends ---------------------------------------------------
        trends_loader = ItemLoader(item=price_trends_item(), response=response)
        price_trend_node = trends_loader.nested_xpath(
            '//*[div="Price Trends"]/parent::div/following-sibling::div[1]')
        price_trend_node.add_xpath('item1', './*[3]//text()')
        price_trend_node.add_xpath('item2', './*[4]//text()')
        price_trend_node.add_xpath('item3', './*[5]//text()')
        price_trends_dict = trends_loader.load_item()
        price_trends = '\n'.join(price_trends_dict.values())

        # --- what locals say ------------------------------------------------
        review_count = int(float(response.xpath(
            'count(//div[@data-testid="wls-responisve-slider"]/div/div/child::node())'
        ).extract()[0]))
        reviews = '\n'.join(
            ' '.join(
                response.xpath(
                    '//div[@data-testid="wls-responisve-slider"]/div/div/*[{:d}]//text()'
                    .format(i)).extract())
            for i in range(1, 1 + review_count))

        common_count = int(float(response.xpath(
            'count(//div[@data-testid="what-locals-say"]/child::node())'
        ).extract()[0]))
        # NOTE(review): unlike the other loops this one stops at
        # common_count - 1, so the last child is not read here; the slider
        # text is appended separately below. Presumably intentional -- confirm.
        total_reviews = [
            ' '.join(
                response.xpath(
                    '//div[@data-testid="what-locals-say"]/*[{:d}]//text()'
                    .format(i)).extract())
            for i in range(1, common_count)
        ]
        total_reviews.append(reviews)

        # --- similar homes / new listings -----------------------------------
        base_xpath = '//*[div="Similar Homes You May Like"]/parent::div/following-sibling::div[1]/div/div'
        similar_house = self.get_similar_new_part(base_xpath, response)

        base_xpath = '//div[contains(text(), "New Listings near")]/../../following-sibling::div[1]/div/div'
        new_link_house = self.get_similar_new_part(base_xpath, response)

        # --- builder homes --------------------------------------------------
        def table_rows(test_id):
            # Cell texts of every row of the table carrying this data-testid.
            # Each table is sized by its OWN row count, so the planned-homes
            # table is not truncated or padded by the quick-move-in table.
            table = '//table[@data-testid="{}"]'.format(test_id)
            row_count = int(float(
                response.xpath('count({}//tr)'.format(table)).extract()[0]))
            return [
                response.xpath(
                    '{}//tr[{:d}]/td//text()'.format(table, i)).extract()
                for i in range(1, 1 + row_count)
            ]

        builder_tables = table_rows('quick-movein-builder-homes-table')
        builder_plans = table_rows('planned-builder-homes-table')

        new_homes = {}
        if builder_tables:
            new_homes['quick-movein-builder'] = builder_tables
        if builder_plans:
            new_homes['planned-builder'] = builder_plans

        # --- top-level item -------------------------------------------------
        item_loader = ItemLoader(item=TruliaItem(), response=response)
        item_loader.add_xpath(
            'home_detail',
            '//div[contains(text(), "Home Details for")]/../../following-sibling::ul/li//text()'
        )
        item_loader.add_xpath(
            'description',
            '(//*[div="Description"]/parent::div)[2]/following-sibling::div//text()'
        )
        item_loader.add_xpath(
            'community_description',
            '//div[@data-testid="community-description-text-description-text"]//text()'
        )
        item_loader.add_xpath(
            'office_hours',
            '//div[@data-testid="office-hours-container"]//text()')
        item_loader.add_xpath(
            'open_house',
            '//div[@data-testid="open-house-container"]//text()')
        item = item_loader.load_item()

        # Price history may be missing (no 'dates'/'prices'/'events' keys) or
        # malformed (unparsable date or amount); fall back to an empty list
        # in those cases only, instead of swallowing every exception.
        try:
            dates = [
                datetime.datetime.strptime(date, '%m/%d/%Y')
                for date in price_dict['dates']
            ]
            prices = [
                int(price.lstrip('$').replace(',', ''))
                for price in price_dict['prices']
            ]
            price_history = sorted(
                zip(dates, prices, price_dict['events']),
                key=lambda entry: entry[0])
        except (KeyError, TypeError, ValueError):
            price_history = []
        item['price_history'] = price_history

        item['overview'] = overview_dict
        # property taxes may not exist on the page; the loaded item is then empty
        item['property_taxes'] = tax_dict
        item['local_information'] = local_dict_values
        item['price_trends'] = price_trends
        item['comparable_sales'] = comparable_list
        item['local_commons'] = total_reviews
        item['similar_homes'] = similar_house
        item['new_listing'] = new_link_house
        item['new_homes'] = new_homes
        return item
Esempio n. 11
0
	<footer>
		<a class="social" href="http://facebook.com/whatever">Like Us</a>
		<a class="social" href="http://twitter.com/whatever">Follow Us</a>
		<a class="email" href="mailto:[email protected]">Email Us</a>
	</footer>

	not nested
		loader = ItemLoader(item=Item())
		# load stuff not in the footer
		loader.add_xpath('social', '//footer/a[@class = "social"]/@href')
		loader.add_xpath('email', '//footer/a[@class = "email"]/@href')
		loader.load_item()
	nested
		loader = ItemLoader(item=Item())
		# load stuff not in the footer
		footer_loader = loader.nested_xpath('//footer')
		footer_loader.add_xpath('social', 'a[@class = "social"]/@href')
		footer_loader.add_xpath('email', 'a[@class = "email"]/@href')
		# no need to call footer_loader.load_item()
		loader.load_item()
9.Reusing and extending Item Loaders
	e.g. strip the surrounding dashes from a name such as ---Plasma TV---
	from scrapy.loader.processors import MapCompose
	from myproject.ItemLoaders import ProductLoader
	def strip_dashes(x):
		return x.strip('-')
	class SiteSpecificLoader(ProductLoader):
    	name_in = MapCompose(strip_dashes, ProductLoader.name_in)
10.Available built-in processors
	class scrapy.loader.processors.Identity
		The simplest processor, which doesn’t do anything.
Esempio n. 12
0
    def parse_restaurant(self, response):
        """Build a ``RestaurantItem`` from a business page.

        Yields a follow-up request for the menu page (carrying the loaded
        item in ``meta``) when the page links to a menu; otherwise yields
        the loaded item directly.
        """
        restaurant = RestaurantItem(source=self.name,
                                    language='en',
                                    last_update=int(time.time()))
        loader = ItemLoader(item=restaurant, response=response)

        # Processor configuration for the page-level loader: values are
        # stripped on input and the first value is kept on output unless a
        # field-specific processor says otherwise.
        loader.default_input_processor = Compose(
            MapCompose(lambda text: text.strip() or None))
        loader.default_output_processor = TakeFirst()
        loader.address_out = Join(' - ')
        loader.number_of_reviews_in = MapCompose(int)

        # The identifier is the unquoted last path segment of the cleaned URL.
        page_url = url_query_cleaner(response.url)
        loader.add_value('url', page_url)
        last_segment = urlparse.urlparse(page_url).path.split('/')[-1]
        loader.add_value('id', urllib.unquote(last_segment))

        header = '//div[contains(@class, "biz-page-header")]'
        loader.add_xpath(
            'name',
            header + '//h1[contains(@class, "biz-page-title")]/text()')
        loader.add_xpath(
            'address',
            "//div[contains(@class, 'map-box-address')]//text()")
        # The map image URL is passed through parse_qs and its 'center'
        # entry is used as the geolocation value.
        loader.add_xpath(
            'geolocation',
            "//div[@class='mapbox-map']//img/@src",
            MapCompose(lambda src: parse_qs(src).get('center')))
        loader.add_xpath(
            'phone_number',
            "//div[@class='mapbox-text']//span[@class='biz-phone']/text()")

        # NOTE(review): the *_out processors below are set on the nested
        # loaders, while load_item() is called on the parent loader --
        # confirm they are actually picked up at output time.
        hours_loader = loader.nested_xpath(
            "//div[contains(@class, 'biz-hours')]//tr/th[@scope]/..")
        hours_loader.opening_hours_in = Compose(group_items(3))
        hours_loader.opening_hours_out = Identity()
        hours_loader.add_xpath(
            'opening_hours',
            './th/text() | ./td/span[@class="nowrap"]/text()')

        loader.add_xpath(
            'rating',
            header + '//div[contains(@class, "biz-rating")]/div[contains(@class, "i-stars")]/@title',
            re=r'(?:\D*)(\d+(?:\.\d+)?)')
        loader.add_xpath(
            'number_of_reviews',
            header + '//span[contains(@class, "review-count")]/text()',
            re=r'^\D*(\d+)')

        info_loader = loader.nested_xpath(
            '//div[contains(@class, "sidebar")]//div[@class="ywidget"]/ul[@class="ylist"]/li/div[contains(@class, "short-def-list")]/dl'
        )
        info_loader.info_in = Compose(MapCompose(unicode.strip),
                                      group_items(2))
        info_loader.info_out = Identity()
        info_loader.add_xpath(
            'info', './dt[@class="attribute-key"]/text() | ./dd/text()')

        item = loader.load_item()

        menu_links = response.xpath(
            '//h3[@class="menu-preview-heading"]/a/@href').extract()
        menu_url = TakeFirst()(menu_links)

        if not menu_url:
            # No menu to follow: the item is complete as it stands.
            yield item
            return

        yield scrapy.Request(response.urljoin(menu_url),
                             callback=self.parse_menu,
                             meta={'item': item})