def parseDetail(self, response):
        """Yield ``TripadvisorItem`` records scraped from one detail page.

        The heading, address, city, phone number, review count and rating are
        read once from ``response`` and copied into every item. When the page
        contains ``div`` nodes whose class is exactly ``ui_link``, one item is
        yielded per node with ``is_bookable`` set to True and that node's
        title and "from" price. Otherwise a single item is yielded with
        ``is_bookable`` False and empty ``activity``/``price``. ``email`` is
        always the empty string.
        """
        clean = self.validateStr

        def first_text(scope, query):
            # Cleaned text of the first node matching `query` under `scope`.
            return clean(scope.xpath(query).extract_first())

        def joined_text(query):
            # Clean every fragment, follow each with a single space, then
            # clean the concatenated result once more.
            fragments = response.xpath(query).extract()
            return clean(
                ''.join(clean(fragment) + ' ' for fragment in fragments))

        page_url = response.request.url
        heading = first_text(response, "//h1[@id='HEADING']/text()")
        full_address = joined_text(
            "//div[@class='detail_section address']/span/text()")
        locality = self.validateCityUrl(
            response.xpath("//span[@class='locality']/text()").extract_first())
        phones = joined_text("//div[@class='blEntry phone']/span/text()")
        review_count = first_text(response, "//span[@property='count']/text()")
        rating = first_text(
            response, "//span[@class='overallRating']/text()")

        offers = response.xpath("//div[@class='ui_link']")
        bookable = bool(offers)

        def build_item(activity, price):
            # Keys are assigned in a fixed order so the stored field order
            # stays the same for every yielded item.
            record = TripadvisorItem()
            record['url'] = page_url
            record['thing_to_do'] = heading
            record['address'] = full_address
            record['city'] = locality
            record['phone_number'] = phones
            record['email'] = ''
            record['num_review'] = review_count
            record['num_rating'] = rating
            record['is_bookable'] = bookable
            record['activity'] = activity
            record['price'] = price
            return record

        if not offers:
            yield build_item('', '')
            return

        # NOTE(review): the original carried a "show more button actions"
        # placeholder after the yield below; nothing is implemented for it.
        for offer in offers:
            yield build_item(
                first_text(
                    offer,
                    ".//div[@class='MultiTourOffer__title_container--3SBSu']//span[@class='MultiTourOffer__title--4PROg']/text()"
                ),
                first_text(
                    offer,
                    ".//div[@class='MultiTourOffer__price_container--1yJni']//span[@class='fromPrice']/text()"
                ),
            )
Example #2
0
    def parse_each_attraction(self, response):
        """Scrape a single attraction page into one ``TripadvisorItem``.

        Every field is located with XPath on ``response``. Scalar fields
        (``TotalReviews``, ``PlaceCategory``, ``PhoneNumber``, ``Ranking``,
        ``LengthOfVisit``, ``Fee``, ``AverageRating``, ``Email``) fall back to
        ``""`` when the node is missing or its text has fewer parts than the
        parsing expects, so an unusual page yields a partially filled item
        instead of raising ``IndexError``. ``StreetAddress``,
        ``AddressLocality``, ``AddressRegion``, ``PostCode`` and
        ``Description`` are stored as the raw lists of matched text nodes.

        Args:
            response: The downloaded attraction page; must provide
                ``.xpath()`` and ``.url``.

        Yields:
            The populated ``TripadvisorItem`` (exactly one per call).
        """
        def texts(query):
            # All strings matched by `query` on the page, in document order.
            return response.xpath(query).extract()

        def node_text(query):
            # Stripped string value (all descendant text) of the first node
            # matching `query`, or "" when nothing matches. Evaluating
            # string(.) on the node itself avoids serialising it and
            # re-parsing the markup just to read its text.
            matches = response.xpath(query)
            if not matches:
                return ""
            value = matches[0].xpath("string(.)").extract_first()
            return (value or "").strip()

        trip_item = TripadvisorItem()
        trip_item['PlaceTitle'] = "".join(
            texts("//h1[@id='HEADING']/text()")).strip()
        trip_item['PlaceURL'] = response.url

        # Keep only what precedes the first "R" of the link text
        # (presumably the count in front of "Reviews" -- verify on a page).
        more_links = texts('//a[@class="more"]/text()')
        trip_item['TotalReviews'] = (
            more_links[0].split("R")[0] if more_links else "")

        trip_item['PlaceCategory'] = node_text("//div[@class='separator']")

        # The extended-address query needs the leading "//": without it the
        # path is evaluated against the document root and never matches.
        trip_item['StreetAddress'] = (
            texts('//span[@class="street-address"]/text()')
            + texts('//span[@class="extended-address"]/text()'))
        trip_item['AddressLocality'] = texts(
            '//span[@class="locality"]/span[@property="addressLocality"]/text()'
        )
        trip_item['AddressRegion'] = texts(
            '//span[@class="locality"]/span[@property="addressRegion"]/text()'
        )
        trip_item['PostCode'] = texts(
            '//span[@class="locality"]/span[@property="postalCode"]/text()'
        )

        # The number is the segment after the first ":" of the div's text.
        # Always set the key so it behaves like the other scalar fields.
        phone_parts = []
        phone_texts = texts('//div[@class="phoneNumber"]/text()')
        if phone_texts:
            phone_parts = phone_texts[0].split(":")
        trip_item['PhoneNumber'] = (
            phone_parts[1].strip() if len(phone_parts) > 1 else "")

        trip_item['Ranking'] = node_text("//div[@class='slim_ranking']")

        # Visit length is read third-from-last and the fee last among the
        # detail texts; guard each index separately.
        details = texts(
            "//div[@class='details_wrapper']/div[@class='detail']/text()"
        )
        trip_item['LengthOfVisit'] = (
            details[-3].strip() if len(details) >= 3 else "")
        trip_item['Fee'] = details[-1].strip() if details else ""

        trip_item['Description'] = texts(
            "//div[@class='listing_details']/p/text()")

        # The rating is the third matched text node.
        ratings = texts("//div[@class='valueCount fr part']/text()")
        trip_item['AverageRating'] = ratings[2] if len(ratings) > 2 else ""

        # The address is the 7th comma-separated argument of the onclick
        # handler, with surrounding "/" and "'" characters removed.
        onclick_parts = []
        onclick_values = texts("//div[@class='taLnk fl']/@onclick")
        if onclick_values:
            onclick_parts = onclick_values[0].split(",")
        trip_item['Email'] = (
            onclick_parts[6].strip("/'") if len(onclick_parts) > 6 else "")

        days = [
            d.strip() for d in texts("//span[@class='days']/text()") if d
        ]
        hours = [
            h.strip() for h in texts("//span[@class='hours']/text()") if h
        ]
        # Pair labels positionally; the dict keeps one entry per distinct
        # day label (a repeated label keeps its last hours value).
        days_hours = dict(zip(days, hours))
        trip_item['OpeningHours'] = "".join(
            day + " " + hour + " , " for day, hour in days_hours.items())

        yield trip_item