Ejemplo n.º 1
0
 def parse_hotel(
     self, response: scrapy.http.response.html.HtmlResponse, name, text_eng, text_hr
 ):
     """Gather a hotel description from the esky.hr page and its esky.com twin.

     The method is its own callback and runs in two phases, told apart by
     ``text_hr``:

     * ``text_hr is None`` -- ``response`` is the first page. Its description
       is extracted and a request for the same path under
       ``esky.com/hotels`` is returned, carrying the extracted text along
       as ``text_hr`` in ``cb_kwargs``.
     * otherwise -- ``response`` is the follow-up page. Its description is
       taken as the English text and compared with ``text_hr``.

     Args:
         response: Page to read the ``dd.hotel-description`` text from.
         name: Hotel name, passed through unchanged into the result.
         text_eng: Ignored on input; the value is always re-extracted from
             ``response`` in the second phase.
         text_hr: ``None`` on the first call, the first page's description
             on the second.

     Returns:
         A ``scrapy.Request`` (first phase), a dict with ``name``,
         ``text_eng`` and ``text_hr`` (second phase, texts differ), or
         ``None`` when both descriptions are identical.
     """
     description = "".join(
         response.xpath("//dd[@class='hotel-description']//text()").extract()
     )

     if text_hr is None:
         # Use the public ``url`` attribute rather than the private
         # ``_get_url()`` accessor; both return the same string.
         english_url = response.url.replace("esky.hr/hoteli", "esky.com/hotels")
         return scrapy.Request(
             english_url,
             callback=self.parse_hotel,
             cb_kwargs={"name": name, "text_eng": None, "text_hr": description},
         )

     # Identical texts are dropped (presumably the page was never translated).
     if text_hr == description:
         return None

     return {
         "name": name,
         "text_eng": description.strip(),
         "text_hr": text_hr.strip(),
     }
    def parse_cinema(self,
                     response: scrapy.http.response.html.HtmlResponse) -> dict:
        """Build a ``Cinema`` from a detail page and yield it as a dict.

        For every ``div.table-responsive-wrapper`` on the page, the text of
        all ``td`` cells of all its rows is collected into one flat list;
        these lists, together with the titles from ``self.get_titles``, are
        handed to ``self.create_shows``. City and country are fixed to
        ``'Berlin'`` / ``'Germany'``.

        Note: despite the ``-> dict`` annotation this is a generator that
        yields exactly one item, ``cinema.to_dict()``.
        """
        titles = self.get_titles(response)

        # One flat list of cell texts per showtime table, in page order.
        showtime_tables = response.xpath(
            '//div[@class="table-responsive-wrapper"]')
        movies_times = [
            [
                cell_text
                for row in table.xpath('.//tr')
                for cell_text in row.xpath('./td/text()').getall()
            ]
            for table in showtime_tables
        ]

        name = self.get_name(response)
        description = self.get_description(response)
        address = Address(
            street=self.get_street(response),
            postal_code=self.get_postal_code(response),
            district=self.get_district(response),
            city='Berlin',
            country='Germany',
        )
        contact = Contact(telephone=self.get_telephone(response))
        prices = self.get_prices(response)
        shows = self.create_shows(titles, movies_times)

        cinema = Cinema(
            name=name,
            description=description,
            address=address,
            contact=contact,
            prices=prices,
            shows=shows,
        )

        self.logger.info(f'Scraped cinema: {cinema.name}')

        yield cinema.to_dict()
Ejemplo n.º 3
0
    def parse(self, response: scrapy.http.response.html.HtmlResponse) -> typing.Iterator[dict]:
        """Yield one dict per ``itemscope`` element, then follow "next" links.

        Each dict has the keys ``author`` and ``text`` (first matching text
        node, or ``None`` when nothing matches) and ``tags`` (list of all
        matching text nodes). Afterwards a request is yielded for every
        ``li.next > a`` href, with this method as the callback.
        """
        # TODO: ``.get()`` silently returns None for zero matches and the
        # first hit for several matches; the original author wanted an
        # exception in both cases but none is raised here.
        for item in response.xpath('//*[@itemscope]'):
            author = item.xpath('.//*[@itemprop="author"]/text()').get()
            text = item.xpath('.//*[@itemprop="text"]/text()').get()
            tags = item.xpath('.//*[@class="tag"]/text()').getall()
            yield {'author': author, 'text': text, 'tags': tags}

        # Pagination: queue the next page(s) with the same callback.
        next_hrefs = response.xpath('//li[@class="next"]/a/@href').getall()
        for href in next_hrefs:
            absolute_url = response.urljoin(href)
            yield scrapy.Request(absolute_url, callback=self.parse)
Ejemplo n.º 4
0
 def parse_eng(self, response: scrapy.http.response.html.HtmlResponse,
               text_hr: str):
     """Append a differing description pair to the two output text files.

     The ``dd.hotel-description`` text of ``response`` is joined into
     ``text_eng``. If it differs from ``text_hr``, both texts have their
     tab and newline characters removed and are appended to
     ``c_output_hr.txt`` and ``c_output_en.txt`` respectively (UTF-8).
     Nothing is written when the two texts are equal.

     NOTE(review): no separator is written after a record, so consecutive
     records run together in each file -- confirm this is intended.
     """
     fragments = response.xpath(
         "//dd[@class='hotel-description']//text()").extract()
     text_eng = "".join(fragments)
     if text_hr == text_eng:
         return

     outputs = (
         ("c_output_hr.txt", text_hr),
         ("c_output_en.txt", text_eng),
     )
     for path, text in outputs:
         with open(path, "a", encoding="utf-8") as out:
             out.write(text.replace("\t", "").replace("\n", ""))
Ejemplo n.º 5
0
 def parse(self, response: scrapy.http.response.html.HtmlResponse):
     """Extract the page's hotel description and request its esky.com twin.

     Args:
         response: Page whose ``dd.hotel-description`` text nodes are joined
             into ``text_hr``.

     Returns:
         ``None`` when the page has no description text; otherwise a
         ``scrapy.Request`` for the same URL with ``esky.hr/hoteli``
         replaced by ``esky.com/hotels``, handled by ``self.parse_eng`` and
         carrying ``text_hr`` in ``cb_kwargs``.
     """
     text_hr = "".join(
         response.xpath(
             "//dd[@class='hotel-description']//text()").extract())
     if not text_hr:
         # Nothing to pair with an English text, so skip the second request.
         return None

     # Use the public ``url`` attribute rather than the private
     # ``_get_url()`` accessor; both return the same string.
     english_url = response.url.replace("esky.hr/hoteli", "esky.com/hotels")
     return scrapy.Request(
         english_url,
         callback=self.parse_eng,
         cb_kwargs={"text_hr": text_hr},
     )
Ejemplo n.º 6
0
    def parse(self, response: scrapy.http.response.html.HtmlResponse):
        """Yield a follow-up request for each cinema listed in the selector.

        Reads the ``value`` attribute of every ``option`` under
        ``div.controls > select``; values accepted by ``is_positiveinteger``
        are appended to the kinodetail base URL. Each resulting URL is
        logged and followed with ``self.parse_cinema`` as the callback.
        """
        base_kinodetail_url = 'https://www.berlin.de/kino/_bin/kinodetail.php/'
        options = response.xpath('//div[@class="controls"]/select/option')

        # Collect all detail URLs first, then emit the requests.
        cinema_urls = []
        for option in options:
            value = option.attrib['value']
            if is_positiveinteger(value):
                cinema_urls.append(base_kinodetail_url + value)

        for cinema_url in cinema_urls:
            self.logger.info(f'Scraping: {cinema_url}')
            yield response.follow(cinema_url, callback=self.parse_cinema)
 def get_telephone(self,
                   response: scrapy.http.response.html.HtmlResponse) -> str:
     """Return the text of the span following the "Telefon" label span.

     The first matching text node is returned; the result is ``None`` when
     the page has no such span, despite the ``str`` annotation.
     """
     telephone_xpath = (
         '//span[contains(text(), "Telefon")]/following-sibling::span/text()'
     )
     matches = response.xpath(telephone_xpath)
     return matches.get()
 def get_district(self,
                  response: scrapy.http.response.html.HtmlResponse) -> str:
     """Return the text of the first ``span.locality`` element.

     The result is ``None`` when no such element exists, despite the
     ``str`` annotation.
     """
     locality_nodes = response.xpath('//span[@class="locality"]/text()')
     return locality_nodes.get()
 def get_postal_code(
         self, response: scrapy.http.response.html.HtmlResponse) -> str:
     """Return the text of the first ``span.postal-code`` element.

     The result is ``None`` when no such element exists, despite the
     ``str`` annotation.
     """
     postal_code_nodes = response.xpath('//span[@class="postal-code"]/text()')
     return postal_code_nodes.get()
 def get_street(self,
                response: scrapy.http.response.html.HtmlResponse) -> str:
     """Return the text of the first ``span.street-address`` element.

     The result is ``None`` when no such element exists, despite the
     ``str`` annotation.
     """
     street_nodes = response.xpath('//span[@class="street-address"]/text()')
     return street_nodes.get()
 def get_description(
         self, response: scrapy.http.response.html.HtmlResponse) -> str:
     """Return the first text node of a ``p`` inside ``div.kinodetail.echo``.

     The class attribute must be exactly ``"kinodetail echo"``. The result
     is ``None`` when nothing matches, despite the ``str`` annotation.
     """
     paragraph_nodes = response.xpath(
         '//div[@class="kinodetail echo"]/p/text()')
     return paragraph_nodes.get()
 def get_prices(
         self,
         response: scrapy.http.response.html.HtmlResponse) -> List[str]:
     """Return all text nodes from the ``infoblock oeffnungszeiten`` section.

     Collects the text of every element that is a grandchild of the section
     via a ``div`` child. The list is empty when nothing matches.
     """
     prices_xpath = (
         '//section[@class="infoblock oeffnungszeiten"]/div/*/text()'
     )
     price_nodes = response.xpath(prices_xpath)
     return price_nodes.getall()
Ejemplo n.º 13
0
    def analyze_website(self, website_url: str,
                        response: scrapy.http.response.html.HtmlResponse):
        """Check a page against every applicable config in ``self.configs``.

        A config applies when ``config.domain`` occurs in ``website_url``
        (tested as a regular expression). For each applicable config the
        optional patterns are evaluated with ``re.match`` (anchored at the
        start of the string):

        * ``pageWithGalleryUrlMatchesRegexp`` / ``galleryUrlMatchesRegexp``
          against the URL -- a hit in any config sets the respective flag.
        * ``pageWithGalleryUrlHasToMatchRegexp`` against the URL -- gates
          whether image/anchor hits may count as a match.
        * ``pageWithGalleryContainsImgSrcRegexp`` against every ``img/@src``
          and ``pageWithGalleryContainsAnchorHrefRegexp`` against every
          ``a/@href`` -- hits are counted across all configs.

        Args:
            website_url: URL of the page being analysed.
            response: Page whose ``img`` sources and ``a`` hrefs are scanned.

        Returns:
            A ``PageAnalysisResults`` with the keys
            ``number_of_images_by_a_href``, ``number_of_images_by_img_src``,
            ``has_match``, ``url_matched_for_gallery_page`` and
            ``url_matched_for_page_with_gallery``.
        """
        gallery_page_matched_by_url = False
        page_with_gallery_matched_by_url = False
        can_be_page_with_gallery = True

        number_of_imgs_matched_by_a_href = 0
        number_of_images_by_img_src = 0

        img_src_values = response.xpath("//img/@src").extract()
        a_href_values = response.xpath("//a/@href").extract()

        for config in self.configs:
            # NOTE(review): the domain is interpolated unescaped, so "." in
            # it matches any character -- confirm whether that is intended.
            if not re.match(f'.*{config.domain}.*', website_url):
                continue

            url_pattern = config.pageWithGalleryUrlMatchesRegexp
            if url_pattern and re.match(url_pattern, website_url):
                page_with_gallery_matched_by_url = True

            url_pattern = config.galleryUrlMatchesRegexp
            if url_pattern and re.match(url_pattern, website_url):
                gallery_page_matched_by_url = True

            url_pattern = config.pageWithGalleryUrlHasToMatchRegexp
            if url_pattern:
                # NOTE(review): unlike the flags above this is overwritten,
                # so the last applicable config wins -- confirm intent.
                can_be_page_with_gallery = (
                    re.match(url_pattern, website_url) is not None)

            src_pattern = config.pageWithGalleryContainsImgSrcRegexp
            if src_pattern:
                number_of_images_by_img_src += sum(
                    1 for src in img_src_values
                    if re.match(src_pattern, src))

            href_pattern = config.pageWithGalleryContainsAnchorHrefRegexp
            if href_pattern:
                number_of_imgs_matched_by_a_href += sum(
                    1 for href in a_href_values
                    if re.match(href_pattern, href))

        # Image/anchor hits only count when the "has to match" gate allows it.
        has_match_for_page_with_gallery_by_imgs = (
            (number_of_imgs_matched_by_a_href >= 1
             or number_of_images_by_img_src >= 1)
            and can_be_page_with_gallery
        )

        has_match = (
            page_with_gallery_matched_by_url
            or gallery_page_matched_by_url
            or has_match_for_page_with_gallery_by_imgs
        )

        page_analysis_results = PageAnalysisResults()
        page_analysis_results[
            'number_of_images_by_a_href'] = number_of_imgs_matched_by_a_href
        page_analysis_results[
            'number_of_images_by_img_src'] = number_of_images_by_img_src
        page_analysis_results['has_match'] = has_match
        page_analysis_results[
            'url_matched_for_gallery_page'] = gallery_page_matched_by_url
        page_analysis_results[
            'url_matched_for_page_with_gallery'] = page_with_gallery_matched_by_url

        return page_analysis_results