コード例 #1
0
    def parse(self, response):
        """Scrape Kyero apartments-for-sale search results for Majorca.

        A first Chrome session loads the search page only to read the total
        result count, from which the number of result pages is derived. Every
        result page is then opened in its own Chrome session (fresh random
        user agent and window size) and one item is yielded per advert whose
        fields could all be extracted.

        Args:
            response: Response handed in by the caller. It is not used; all
                pages are fetched through Selenium.

        Yields:
            The loaded ``IslandScraperItem`` for each successfully parsed
            advert.

        Raises:
            IndexError: If the result-count element is missing from the
                first page.
            ValueError: If the result count is not a number.
        """
        import logging

        log = logging.getLogger(__name__)
        driver_path = str(Path(Path.cwd(), "chromedriver.exe"))

        # --- Session 1: read the total number of results -----------------
        agent = user_agent_rotator.get_random_user_agent()
        options.add_argument(f"user-agent={agent}")
        self.driver = webdriver.Chrome(driver_path, chrome_options=options)
        # self.driver = webdriver.Firefox(executable_path=str(Path(Path.cwd(), "geckodriver.exe")))
        try:
            self.driver.set_window_size(randrange(1100, 1200),
                                        randrange(800, 900))
            self.driver.get(
                "https://www.kyero.com/en/majorca-apartments-for-sale-0l55563g1?max_price=150000&min_beds=2&min_property_size=40&sort=popularity_desc/"
            )
            sleep(2)
            # Scroll around a little before reading the page.
            body = self.driver.find_element_by_css_selector('body')
            body.send_keys(Keys.PAGE_DOWN)
            sleep(1)
            body.send_keys(Keys.PAGE_UP)
            sleep(1)
            body.send_keys(Keys.PAGE_DOWN)
            body.send_keys(Keys.HOME)

            sel = Selector(text=self.driver.page_source)
            sleep(1)
        finally:
            # Always release the browser, even if loading the page failed.
            self.driver.quit()

        count_text = sel.xpath(
            './/span[@class="search-results__count"]/text()').extract()[0]
        total_results = int(count_text.split(" ")[0].replace(",", ""))
        # 20 results per page is assumed by the divisor below.
        # NOTE(review): range() below starts at 0, so the first request uses
        # page=0 -- confirm how the site numbers its pages.
        pages_count = total_results // 20 + 1

        # --- One fresh session per result page ---------------------------
        for page in range(pages_count):
            agent = user_agent_rotator.get_random_user_agent()
            options.add_argument(f"user-agent={agent}")
            self.driver = webdriver.Chrome(driver_path,
                                           chrome_options=options)
            try:
                self.driver.set_window_size(randrange(1100, 1200),
                                            randrange(800, 900))
                self.driver.get(
                    f"https://www.kyero.com/en/majorca-apartments-for-sale-0l55563g1?max_price=150000&min_beds=2&min_property_size=40&page={page}&sort=popularity_desc"
                )
                sleep(1)
                body = self.driver.find_element_by_css_selector('body')
                sleep(1)
                body.send_keys(Keys.END)
                sleep(1)
                body.send_keys(Keys.HOME)

                # Best-effort hover over a random picture; a page with too
                # few pictures or a failed move must not stop the scrape.
                try:
                    picture = self.driver.find_elements_by_css_selector(
                        'figure')[randrange(1, 5)]
                    ActionChains(self.driver).move_to_element(
                        picture).perform()
                except Exception:
                    pass

                sel = Selector(text=self.driver.page_source)
                adverts = sel.xpath('//article[contains(@class, "bg-white")]')

                for advert in adverts:
                    try:
                        title = advert.xpath(
                            './/a[contains(@class, "inline-block hover-underline")]/text()'
                        ).extract_first()
                        link_string = advert.xpath(
                            './/a[contains(@class, "inline-block hover-underline")]/@href'
                        ).extract_first()
                        link = "https://www.kyero.com" + link_string
                        locality = title.split(" in ")[1]
                        # Drop the leading character of the price text
                        # and the thousands separators.
                        price_string = advert.xpath(
                            './/span[contains(@class, "p-5")]/text()'
                        ).extract_first()[1:]
                        price = price_string.replace(",", "")
                        beds = advert.xpath(
                            './/ul[contains(@class, "flex")]/li/span/text()'
                        ).extract_first()
                        size_string = advert.xpath(
                            './/ul[@class="flex"]/li/span/text()'
                        )[-1].extract()
                        size = size_string.split(" ")[0]
                    except (AttributeError, IndexError, TypeError):
                        # A missing node yields None or an empty selector
                        # list. Skip the advert instead of emitting an item
                        # built from a previous advert's values.
                        log.debug("Skipping advert that could not be parsed",
                                  exc_info=True)
                        continue

                    date = datetime.today().strftime('%Y-%m-%d')

                    loader = ItemLoader(item=IslandScraperItem(),
                                        selector=advert)
                    loader.add_value('title', title)
                    loader.add_value('island', "Mallorca")
                    loader.add_value('locality', locality)
                    loader.add_value('price', price)
                    loader.add_value('beds', beds)
                    loader.add_value('size', size)
                    loader.add_value('link', link)
                    loader.add_value('date', date)
                    loader.add_value('ad_type', "sale")
                    yield loader.load_item()

                sleep(5)
            finally:
                # Runs on normal completion, on errors, and when the
                # generator is closed early, so no browser is left behind.
                self.driver.quit()
コード例 #2
0
    def parse(self, response):
        """Scrape Idealista rental listings for Gran Canaria.

        A first Chrome session loads the search page only to read the total
        result count, from which the number of result pages is derived. Every
        result page is then opened in its own Chrome session (random window
        size) and one item is yielded per advert whose fields could all be
        extracted.

        Args:
            response: Response handed in by the caller. It is not used; all
                pages are fetched through Selenium.

        Yields:
            The loaded ``IslandScraperItem`` for each successfully parsed
            advert.

        Raises:
            IndexError: If the expected breadcrumb text node is missing from
                the first page.
            ValueError: If the result count is not a number.
        """
        import logging

        log = logging.getLogger(__name__)
        driver_path = str(Path(Path.cwd(), "chromedriver.exe"))

        # --- Session 1: read the total number of results -----------------
        agent = user_agent_rotator.get_random_user_agent()
        options.add_argument(f"user-agent={agent}")
        self.driver = webdriver.Chrome(driver_path, chrome_options=options)
        try:
            self.driver.set_window_size(randrange(1100, 1200),
                                        randrange(800, 900))
            self.driver.get(
                "https://www.idealista.com/en/alquiler-viviendas/las-palmas/gran-canaria/con-metros-cuadrados-mas-de_40,metros-cuadrados-menos-de_100,pisos,de-dos-dormitorios,de-tres-dormitorios,de-cuatro-cinco-habitaciones-o-mas,ultimas-plantas,plantas-intermedias/"
            )
            sleep(2)
            # Scroll around a little before reading the page.
            body = self.driver.find_element_by_css_selector('body')
            body.send_keys(Keys.PAGE_DOWN)
            sleep(1)
            body.send_keys(Keys.PAGE_UP)
            sleep(1)
            body.send_keys(Keys.PAGE_DOWN)
            body.send_keys(Keys.HOME)

            sel = Selector(text=self.driver.page_source)
        finally:
            # Always release the browser, even if loading the page failed.
            self.driver.quit()

        # The count is taken from the second text node of the breadcrumb.
        count_text = sel.xpath(
            './/span[@class="breadcrumb-info"]/text()').extract()[1]
        total_results = int(count_text.replace(",", "").split(" ")[0])
        # 30 results per page is assumed by the divisor below.
        # NOTE(review): range() below starts at 0, so the first request asks
        # for pagina-0.htm -- confirm how the site numbers its pages.
        pages_count = total_results // 30 + 1

        # --- One fresh session per result page ---------------------------
        for page in range(pages_count):
            self.driver = webdriver.Chrome(driver_path,
                                           chrome_options=options)
            # self.driver = webdriver.Firefox(executable_path=str(Path(Path.cwd(), "geckodriver.exe")))
            try:
                self.driver.set_window_size(randrange(1100, 1200),
                                            randrange(800, 900))
                self.driver.get(
                    f"https://www.idealista.com/en/alquiler-viviendas/las-palmas/gran-canaria/con-metros-cuadrados-mas-de_40,metros-cuadrados-menos-de_100,pisos,de-dos-dormitorios,de-tres-dormitorios,de-cuatro-cinco-habitaciones-o-mas,ultimas-plantas,plantas-intermedias/pagina-{page}.htm"
                )
                sleep(1)
                body = self.driver.find_element_by_css_selector('body')
                sleep(1)
                body.send_keys(Keys.END)
                sleep(1)
                body.send_keys(Keys.HOME)

                # Best-effort hover over a random picture; a page with too
                # few pictures or a failed move must not stop the scrape.
                try:
                    picture = self.driver.find_elements_by_css_selector(
                        'picture')[randrange(1, 5)]
                    ActionChains(self.driver).move_to_element(
                        picture).perform()
                except Exception:
                    pass

                sel = Selector(text=self.driver.page_source)
                adverts = sel.xpath('//article[contains(@class, "item")]')

                for advert in adverts:
                    try:
                        title = advert.xpath(
                            './/a[contains(@class, "item-link")]/@title'
                        ).extract_first()
                        link_string = advert.xpath(
                            './/a[contains(@class, "item-link")]/@href'
                        ).extract_first()
                        link = "https://www.idealista.com" + link_string
                        # The locality is the last comma-separated part of
                        # whatever follows " in " in the title.
                        address = title.split(" in ")[1]
                        locality = address.split(", ")[-1]
                        price_string = advert.xpath(
                            './/span[contains(@class, "item-price")]/text()'
                        ).extract_first()
                        price = price_string.replace(",", "")
                        # First "item-detail" text is used as beds, the
                        # second as size.
                        detail_texts = advert.xpath(
                            './/span[contains(@class, "item-detail")]/text()'
                        )
                        beds = detail_texts[0].extract().strip()
                        size = detail_texts[1].extract().strip()
                    except (AttributeError, IndexError, TypeError):
                        # A missing node yields None or an empty selector
                        # list. Skip the advert instead of emitting an item
                        # built from a previous advert's values.
                        log.debug("Skipping advert that could not be parsed",
                                  exc_info=True)
                        continue

                    date = datetime.today().strftime('%Y-%m-%d')

                    loader = ItemLoader(item=IslandScraperItem(),
                                        selector=advert)
                    loader.add_value('title', title)
                    loader.add_value('island', "Gran Canaria")
                    loader.add_value('locality', locality)
                    loader.add_value('price', price)
                    loader.add_value('beds', beds)
                    loader.add_value('size', size)
                    loader.add_value('link', link)
                    loader.add_value('date', date)
                    loader.add_value('ad_type', "rent")
                    yield loader.load_item()

                sleep(1)
            finally:
                # Runs on normal completion, on errors, and when the
                # generator is closed early, so no browser is left behind.
                self.driver.quit()