def parse(self, response):
    """Scrape Kyero sale listings for Mallorca apartments through Selenium.

    A first Chrome session opens the search results and reads the total
    number of results. Every results page is then loaded in its own Chrome
    session (with a freshly chosen user agent and a randomised window
    size), and one item is yielded per advert that parses cleanly.

    Args:
        response: The response handed to this callback. It is not read;
            every page is fetched through the Selenium driver instead.

    Yields:
        The result of ``ItemLoader.load_item()`` for each advert, filled
        with title, island, locality, price, beds, size, link, date and
        ad_type.
    """
    import logging

    log = logging.getLogger(__name__)
    results_per_page = 20

    # --- First visit: read the total result count ------------------------
    agent = user_agent_rotator.get_random_user_agent()
    options.add_argument(f"user-agent={agent}")
    self.driver = webdriver.Chrome(
        str(Path(Path.cwd(), "chromedriver.exe")), chrome_options=options)
    try:
        self.driver.set_window_size(randrange(1100, 1200),
                                    randrange(800, 900))
        self.driver.get(
            "https://www.kyero.com/en/majorca-apartments-for-sale-0l55563g1?max_price=150000&min_beds=2&min_property_size=40&sort=popularity_desc/"
        )
        sleep(2)
        # Scroll around a little before reading the page source.
        body = self.driver.find_element_by_css_selector('body')
        body.send_keys(Keys.PAGE_DOWN)
        sleep(1)
        body.send_keys(Keys.PAGE_UP)
        sleep(1)
        body.send_keys(Keys.PAGE_DOWN)
        body.send_keys(Keys.HOME)

        sel = Selector(text=self.driver.page_source)
        count_text = sel.xpath(
            './/span[@class="search-results__count"]/text()').extract()[0]
        # The first space-separated token is the count, e.g. "1,234".
        total_results = int(count_text.split(" ")[0].replace(",", ""))
        sleep(1)
    finally:
        # Always release the browser, even if the count could not be read.
        self.driver.quit()

    # Ceiling division: a partially filled last page still counts as a page.
    pages_count = (total_results + results_per_page - 1) // results_per_page

    # --- One browser session per results page -----------------------------
    # Result pages are numbered from 1, so iterate 1..pages_count inclusive
    # (starting at 0 skipped the final page).
    for page in range(1, pages_count + 1):
        agent = user_agent_rotator.get_random_user_agent()
        options.add_argument(f"user-agent={agent}")
        self.driver = webdriver.Chrome(
            str(Path(Path.cwd(), "chromedriver.exe")), chrome_options=options)
        try:
            self.driver.set_window_size(randrange(1100, 1200),
                                        randrange(800, 900))
            self.driver.get(
                f"https://www.kyero.com/en/majorca-apartments-for-sale-0l55563g1?max_price=150000&min_beds=2&min_property_size=40&page={page}&sort=popularity_desc"
            )
            sleep(1)
            body = self.driver.find_element_by_css_selector('body')
            sleep(1)
            body.send_keys(Keys.END)
            sleep(1)
            body.send_keys(Keys.HOME)

            # Best-effort hover over a random picture; a failure here must
            # not abort the page, so any error is deliberately ignored.
            try:
                picture = self.driver.find_elements_by_css_selector(
                    'figure')[randrange(1, 5)]
                ActionChains(self.driver).move_to_element(picture).perform()
            except Exception:
                pass

            sel = Selector(text=self.driver.page_source)
            adverts = sel.xpath('//article[contains(@class, "bg-white")]')
            for advert in adverts:
                try:
                    title = advert.xpath(
                        './/a[contains(@class, "inline-block hover-underline")]/text()'
                    ).extract_first()
                    link_string = advert.xpath(
                        './/a[contains(@class, "inline-block hover-underline")]/@href'
                    ).extract_first()
                    link = "https://www.kyero.com" + link_string
                    locality = title.split(" in ")[1]
                    # Drop the leading character of the price text, then
                    # the thousands separators.
                    price_string = advert.xpath(
                        './/span[contains(@class, "p-5")]/text()'
                    ).extract_first()[1:]
                    price = price_string.replace(",", "")
                    beds = advert.xpath(
                        './/ul[contains(@class, "flex")]/li/span/text()'
                    ).extract_first()
                    size_string = advert.xpath(
                        './/ul[@class="flex"]/li/span/text()')[-1].extract()
                    size = size_string.split(" ")[0]
                except (AttributeError, IndexError, TypeError) as exc:
                    # A missing field yields None or an empty selection;
                    # skip the advert instead of emitting values left over
                    # from the previous one.
                    log.warning("Skipping advert that could not be parsed: %s",
                                exc)
                    continue

                loader = ItemLoader(item=IslandScraperItem(), selector=advert)
                loader.add_value('title', title)
                loader.add_value('island', "Mallorca")
                loader.add_value('locality', locality)
                loader.add_value('price', price)
                loader.add_value('beds', beds)
                loader.add_value('size', size)
                loader.add_value('link', link)
                loader.add_value('date', datetime.today().strftime('%Y-%m-%d'))
                loader.add_value('ad_type', "sale")
                yield loader.load_item()

            sleep(5)
        finally:
            # Runs on errors and on early generator close as well.
            self.driver.quit()
def parse(self, response):
    """Scrape Idealista rental listings for Gran Canaria through Selenium.

    A first Chrome session opens the search results and reads the total
    number of results. Every results page is then loaded in its own Chrome
    session (with a randomised window size), and one item is yielded per
    advert that parses cleanly.

    Args:
        response: The response handed to this callback. It is not read;
            every page is fetched through the Selenium driver instead.

    Yields:
        The result of ``ItemLoader.load_item()`` for each advert, filled
        with title, island, locality, price, beds, size, link, date and
        ad_type.
    """
    import logging

    log = logging.getLogger(__name__)
    results_per_page = 30

    # --- First visit: read the total result count ------------------------
    agent = user_agent_rotator.get_random_user_agent()
    options.add_argument(f"user-agent={agent}")
    self.driver = webdriver.Chrome(
        str(Path(Path.cwd(), "chromedriver.exe")), chrome_options=options)
    try:
        self.driver.set_window_size(randrange(1100, 1200),
                                    randrange(800, 900))
        self.driver.get(
            "https://www.idealista.com/en/alquiler-viviendas/las-palmas/gran-canaria/con-metros-cuadrados-mas-de_40,metros-cuadrados-menos-de_100,pisos,de-dos-dormitorios,de-tres-dormitorios,de-cuatro-cinco-habitaciones-o-mas,ultimas-plantas,plantas-intermedias/"
        )
        sleep(2)
        # Scroll around a little before reading the page source.
        body = self.driver.find_element_by_css_selector('body')
        body.send_keys(Keys.PAGE_DOWN)
        sleep(1)
        body.send_keys(Keys.PAGE_UP)
        sleep(1)
        body.send_keys(Keys.PAGE_DOWN)
        body.send_keys(Keys.HOME)

        sel = Selector(text=self.driver.page_source)
        # The second text node of the breadcrumb holds the count.
        count_text = sel.xpath(
            './/span[@class="breadcrumb-info"]/text()').extract()[1]
        total_results = int(count_text.replace(",", "").split(" ")[0])
    finally:
        # Always release the browser, even if the count could not be read.
        self.driver.quit()

    # Ceiling division: a partially filled last page still counts as a page.
    pages_count = (total_results + results_per_page - 1) // results_per_page

    # --- One browser session per results page -----------------------------
    # Result pages are numbered from 1, so iterate 1..pages_count inclusive
    # (starting at 0 skipped the final page).
    for page in range(1, pages_count + 1):
        self.driver = webdriver.Chrome(
            str(Path(Path.cwd(), "chromedriver.exe")), chrome_options=options)
        try:
            self.driver.set_window_size(randrange(1100, 1200),
                                        randrange(800, 900))
            self.driver.get(
                f"https://www.idealista.com/en/alquiler-viviendas/las-palmas/gran-canaria/con-metros-cuadrados-mas-de_40,metros-cuadrados-menos-de_100,pisos,de-dos-dormitorios,de-tres-dormitorios,de-cuatro-cinco-habitaciones-o-mas,ultimas-plantas,plantas-intermedias/pagina-{page}.htm"
            )
            sleep(1)
            body = self.driver.find_element_by_css_selector('body')
            sleep(1)
            body.send_keys(Keys.END)
            sleep(1)
            body.send_keys(Keys.HOME)

            # Best-effort hover over a random picture; a failure here must
            # not abort the page, so any error is deliberately ignored.
            try:
                picture = self.driver.find_elements_by_css_selector(
                    'picture')[randrange(1, 5)]
                ActionChains(self.driver).move_to_element(picture).perform()
            except Exception:
                pass

            sel = Selector(text=self.driver.page_source)
            adverts = sel.xpath('//article[contains(@class, "item")]')
            for advert in adverts:
                try:
                    title = advert.xpath(
                        './/a[contains(@class, "item-link")]/@title'
                    ).extract_first()
                    link_string = advert.xpath(
                        './/a[contains(@class, "item-link")]/@href'
                    ).extract_first()
                    link = "https://www.idealista.com" + link_string
                    # The text after " in " is a comma-separated address;
                    # its last component is used as the locality.
                    address = title.split(" in ")[1]
                    locality = address.split(", ")[-1]
                    price_string = advert.xpath(
                        './/span[contains(@class, "item-price")]/text()'
                    ).extract_first()
                    price = price_string.replace(",", "")
                    detail_texts = advert.xpath(
                        './/span[contains(@class, "item-detail")]/text()')
                    # First detail is used as beds, second as size.
                    beds = detail_texts.extract_first().strip()
                    size = detail_texts[1].extract().strip()
                except (AttributeError, IndexError, TypeError) as exc:
                    # A missing field yields None or an empty selection;
                    # skip the advert instead of emitting values left over
                    # from the previous one.
                    log.warning("Skipping advert that could not be parsed: %s",
                                exc)
                    continue

                loader = ItemLoader(item=IslandScraperItem(), selector=advert)
                loader.add_value('title', title)
                loader.add_value('island', "Gran Canaria")
                loader.add_value('locality', locality)
                loader.add_value('price', price)
                loader.add_value('beds', beds)
                loader.add_value('size', size)
                loader.add_value('link', link)
                loader.add_value('date', datetime.today().strftime('%Y-%m-%d'))
                loader.add_value('ad_type', "rent")
                yield loader.load_item()

            sleep(1)
        finally:
            # Runs on errors and on early generator close as well.
            self.driver.quit()