def parse(self, response: Response, **kwargs):
    """Extract the site's ``window.INITIAL_STATE`` JSON blob, collect new
    word URLs from it, and schedule them for crawling.

    Scans every inline ``<script>`` on the page for the state assignment,
    decodes it with demjson (handling the renderer's unicode escapes),
    then follows any URLs not already queued.
    """
    for script in response.xpath('//script/text()').getall():
        # Only the script tag that carries the app state is interesting.
        if 'INITIAL_STATE' not in script:
            continue
        # Extract the JSON object assigned to window.INITIAL_STATE.
        m = re.match(r'window\.INITIAL_STATE\s+=\s+({[\s\S]+});', script)
        if m is None:
            # The tag mentions INITIAL_STATE but is not the assignment we
            # expect; skip it instead of crashing on m.group(1).
            continue
        # Decode it properly, handling annoying unicode escapes and
        # nonsense from the site renderer.
        custom_demjson = CustomJSON(
            json_options=demjson.json_options(compactly=False))
        decoded = custom_demjson.decode(m.group(1),
                                        encoding='unicode-escape')
        raw_data = decoded['searchData']
        word = Word.from_raw(data=raw_data)
        urls = word.get_urls()
        # Only schedule URLs we have not queued before.
        new = urls - self.queue
        self.queue.update(new)
        if new:
            print(f'Found {len(new)} more URLs.')
        return response.follow_all(new)
def parse(self, response: Response):
    """Follow every item link on the listing page, then paginate.

    Yields requests for each item's detail page (handled by
    ``parse_item``) and, when present, a request for the next listing
    page back into this method.
    """
    # Each item card (id starting with "item_") links to its detail page.
    yield from response.follow_all(
        xpath='//*[starts-with(@id, "item_")]/div[1]/a/@href',
        callback=self.parse_item,
    )
    # "Next page" link in the bottom navigation bar.
    link = response.xpath(
        '//*[@id="navigation-bar-bottom"]/div[2]/ul/'
        'li[contains(@class, "next-page")]/a/@href').get()
    # On the last page the XPath yields None and response.follow(None)
    # raises ValueError, so guard before following.
    if link is not None:
        yield response.follow(link, callback=self.parse)
def parse(self, response: Response, **kwargs):
    """Crawl residence detail pages, or just the single configured URL.

    When ``self.url_to_crawl`` is set, only that page is handed to
    ``parse_residences``; otherwise every unique detail link on the
    listing is followed and pagination continues via this method.
    """
    if self.url_to_crawl:
        # A specific URL was configured: crawl only that one and stop.
        yield response.follow(url=self.url_to_crawl,
                              callback=self.parse_residences)
        return
    # De-duplicate the detail links before following them.
    detail_links = set(response.xpath(
        "//a[contains(@class,'detalii-proprietate')][contains(.,'Vezi detalii')]/@href"
    ).getall())
    yield from response.follow_all(urls=list(detail_links),
                                   callback=self.parse_residences)
    # Keep paginating while a "next" button exists.
    pagination = response.xpath(
        "//a[@class='inainte butonpaginare']/@href").get()
    if pagination:
        yield response.follow(url=pagination, callback=self.parse)