Example #1
    def parse(self, response: Response, **kwargs):
        """Find the page's embedded ``window.INITIAL_STATE`` JSON blob,
        decode it, and schedule any newly discovered word URLs.

        Returns the result of ``response.follow_all`` over the URLs that
        were not already in ``self.queue``; returns ``None`` when no
        script tag yields a usable state object.
        """
        for script in response.xpath('//script/text()').getall():
            # Only the script tag carrying the serialized app state matters.
            if 'INITIAL_STATE' not in script:
                continue

            # search (not match): the assignment need not sit at the very
            # start of the script text.
            m = re.search(r'window\.INITIAL_STATE\s+=\s+({[\s\S]+});',
                          script)
            if m is None:
                # Mentions INITIAL_STATE but not in the expected
                # assignment form — keep scanning instead of crashing
                # on m.group(1).
                continue

            # Decode with relaxed JSON semantics, handling the site
            # renderer's unicode escapes.
            custom_demjson = CustomJSON(json_options=demjson.json_options(
                compactly=False))
            decoded = custom_demjson.decode(m.group(1),
                                            encoding='unicode-escape')

            raw_data = decoded['searchData']
            word = Word.from_raw(data=raw_data)

            # Schedule only URLs we have not queued before.
            urls = word.get_urls()
            new = urls - self.queue
            self.queue.update(new)

            if new:
                print(f'Found {len(new)} more URLs.')
            return response.follow_all(new)
Example #2
    def parse(self, response: Response):
        """Follow every item detail link on the listing page, then the
        "next page" pagination link, if any.
        """
        yield from response.follow_all(
            xpath='//*[starts-with(@id, "item_")]/div[1]/a/@href',
            callback=self.parse_item,
        )

        link = response.xpath(
            '//*[@id="navigation-bar-bottom"]/div[2]/ul/'
            'li[contains(@class, "next-page")]/a/@href').get()

        # On the last page the xpath yields None, and
        # response.follow(None) raises ValueError — stop instead.
        if link is not None:
            yield response.follow(link, callback=self.parse)
    def parse(self, response: Response, **kwargs):
        """Entry point: crawl a single configured residence URL, or scan
        the listing page for residence detail links and paginate onward.
        """
        if self.url_to_crawl:
            # A specific residence URL was configured — crawl just that one.
            yield response.follow(url=self.url_to_crawl, callback=self.parse_residences)
            return

        # Collect detail-page links and drop duplicates before scheduling.
        hrefs = response.xpath(
            "//a[contains(@class,'detalii-proprietate')]"
            "[contains(.,'Vezi detalii')]/@href").getall()
        unique_hrefs = list(set(hrefs))

        yield from response.follow_all(urls=unique_hrefs, callback=self.parse_residences)

        # Keep paginating while a "next" button exists.
        next_page = response.xpath("//a[@class='inainte butonpaginare']/@href").get()
        if next_page:
            yield response.follow(url=next_page, callback=self.parse)