Beispiel #1
0
    def parse_page(self, response):
        """Crawl the properties listed on one result page.

        First yields one item per ``li[data-ssp]`` entry of the result list
        (the partner ads, "Partner-Anzeigen"). Those items carry only a fixed
        advertiser id and the values forwarded through ``response.meta``.
        Afterwards yields one request per non-partner listing link; each is
        handled by ``parse_property`` and receives ``response.meta`` as meta.
        """
        # Meta keys copied onto every partner item (city_category is for stats).
        forwarded_keys = ("commercial", "property_type", "city_category")

        # Partner ads ("Partner-Anzeigen").
        # TODO: the css/xpath that identifies these entries still needs to be
        # verified; no per-entry fields (e.g. the header) are extracted yet.
        for _entry in response.css("div #ResultListData > ul > li[data-ssp]"):
            partner_loader = PropertyLoader(item=PropertyItem(), response=response)
            partner_loader.add_value("advertiser_id", "Immobilienscout24")
            for key in forwarded_keys:
                partner_loader.add_value(key, response.meta.get(key))
            yield partner_loader.load_item()

        # Non-partner listings: follow the link of each entry to its detail page.
        detail_links = response.css("div #ResultListData > ul > li.hlisting  > div.n2 > a::attr(\"href\")")
        for href in detail_links:
            target = add_scheme_host(href.extract())
            yield scrapy.Request(target, self.parse_property, meta=response.meta)
Beispiel #2
0
    def parse_property(self, response):
        """Build and return a property item from a single response.

        Fields are filled from CSS selectors on the response, from today's
        date and the response URL, and from the ``commercial``,
        ``property_type`` and ``city_category`` values in ``response.meta``.
        """
        # (item field, css query) pairs, applied in this order.
        # NOTE(review): the "obid" query has no ``::text`` suffix, so it selects
        # the element itself rather than its text - confirm this is intended.
        css_fields = (
            ("header", "div.headline > h2::text"),
            ("description", "div.text::text"),
            ("price", "div.price strong span::text"),
            ("postal_code", "div.location span.address span.postal-code::text"),
            ("city", "div.location span.address span.locality::text"),
            ("obid", "div.date-and-clicks > strong:nth-child(1)"),
            ("ad_created", "div.date-and-clicks::text"),
            ("phone", "ul.contacts > li > span:nth-child(2)::text"),
        )
        # Values handed over by the request's meta (city_category is for stats).
        meta_fields = ("commercial", "property_type", "city_category")

        detail_loader = PropertyLoader(item=PropertyItem(), response=response)
        for field, query in css_fields:
            detail_loader.add_css(field, query)

        detail_loader.add_value("created", date.today())
        detail_loader.add_value("url", response.url)
        for field in meta_fields:
            detail_loader.add_value(field, response.meta.get(field))

        return detail_loader.load_item()