Ejemplo n.º 1
0
    def parse_car_page(self, response):
        """Build a ``CarItem`` from a car page response and yield it.

        ``url``, ``finnCode`` and ``crawlTime`` are set directly on the item.
        All remaining fields are registered on a ``CarItemLoader`` as XPath
        queries against ``response``.

        The follow-up request for EU-control information is currently
        disabled (see the TODO below), so this generator always yields the
        loaded item.
        """
        car = CarItem()
        car["url"] = response.url
        car["finnCode"] = SpiderHelper.getCodeFromRawUrl(response.url)
        crawled_at = datetime.datetime.fromtimestamp(time.time())
        car["crawlTime"] = crawled_at.strftime("%Y-%m-%d %H:%M:%S")

        loader = CarItemLoader(item=car, response=response)

        loader.add_xpath(
            "model",
            '//h1/text()',
            MapCompose(SpiderHelper.normalizeOneWordValue),
        )
        loader.add_xpath("title", '//h1/following-sibling::p[1]/text()')
        loader.add_xpath(
            "price",
            '//div[@data-automation-id = "key" and text() = "Totalpris"]/following-sibling::div[@data-automation-id = "value"]/text()',
            MapCompose(SpiderHelper.normalizeNumber),
        )

        # Keys are fragments of the <dt> label text (matched with XPath
        # contains()); values are the item field names they populate.
        # NOTE(review): some keys omit the label's first character,
        # presumably to avoid a non-ASCII letter -- confirm against the page.
        number_labels = {
            "Omregistrering": "purcharseFee",
            "rsmodell": "modelYear",
            "Antall eiere": "previousOwners",
            "Km.stand": "mileage",
        }
        text_labels = {
            "1. gang registrert": "firstTimeRegister",
            "rsavgift": "annualfeeincluded",
            "Girkasse": "gearType",
            "Drivstoff": "fuelType",
            "Salgsform": "saleForm",
            "Reg.nr.": "RegNumber",
        }

        # Selects the text of the first <dd> that follows the matching <dt>.
        dd_for_label = "//h1/following-sibling::dl/dt[@data-automation-id='key' and contains(text(), '{0}')]/following-sibling::dd[1]/text()"

        # Number fields go through normalizeNumber; text fields get no extra
        # processor at all.
        field_groups = (
            (number_labels, (SpiderHelper.normalizeNumber,)),
            (text_labels, ()),
        )
        for labels, cleaners in field_groups:
            for label, field in labels.items():
                processors = [MapCompose(cleaner) for cleaner in cleaners]
                loader.add_xpath(field, dd_for_label.format(label), *processors)

        eu_check_links = response.xpath('//a[text() = "Sjekk tid for neste EU-kontroll"]/@href').extract()

        # TODO: cannot pass the captcha check, so the EU-check URL is ignored
        # for now by discarding whatever was extracted above.
        eu_check_links = None
        # End of TODO

        if eu_check_links:
            # Fetch the extra EU-control info; the loader travels in the
            # request meta so the follow-up parser can finish and return
            # the item.
            yield scrapy.Request(eu_check_links[0], self.parse_EUControl_Info, meta={'loader': loader})
            return

        # Nothing more to fetch: return what has been collected so far.
        yield loader.load_item()
Ejemplo n.º 2
0
    def parse_realEstate_page(self, response):
        """Build a ``RealEstateItem`` from a listing page response and yield it.

        ``url``, ``finnCode``, ``isNewBuilding`` (true when the URL contains
        ``"newbuildings"``) and ``crawlTime`` are set directly on the item.
        All remaining fields are registered on a ``RealEstateItemLoader`` as
        XPath queries against ``response``.
        """
        listing = RealEstateItem()
        listing["url"] = response.url
        listing["finnCode"] = SpiderHelper.getCodeFromRawUrl(response.url)
        listing["isNewBuilding"] = "newbuildings" in response.url
        crawled_at = datetime.datetime.fromtimestamp(time.time())
        listing["crawlTime"] = crawled_at.strftime("%Y-%m-%d %H:%M:%S")

        loader = RealEstateItemLoader(item=listing, response=response)

        loader.add_xpath("title", '//h1/text()')
        loader.add_xpath("address", '//h1/following-sibling::p[1]/text()')

        # The asking price is looked up in two places on the page; both
        # queries feed the same field.
        # NOTE(review): how multiple matches are combined depends on the
        # loader's processors, which are not visible here.
        asking_price_xpaths = (
            '//div[@data-automation-id = "key" and text() = "Pris"]/following-sibling::div[@data-automation-id = "value"]/text()',
            '//h1/following-sibling::dl[1]/dd/text()',
        )
        for price_xpath in asking_price_xpaths:
            loader.add_xpath("askingPrice", price_xpath, MapCompose(SpiderHelper.normalizeNumber))

        # Keys are fragments of the <dt> label text (matched with XPath
        # contains()); values are the item field names they populate.
        number_labels = {
            "Verditakst": "verditakst",
            "netakst": "laanetakst",
            "Fellesformue": "fellesformue",
            "Felleskost": "felleskost",
            u"Prim": "primaerrom",
            "Bruksareal": "bruksareal",
            "Bruttoareal": "bruttoareal",
            "Tomteareal": "tomteareal",
            "Rom": "rom",
            "Bygge": "byggeaar",
            "Etasje": "etasje",
            "Soverom": "soverom",
        }
        one_word_labels = {
            "Boligtype": "boligtype",
            "Energimerking": "energimerking",
            "Eieform": "eieform",
        }

        # Selects the text of the first <dd> that follows the matching <dt>.
        dd_for_label = "//h1/following-sibling::dl/dt[@data-automation-id='key' and contains(text(), '{0}')]/following-sibling::dd[1]/text()"

        # Each group of labels is paired with the cleaner applied to its values.
        field_groups = (
            (number_labels, SpiderHelper.normalizeNumber),
            (one_word_labels, SpiderHelper.normalizeOneWordValue),
        )
        for labels, cleaner in field_groups:
            for label, field in labels.items():
                loader.add_xpath(field, dd_for_label.format(label), MapCompose(cleaner))

        yield loader.load_item()