def parse_car_page(self, response):
    """Scrape a single car ad page and yield the populated CarItem.

    The item is pre-filled with the page URL, the code derived from that
    URL and the crawl timestamp; the remaining fields are collected
    through a CarItemLoader using XPath queries against ``response``.
    """
    car = CarItem()
    car["url"] = response.url
    car["finnCode"] = SpiderHelper.getCodeFromRawUrl(response.url)
    crawled_at = datetime.datetime.fromtimestamp(time.time())
    car["crawlTime"] = crawled_at.strftime("%Y-%m-%d %H:%M:%S")

    loader = CarItemLoader(item=car, response=response)
    loader.add_xpath(
        "model",
        '//h1/text()',
        MapCompose(SpiderHelper.normalizeOneWordValue),
    )
    loader.add_xpath("title", '//h1/following-sibling::p[1]/text()')
    loader.add_xpath(
        "price",
        '//div[@data-automation-id = "key" and text() = "Totalpris"]/following-sibling::div[@data-automation-id = "value"]/text()',
        MapCompose(SpiderHelper.normalizeNumber),
    )

    # Page label -> item field. Labels are matched with XPath contains(),
    # so a fragment of the label text is enough to select the entry.
    number_labels = {
        "Omregistrering": "purcharseFee",
        "rsmodell": "modelYear",
        "Antall eiere": "previousOwners",
        "Km.stand": "mileage",
    }
    text_labels = {
        "1. gang registrert": "firstTimeRegister",
        "rsavgift": "annualfeeincluded",
        "Girkasse": "gearType",
        "Drivstoff": "fuelType",
        "Salgsform": "saleForm",
        "Reg.nr.": "RegNumber",
    }

    # Selects the text of the <dd> that directly follows the <dt> whose
    # text contains the given label, inside the <dl> siblings of the <h1>.
    dd_for_label = "//h1/following-sibling::dl/dt[@data-automation-id='key' and contains(text(), '{0}')]/following-sibling::dd[1]/text()"

    # Numeric fields go through normalizeNumber; text fields are added
    # without any extra input processor.
    field_groups = (
        (number_labels, (MapCompose(SpiderHelper.normalizeNumber),)),
        (text_labels, ()),
    )
    for labels, processors in field_groups:
        for label, field_name in labels.items():
            loader.add_xpath(field_name, dd_for_label.format(label), *processors)

    eu_check_urls = response.xpath('//a[text() = "Sjekk tid for neste EU-kontroll"]/@href').extract()
    # TODO: the EU-control page is behind a captcha check we cannot pass,
    # so the extracted link is discarded for now and never followed.
    eu_check_urls = None
    # End of TODO

    if eu_check_urls:
        # Fetch the extra EU-control info; the loader travels in the request
        # meta so the follow-up parser can finish and return the item.
        yield scrapy.Request(eu_check_urls[0], self.parse_EUControl_Info, meta={'loader': loader})
    else:
        # Nothing more to fetch: emit what has been collected.
        yield loader.load_item()
def parse_realEstate_page(self, response):
    """Scrape a single real-estate ad page and yield the RealEstateItem.

    The item is pre-filled with the page URL, the code derived from that
    URL, a flag telling whether the URL contains "newbuildings", and the
    crawl timestamp; the remaining fields are collected through a
    RealEstateItemLoader using XPath queries against ``response``.
    """
    page_url = response.url

    estate = RealEstateItem()
    estate["url"] = page_url
    estate["finnCode"] = SpiderHelper.getCodeFromRawUrl(page_url)
    estate["isNewBuilding"] = "newbuildings" in page_url
    crawled_at = datetime.datetime.fromtimestamp(time.time())
    estate["crawlTime"] = crawled_at.strftime("%Y-%m-%d %H:%M:%S")

    loader = RealEstateItemLoader(item=estate, response=response)
    loader.add_xpath("title", '//h1/text()')
    loader.add_xpath("address", '//h1/following-sibling::p[1]/text()')

    # The asking price is collected from two places on the page; both
    # queries feed the same field, in this order.
    asking_price_queries = (
        '//div[@data-automation-id = "key" and text() = "Pris"]/following-sibling::div[@data-automation-id = "value"]/text()',
        '//h1/following-sibling::dl[1]/dd/text()',
    )
    for query in asking_price_queries:
        loader.add_xpath("askingPrice", query, MapCompose(SpiderHelper.normalizeNumber))

    # Page label -> item field. Labels are matched with XPath contains(),
    # so a fragment of the label text is enough to select the entry.
    number_labels = {
        "Verditakst": "verditakst",
        "netakst": "laanetakst",
        "Fellesformue": "fellesformue",
        "Felleskost": "felleskost",
        u"Prim": "primaerrom",
        "Bruksareal": "bruksareal",
        "Bruttoareal": "bruttoareal",
        "Tomteareal": "tomteareal",
        "Rom": "rom",
        "Bygge": "byggeaar",
        "Etasje": "etasje",
        "Soverom": "soverom",
    }
    one_word_labels = {
        "Boligtype": "boligtype",
        "Energimerking": "energimerking",
        "Eieform": "eieform",
    }

    # Selects the text of the <dd> that directly follows the <dt> whose
    # text contains the given label, inside the <dl> siblings of the <h1>.
    dd_for_label = "//h1/following-sibling::dl/dt[@data-automation-id='key' and contains(text(), '{0}')]/following-sibling::dd[1]/text()"

    # Each group of labels is paired with the input processor applied to
    # the values extracted for its fields.
    field_groups = (
        (number_labels, MapCompose(SpiderHelper.normalizeNumber)),
        (one_word_labels, MapCompose(SpiderHelper.normalizeOneWordValue)),
    )
    for labels, processor in field_groups:
        for label, field_name in labels.items():
            loader.add_xpath(field_name, dd_for_label.format(label), processor)

    yield loader.load_item()