def parseCar(self, response):
    """Scrape one car advert from ``response`` and append it to the CSV file.

    Each field is read from ``response`` with an XPath query; year, mileage,
    inspection ("TA") and price are additionally passed through the matching
    ``Sanitizer`` helper. A row is appended to ``self.fileName`` only when
    ``Sanitizer.isCarValid(make, price, ta)`` is truthy.

    Args:
        response: Object exposing ``xpath(...)`` and ``request.url``
            (presumably a Scrapy response -- confirm against the caller).
    """
    DESC_XPATH = './/div[contains(@id, \"msg_div_msg\")]/text()'
    MAKE_XPATH = './/td[contains(@id, \"tdo_31\")]/b/text()|.//td[contains(@id, \"tdo_24\")]/b/text()'
    YEAR_XPATH = './/td[contains(@id, \"tdo_18\")]/text()'
    ENGINE_XPATH = './/td[contains(@id, \"tdo_15\")]/text()'
    GEARBOX_XPATH = './/td[contains(@id, \"tdo_35\")]/text()'
    MILEAGE_XPATH = './/td[contains(@id, \"tdo_16\")]/text()'
    TA_XPATH = './/td[contains(@id, \"tdo_223\")]/text()'
    PRICE_XPATH = './/td[contains(@id, \"tdo_8\")]/text()|.//span[contains(@id, \"tdo_8\")]/text()'
    BODY_XPATH = './/td[contains(@id, \"tdo_32\")]/text()'
    LOCATION_XPATH = './/td[@class=\"ads_contacts\"]/text()'

    # Description: every extracted text node has its line breaks replaced by
    # spaces (CRLF first, then lone LF, then lone CR) before the nodes are
    # concatenated into a single string.
    desc = ''.join(
        str(line).replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
        for line in response.xpath(DESC_XPATH).extract())

    # Make
    make = response.xpath(MAKE_XPATH).extract_first()

    # Year
    year = Sanitizer.sanitizeDate(
        response.xpath(YEAR_XPATH).extract_first())

    # Engine & engine type: first whitespace-separated token is the engine,
    # the second (when present) is the engine type.
    engine = None
    engineType = None
    engine_str = response.xpath(ENGINE_XPATH).extract_first()
    if engine_str is not None:
        engine_parts = engine_str.split()
        # An empty or whitespace-only cell splits to [], so guard the
        # indexing instead of raising IndexError.
        if engine_parts:
            engine = engine_parts[0]
        if len(engine_parts) > 1:
            engineType = engine_parts[1]

    # Gearbox
    gearbox = response.xpath(GEARBOX_XPATH).extract_first()

    # Mileage
    mileage = Sanitizer.sanitizeMileage(
        response.xpath(MILEAGE_XPATH).extract_first())

    # TA
    ta = Sanitizer.sanitizeInspection(
        response.xpath(TA_XPATH).extract_first())

    # Price
    price = Sanitizer.sanitizePrice(
        response.xpath(PRICE_XPATH).extract_first())

    # Body
    body = response.xpath(BODY_XPATH).extract_first()

    # Location: first contact text node that is not a lone space. The None
    # default keeps an advert without such a node from raising StopIteration.
    contacts = response.xpath(LOCATION_XPATH).extract()
    location = next(
        (contact for contact in contacts
         if contact is not None and contact != ' '),
        None)

    if Sanitizer.isCarValid(make, price, ta):
        # The context manager closes the file even if writerow() raises.
        with open(self.fileName, 'a', newline='', encoding='utf8') as csv_file:
            csv.writer(csv_file).writerow([
                make, desc, year, engine, engineType, gearbox, mileage,
                body, ta, price, location, response.request.url
            ])
def testDefaultValueMiles(self):
    """Mileage written with a space between digit groups yields 111222."""
    raw_mileage = '111 222'
    sanitized = Sanitizer.sanitizeMileage(raw_mileage)
    self.assertEqual(sanitized, 111222)
def testAlreadyGoodValueMiles(self):
    """A mileage string made only of digits yields the integer 111222."""
    raw_mileage = '111222'
    sanitized = Sanitizer.sanitizeMileage(raw_mileage)
    self.assertEqual(sanitized, 111222)
def testReverseCaseInspection(self):
    """A 'year.month' inspection string maps to the 1st of that month."""
    actual = Sanitizer.sanitizeInspection('2019.10')
    # Expected text is the first day of October 2019 rendered with the
    # module's time_format.
    first_of_month = datetime.datetime(2019, 10, 1)
    expected = first_of_month.strftime(time_format)
    self.assertEqual(actual, expected)
def testFaultyCaseInspection(self):
    """The literal text 'None' is not an inspection date and yields None."""
    faulty_input = 'None'
    sanitized = Sanitizer.sanitizeInspection(faulty_input)
    self.assertEqual(sanitized, None)
def testAlreadyGoodValueDate(self):
    """A bare four-digit year string comes back unchanged."""
    year_text = '2008'
    sanitized = Sanitizer.sanitizeDate(year_text)
    self.assertEqual(sanitized, '2008')
def testDefaultCaseInspection(self):
    """A 'month.year' inspection string maps to the 1st of that month."""
    actual = Sanitizer.sanitizeInspection('10.2019')
    # Expected text is the first day of October 2019 rendered with the
    # module's time_format.
    first_of_month = datetime.datetime(2019, 10, 1)
    expected = first_of_month.strftime(time_format)
    self.assertEqual(actual, expected)
def testEmptyDate(self):
    """A missing date (None) is passed through as None."""
    missing = None
    sanitized = Sanitizer.sanitizeDate(missing)
    self.assertEqual(sanitized, None)
def testOtherValueDate(self):
    """Text with no year in it ('abc') yields None."""
    not_a_date = 'abc'
    sanitized = Sanitizer.sanitizeDate(not_a_date)
    self.assertEqual(sanitized, None)
def testDefaultCaseDate(self):
    """A year followed by a month name is expected to yield just the year."""
    # assertEqual rather than assertTrue: assertTrue treats its second
    # argument as the failure message, so the expected value '2008' was
    # never compared and any truthy result passed.
    self.assertEqual(Sanitizer.sanitizeDate('2008 aprīlis'), '2008')
def testHugeValuePrice(self):
    """A price with a space between digit groups and a '$' yields '25000'."""
    raw_price = '25 000 $'
    sanitized = Sanitizer.sanitizePrice(raw_price)
    self.assertEqual(sanitized, '25000')
def testAlreadyGoodValuePrice(self):
    """A price string made only of digits comes back unchanged."""
    raw_price = '500'
    sanitized = Sanitizer.sanitizePrice(raw_price)
    self.assertEqual(sanitized, '500')
def testEmptyPrice(self):
    """A missing price (None) is passed through as None."""
    missing = None
    sanitized = Sanitizer.sanitizePrice(missing)
    self.assertEqual(sanitized, None)
def testDefaultCasePrice(self):
    """A price followed by a '$' sign yields only the digits."""
    raw_price = '5000 $'
    sanitized = Sanitizer.sanitizePrice(raw_price)
    self.assertEqual(sanitized, '5000')