def parseContent(self, response):
    """Scrape a detail page and store the extracted fields on ``self``.

    Attributes are only assigned when the matching markup is present:
    ``phone``, ``website``, ``startTime``/``endTime`` (``HH:MM:SS``),
    ``tag`` (comma-separated names), ``description`` and ``districtId``.
    ``listTagId`` is reset to a fresh list on every call and ``typeId`` is
    always copied from ``response.meta``.

    Args:
        response: response object exposing ``body`` (the page HTML) and
            ``meta`` (must contain the ``"typeId"`` and ``"cityId"`` keys).

    Raises:
        KeyError: if ``response.meta`` lacks ``"typeId"`` or ``"cityId"``.
    """
    # NOTE(review): this file contains a second definition that is also
    # named parseContent; if both live in the same class the later one
    # replaces this one -- confirm which is meant to be active.
    hxs = Selector(text=response.body)

    # Phone / website are optional: leave the attribute untouched when the
    # node is missing instead of failing the whole page.
    phones = hxs.css('ul.textsdtdd li::text').extract()
    if phones:
        self.phone = phones[0]

    websites = hxs.css('p.topusc5_0::text').extract()
    if websites:
        self.website = websites[0]

    # The first bold cell is read as "<start> - <end>" in 12-hour format,
    # e.g. "08:00 AM - 10:00 PM"; anything unparsable is ignored.
    bold_cells = hxs.xpath(
        '//div[@class="rdct_0"]/table/tr/td/b/text()').extract()
    if bold_cells and bold_cells[0]:
        hours = bold_cells[0].split('-')
        try:
            opening = datetime.strptime(hours[0].strip(), '%I:%M %p')
            self.startTime = opening.strftime('%H:%M:%S')
            closing = datetime.strptime(hours[1].strip(), '%I:%M %p')
            self.endTime = closing.strftime('%H:%M:%S')
        except (ValueError, IndexError):
            pass

    self.listTagId = []
    tag = Tag()

    # Price range row, e.g. "2tr - 10tr": the <p> cell holds the label and
    # the <b> cell holds the value.
    # NOTE(review): 16339 / 16359 are passed straight to getIdTagFromName;
    # presumably they identify the parent tag group -- confirm.
    for row in hxs.xpath('//div[@class="rdct_0"]/table/tr'):
        labels = row.xpath('.//td/p/text()').extract()
        values = row.xpath('.//td/b/text()').extract()
        # A row without both a label and a value cannot be a price row
        # (previously a missing <b> raised IndexError).
        if not labels or not values:
            continue
        # Substring test instead of ``find(...) > 0`` so that a label that
        # *starts* with the keyword is matched too.
        if u'giá' in labels[0]:
            self.tag = values[0]
            self.listTagId.append(tag.getIdTagFromName(self.tag, 16339))

    # Utilities section: look at each block's own heading rather than
    # indexing the block list with the position of the heading text, which
    # picked the wrong block (or raised IndexError) whenever the two lists
    # were not aligned.
    for block in hxs.xpath('//div[@class="rdct_0"]/table/tr/td/div'):
        headings = block.xpath('./p[@class="imgtiddtt"]/text()').extract()
        if u'Tiện ích' not in headings:
            continue
        names = block.xpath('.//p[@class="bleftdd_1"]/a/text()').extract()
        for name in names:
            name = name.strip()
            if name == u'Khác':
                # The catch-all "other" entry is not recorded as a tag.
                continue
            # self.tag may not have been set by a price row; start the list
            # cleanly instead of raising or producing a leading comma.
            current = getattr(self, 'tag', None)
            self.tag = current + ',' + name if current else name
            self.listTagId.append(tag.getIdTagFromName(name, 16359))

    paragraphs = hxs.xpath(
        '//div[@class="ndungleftdct"]/div[@class="ndleft_0"]/p/text()'
    ).extract()
    if paragraphs:
        self.description = paragraphs[0]

    meta = response.meta
    self.typeId = meta["typeId"]
    cityId = meta["cityId"]

    spans = hxs.xpath(
        '//div[@class="rdct_0"]/p[@class="rdctfollow_0"]/span[@class="rdctfollow_5"]/text()'
    ).extract()
    # The district is only resolved when exactly three spans are present.
    if len(spans) == 3:
        # The first 8 characters of the third span are dropped; presumably
        # a fixed label prefix -- confirm against the page markup.
        district = spans[2][8:].strip()
        city = City()
        self.districtId = city.getIdProvinceFromCity(cityId, district)
def parseContent(self, response):
    """Scrape a detail page and store the extracted fields on ``self``.

    Attributes are only assigned when the matching markup is present:
    ``phone``, ``website``, ``startTime``/``endTime`` (``HH:MM:SS``),
    ``tag`` (comma-separated names), ``description`` and ``districtId``.
    ``typeId`` is always copied from ``response.meta``.

    Args:
        response: response object exposing ``body`` (the page HTML) and
            ``meta`` (must contain the ``"typeId"`` and ``"cityId"`` keys).

    Raises:
        KeyError: if ``response.meta`` lacks ``"typeId"`` or ``"cityId"``.
    """
    # NOTE(review): this file contains another definition that is also
    # named parseContent; if both live in the same class only the later
    # one is effective -- confirm which is meant to be active.
    hxs = Selector(text=response.body)

    # Phone / website are optional: leave the attribute untouched when the
    # node is missing instead of failing the whole page.
    phones = hxs.css('ul.textsdtdd li::text').extract()
    if phones:
        self.phone = phones[0]

    websites = hxs.css('p.topusc5_0::text').extract()
    if websites:
        self.website = websites[0]

    bold_cells = hxs.xpath(
        '//div[@class="rdct_0"]/table/tr/td/b/text()').extract()

    # The first bold cell is read as "<start> - <end>" in 12-hour format,
    # e.g. "08:00 AM - 10:00 PM"; anything unparsable is ignored.
    if bold_cells and bold_cells[0]:
        hours = bold_cells[0].split('-')
        try:
            opening = datetime.strptime(hours[0].strip(), '%I:%M %p')
            self.startTime = opening.strftime('%H:%M:%S')
            closing = datetime.strptime(hours[1].strip(), '%I:%M %p')
            self.endTime = closing.strftime('%H:%M:%S')
        except (ValueError, IndexError):
            pass

    # The third bold cell, when present, becomes the first tag.
    if len(bold_cells) >= 3:
        self.tag = bold_cells[2].strip()

    # Every linked entry is appended to the comma-separated tag string.
    extras = [
        text.strip()
        for text in hxs.xpath(
            '//div[@class="rdct_0"]/table/tr/td/div/p[@class="bleftdd_1"]/a/text()'
        ).extract()
    ]
    if extras:
        # self.tag may not have been set above (fewer than three bold
        # cells); start from whatever is there instead of raising, and
        # avoid a leading comma when there is no first tag.
        current = getattr(self, 'tag', None)
        parts = [current] if current else []
        parts.extend(extras)
        self.tag = ','.join(parts)

    paragraphs = hxs.xpath(
        '//div[@class="ndungleftdct"]/div[@class="ndleft_0"]/p/text()'
    ).extract()
    if paragraphs:
        self.description = paragraphs[0]

    meta = response.meta
    self.typeId = meta["typeId"]
    cityId = meta["cityId"]

    spans = hxs.xpath(
        '//div[@class="rdct_0"]/p[@class="rdctfollow_0"]/span[@class="rdctfollow_5"]/text()'
    ).extract()
    # The district is only resolved when exactly three spans are present.
    if len(spans) == 3:
        # The first 8 characters of the third span are dropped; presumably
        # a fixed label prefix -- confirm against the page markup.
        district = spans[2][8:].strip()
        city = City()
        self.districtId = city.getIdProvinceFromCity(cityId, district)
        # Debug trace kept from the original, written so that it parses on
        # both Python 2 and Python 3 and emits the same text.
        print("districtId= %s" % (self.districtId,))