def getKeyword(self, response):
    """Collect tag names from a detail page and store them for one object.

    Reads ``id`` and ``tag`` from ``response.meta``, scans every
    ``div`` section under the ``rdct_0`` table, and for the two labelled
    sections handled here resolves each linked name to a tag id through
    ``Tag.getIdTagFromName``.  Every positive tag id is linked to the object
    with ``ObjectTag.insertNewObjectTag`` and the comma-joined keyword string
    is finally passed to ``self.updateKeyword``.

    The block returns nothing; its results are the calls made on ``Tag``,
    ``ObjectTag`` and ``self.updateKeyword``.
    """
    meta = response.meta
    object_id = meta['id']
    base_keywords = meta['tag']

    hxs = Selector(text=response.body)
    tag = Tag()
    object_tag = ObjectTag()

    # Section label -> (value assigned to tag.patternTypeId, second argument
    # of getIdTagFromName).  NOTE(review): the second value looks like a
    # parent tag id -- confirm against Tag.getIdTagFromName.
    section_rules = {
        u'Phục vụ các món': (7, 19454),
        u'Phù hợp với mục đích': (0, 0),
    }

    names = []
    tag_ids = []
    # Each section is queried relative to its own <div>.  The previous
    # implementation indexed a separately extracted list of <div>s with the
    # position of the label, which picked the wrong section (or raised
    # IndexError) as soon as one <div> had no label or a label had several
    # text nodes.
    for section in hxs.xpath('//div[@class="rdct_0"]/table/tr/td/div'):
        labels = section.xpath('./p[@class="imgtiddtt"]/text()').extract()
        for label in labels:
            rule = section_rules.get(label)
            if rule is None:
                continue
            pattern_type_id, parent_id = rule
            tag.patternTypeId = pattern_type_id
            links = section.xpath(
                './/p[@class="bleftdd_1"]/a/text()').extract()
            for link_text in links:
                name = link_text.strip()
                if name == u'Khác':
                    # The catch-all entry is never stored as a tag.
                    continue
                names.append(name)
                tag_ids.append(tag.getIdTagFromName(name, parent_id))

    # Link the object to every tag that resolved to a positive id; a falsy
    # result (None / 0) is skipped instead of being compared with 0.
    for tag_id in tag_ids:
        if tag_id and tag_id > 0:
            object_tag.insertNewObjectTag(object_id, tag_id)

    # An empty base keyword string no longer produces a leading comma.
    keyword_parts = [base_keywords] if base_keywords else []
    keyword_parts.extend(names)
    self.updateKeyword(object_id, ','.join(keyword_parts))
def parseContent(self, response):
    """Extract the detail fields of one listing page onto ``self``.

    Attributes written when the corresponding markup is present:
    ``phone``, ``website``, ``startTime``, ``endTime`` (both formatted as
    ``%H:%M:%S``), ``tag``, ``description`` and ``districtId``.
    ``listTagId`` is always reset to a fresh list and ``typeId`` is always
    taken from ``response.meta["typeId"]``; ``response.meta["cityId"]`` is
    used for the district lookup.

    NOTE(review): fields whose markup is missing keep whatever value the
    instance already had -- confirm that callers reset them between pages.
    """
    hxs = Selector(text=response.body)

    phones = hxs.css('ul.textsdtdd li::text').extract()
    if phones:
        self.phone = phones[0]

    websites = hxs.css('p.topusc5_0::text').extract()
    if websites:
        self.website = websites[0]

    # Opening hours: the first bold cell is expected to look like
    # "<start> - <end>" with 12-hour clock values.
    bold_cells = hxs.xpath(
        '//div[@class="rdct_0"]/table/tr/td/b/text()').extract()
    if bold_cells and bold_cells[0]:
        hours = bold_cells[0].split('-')
        try:
            opening = datetime.strptime(hours[0].strip(), '%I:%M %p')
            self.startTime = opening.strftime('%H:%M:%S')
            closing = datetime.strptime(hours[1].strip(), '%I:%M %p')
            self.endTime = closing.strftime('%H:%M:%S')
        except (ValueError, IndexError):
            # Best effort: the cell is not always a time range, so an
            # unparsable value leaves the remaining time field(s) untouched.
            pass

    self.listTagId = []
    tag = Tag()

    # Price range row (e.g. "2tr - 10tr"): caption in <p>, value in <b>.
    for table_row in hxs.xpath('//div[@class="rdct_0"]/table/tr'):
        captions = table_row.xpath('.//td/p/text()').extract()
        values = table_row.xpath('.//td/b/text()').extract()
        # Rows with a caption but no bold value used to raise IndexError.
        if not captions or not values:
            continue
        # Substring test instead of ``find(...) > 0`` so that a caption
        # starting with the word is matched as well.
        if u'giá' in captions[0]:
            self.tag = values[0]
            # NOTE(review): 16339 is passed as the second argument of
            # getIdTagFromName; presumably the parent tag id -- confirm.
            self.listTagId.append(
                tag.getIdTagFromName(self.tag, 16339))

    # Amenities section.  Each <div> is queried relative to itself rather
    # than by indexing a separately extracted list with the label position,
    # which could select the wrong section when the two lists differ in
    # length.
    amenities = []
    for section in hxs.xpath('//div[@class="rdct_0"]/table/tr/td/div'):
        labels = section.xpath('./p[@class="imgtiddtt"]/text()').extract()
        for label in labels:
            if label != u'Tiện ích':
                continue
            links = section.xpath(
                './/p[@class="bleftdd_1"]/a/text()').extract()
            for link_text in links:
                name = link_text.strip()
                if name == u'Khác':
                    # The catch-all entry is never stored as a tag.
                    continue
                amenities.append(name)
                self.listTagId.append(tag.getIdTagFromName(name, 16359))

    if amenities:
        # ``self.tag`` may not exist when no price row was found; start from
        # an empty keyword list in that case instead of raising
        # AttributeError (and avoid a leading comma for an empty value).
        current = getattr(self, 'tag', None)
        keyword_parts = [current] if current else []
        keyword_parts.extend(amenities)
        self.tag = ','.join(keyword_parts)

    paragraphs = hxs.xpath(
        '//div[@class="ndungleftdct"]/div[@class="ndleft_0"]/p/text()'
    ).extract()
    if paragraphs:
        self.description = paragraphs[0]

    meta = response.meta
    self.typeId = meta["typeId"]
    cityId = meta["cityId"]

    spans = hxs.xpath(
        '//div[@class="rdct_0"]/p[@class="rdctfollow_0"]/span[@class="rdctfollow_5"]/text()'
    ).extract()
    if len(spans) == 3:
        # The third span holds the district; its first 8 characters are
        # dropped -- presumably a fixed label prefix, TODO confirm.
        district = spans[2][8:].strip()
        city = City()
        self.districtId = city.getIdProvinceFromCity(cityId, district)