コード例 #1
0
ファイル: host.py プロジェクト: bopopescu/vinalo
    def getKeyword(self, response):
        """Collect tag names from a detail page and attach them to an object.

        Reads ``id`` and ``tag`` from ``response.meta``.  Every container
        ``div`` under ``div.rdct_0`` whose ``p.imgtiddtt`` heading is one of
        the known section titles contributes the stripped text of its
        ``p.bleftdd_1 > a`` links, except the placeholder entry ``Khác``.

        For each collected name the tag id is looked up through
        ``Tag.getIdTagFromName``; every id greater than zero is linked to the
        object with ``ObjectTag.insertNewObjectTag``.  Finally the incoming
        ``tag`` text, extended with the collected names (comma separated), is
        passed to ``self.updateKeyword``.
        """
        meta = response.meta
        objectId = meta['id']
        txtTag = meta['tag']

        # Section heading -> (value assigned to Tag.patternTypeId, second
        # argument handed to Tag.getIdTagFromName).  The numbers are carried
        # over unchanged; their meaning is defined by the Tag class.
        sections = {
            u'Phục vụ các món': (7, 19454),
            u'Phù hợp với mục đích': (0, 0),
        }

        hxs = Selector(text=response.body)
        tag = Tag()
        objectTag = ObjectTag()
        names = []
        listTagId = []

        # Walk the container divs themselves and read each heading relative
        # to its own div.  Pairing headings and divs by list position breaks
        # (wrong div or IndexError) as soon as one div has no heading.
        for block in hxs.xpath('//div[@class="rdct_0"]/table/tr/td/div'):
            headings = block.xpath('p[@class="imgtiddtt"]/text()').extract()
            for heading in headings:
                if heading not in sections:
                    continue
                patternTypeId, lookupArg = sections[heading]
                # Must be set before the lookups below, as in the original.
                tag.patternTypeId = patternTypeId
                links = block.xpath(
                    './/p[@class="bleftdd_1"]/a/text()').extract()
                for link in links:
                    name = link.strip()
                    if name != u'Khác':
                        names.append(name)
                        listTagId.append(
                            tag.getIdTagFromName(name, lookupArg))

        # Only touch the keyword text when something was collected, so a
        # page without matching sections passes meta['tag'] through as is.
        if names:
            txtTag = ','.join([txtTag] + names)

        # Link every successfully resolved tag id to the object.
        for tagId in listTagId:
            if tagId > 0:
                objectTag.insertNewObjectTag(objectId, tagId)

        self.updateKeyword(objectId, txtTag)
コード例 #2
0
ファイル: host.py プロジェクト: bopopescu/vinalo
    def parseContent(self, response):
        """Extract details from a detail page into attributes of ``self``.

        Reads ``typeId`` and ``cityId`` from ``response.meta``.

        Attributes written (only when the matching markup is present unless
        noted otherwise):

        * ``phone`` / ``website`` -- first text node of the respective
          CSS selection.
        * ``startTime`` / ``endTime`` -- ``HH:MM:SS`` strings parsed from a
          ``"<start> - <end>"`` value in 12-hour ``%I:%M %p`` format.
        * ``listTagId`` -- always reset to a new list, then filled with the
          results of ``Tag.getIdTagFromName``.
        * ``tag`` -- value of the row whose label contains ``giá``, extended
          with the comma separated names of the ``Tiện ích`` section.
        * ``description`` -- first paragraph text of the description block.
        * ``typeId`` -- always taken from ``response.meta``.
        * ``districtId`` -- result of ``City.getIdProvinceFromCity`` when the
          location line has exactly three spans.
        """
        hxs = Selector(text=response.body)

        phones = hxs.css('ul.textsdtdd li::text').extract()
        if phones:
            self.phone = phones[0]

        websites = hxs.css('p.topusc5_0::text').extract()
        if websites:
            self.website = websites[0]

        # The first bold table cell is treated as the opening hours.
        boldCells = hxs.xpath(
            '//div[@class="rdct_0"]/table/tr/td/b/text()').extract()
        if boldCells and boldCells[0]:
            try:
                parts = boldCells[0].split('-')
                opening = datetime.strptime(parts[0].strip(), '%I:%M %p')
                self.startTime = opening.strftime('%H:%M:%S')

                closing = datetime.strptime(parts[1].strip(), '%I:%M %p')
                self.endTime = closing.strftime('%H:%M:%S')
            except (ValueError, IndexError):
                # Best effort: a cell that is not a "<start> - <end>" time
                # range leaves the attributes as they were.
                pass

        self.listTagId = []
        tag = Tag()

        # Price range row, e.g. "2tr - 10tr".
        for row in hxs.xpath('//div[@class="rdct_0"]/table/tr'):
            labels = row.xpath('.//td/p/text()').extract()
            values = row.xpath('.//td/b/text()').extract()
            # Rows without a label or without a bold value carry no price;
            # skipping them avoids an IndexError on values[0].
            if not labels or not values:
                continue
            # "in" also matches a label that starts with the keyword, which
            # the former find(...) > 0 test silently skipped.
            if u'giá' in labels[0]:
                self.tag = values[0]
                self.listTagId.append(
                    tag.getIdTagFromName(self.tag, 16339))

        # Read each heading relative to its own container div; pairing
        # headings and divs by list position breaks (wrong div or
        # IndexError) as soon as one div has no heading.
        for block in hxs.xpath('//div[@class="rdct_0"]/table/tr/td/div'):
            headings = block.xpath('p[@class="imgtiddtt"]/text()').extract()
            if u'Tiện ích' not in headings:
                continue
            links = block.xpath('.//p[@class="bleftdd_1"]/a/text()').extract()
            for link in links:
                name = link.strip()
                if name != u'Khác':
                    # NOTE(review): self.tag is only assigned above when a
                    # price row matched; if it is not initialised elsewhere
                    # this raises AttributeError -- confirm.
                    self.tag += ',' + name
                    self.listTagId.append(
                        tag.getIdTagFromName(name, 16359))

        paragraphs = hxs.xpath(
            '//div[@class="ndungleftdct"]/div[@class="ndleft_0"]/p/text()'
        ).extract()
        if paragraphs:
            self.description = paragraphs[0]

        meta = response.meta
        self.typeId = meta["typeId"]
        cityId = meta["cityId"]

        spans = hxs.xpath(
            '//div[@class="rdct_0"]/p[@class="rdctfollow_0"]/span[@class="rdctfollow_5"]/text()'
        ).extract()
        if len(spans) == 3:
            # The first 8 characters of the third span are dropped before
            # the lookup; presumably a fixed label prefix -- TODO confirm.
            district = spans[2][8:].strip()
            city = City()
            self.districtId = city.getIdProvinceFromCity(cityId, district)