def handle(self, datas):
    """Extract a single review record from a parsed page node.

    :param datas: parsed HTML element (lxml-style) expected to contain a
        ``div[@class='item']`` review node — assumed from the xpath call;
        TODO confirm against the caller.
    :returns: dict of review fields (address/name/url/score/title/comment/
        commentid/buytime/commenttime/province/city/useful/reply).
    """
    data = datas.xpath("div[@class='item']")
    address = self.mackining('address', data)
    name = self.mackining('name', data)
    url = self.mackining('url', data)
    score = self.mackining('score', data)
    # Raw string for the regex pattern (original used u'\s*([0-5])\s*',
    # whose backslash escapes are deprecated in Python 3).
    match = re.search(r'\s*([0-5])\s*', score)
    # NOTE(review): falls back to '' (str) when no 0-5 digit is found,
    # so 'score' is int-or-str downstream — kept for compatibility.
    score = int(match.group(1)) if match else ''
    title = self.mackining('title', data)
    comment = self.mackining('comment', data)
    commentid = self.mackining('commentid', data)
    buytime = self.mackining('buytime', data)
    # NOTE(review): int() raises ValueError if these fields are empty or
    # non-numeric — confirm mackining always yields digits here.
    useful = int(self.mackining('useful', data))
    reply = int(self.mackining('reply', data))
    buytime = ProcessData.str_datetime(buytime)
    commenttime = self.mackining('commenttime', data)
    commenttime = ProcessData.str_datetime(commenttime)
    return {
        'address': address,
        'name': name,
        'url': url,
        'score': score,
        'title': title,
        'comment': comment,
        'commentid': commentid,
        'buytime': buytime,
        'commenttime': commenttime,
        # Province is populated from the raw address; city is left blank.
        'province': address,
        'city': '',
        'useful': useful,
        'reply': reply,
    }
def crawl(self):
    """Fetch every appraise (review) page for this goods number and export
    each review as an EcCommentModel.

    Reads ``self.data`` (uuid/source/brand/version/series/is_Bbc/skuID) and
    ``self.key`` (goods number); returns None. Exits early when the page
    count is zero.
    """
    ecid = self.data['uuid']
    goods_no = str(self.key)
    category_data = extract_category(self)
    totalpage = int(self.get_page(goods_no))
    if totalpage == 0:
        return
    # NOTE(review): iterates pages 0..totalpage inclusive — presumably the
    # remote page index is 1-based with page 0 also valid; confirm, since
    # the sibling crawler uses range(totalpage) instead.
    for page in range(totalpage + 1):
        url = self.get_url(goods_no, page)
        # Renamed from `json` to avoid shadowing the stdlib module name.
        payload = ProcessData.get_json_data(url)
        appraise = payload['appraiseArray']
        for item in appraise:
            comment_data = {
                'eid': ecid,  # commodity table foreign key
                'source_id': goods_no,
                'source': self.data.get('source'),
                'comment_id': item['id'],  # review id
                'score': item['appraiseGrade'],  # commodity score
                'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                'user_name': item['appraiseName'],
                'content': item['summary'],
                'brand': self.data['brand'],
                'version': self.data['version'],
                'series': self.data['series'],
                'comment': {
                    'is_Bbc': self.data['is_Bbc'],
                    'skuID': self.data['skuID'],
                },
            }
            comment_data.update(category_data)
            comment_data.update(get_ctime())
            model = EcCommentModel(comment_data)
            export(model)
def crawl(self):
    """Fetch every appraise (review) page for this goods number and export
    each review as an EcCommentModel.

    Reads ``self.data`` (uuid/source) and ``self.key`` (goods number).
    Returns {} when the page count is zero; otherwise returns None.
    """
    ecid = self.data['uuid']
    goods_no = str(self.key)
    category_data = extract_category(self)
    totalpage = int(self.get_page(goods_no))
    if totalpage == 0:
        print("get_page fail")
        return {}
    for page in range(totalpage):
        url = self.get_url(goods_no, page)
        # Renamed from `json` to avoid shadowing the stdlib module name.
        payload = ProcessData.get_json_data(url)
        try:
            appraise = payload['appraiseArray']
        except Exception as e:  # broad on purpose: payload may be None or malformed
            self.logger.error(url)
            self.logger.error(e)
            print("get appraise fail")
            # BUG FIX: previously fell through to the inner loop, which
            # raised NameError on the first page or re-exported the previous
            # page's stale `appraise` on later pages.
            continue
        for item in appraise:
            comment_data = {
                'ecid': ecid,  # commodity table foreign key
                'source_id': goods_no,
                'source': self.data.get('source'),
                'comment_id': item['id'],  # review id
                'score': item['appraiseGrade'],  # commodity score
                'pubtime': ProcessData.str_datetime(item['appraiseTime']),
                'user_id': item['appraiseName'],
                'content': item['summary'],
            }
            comment_data.update(category_data)
            model = EcCommentModel(comment_data)
            export(model)