def parse(self, response):
    """Yield one ``Article`` per well-formed entry of a JSON search response.

    The response body is decoded as JSON and its ``'data'`` list is walked.
    An entry that cannot be converted (a missing key, or a value rejected by
    ``formatTime``, ``formatTimestamp`` or ``Article``) is skipped so that one
    bad entry does not abort the whole page. Each skip is reported at DEBUG
    level instead of being discarded silently.

    :param response: response object exposing the JSON body as ``.text``.
    :returns: generator of ``Article`` items, each created with ``offset=0``.
    :raises ValueError: if the body is not valid JSON (from ``json.loads``).
    :raises KeyError: if the decoded document has no ``'data'`` key.
    """
    # Function-scope import: the file's import block is not visible from this
    # block, so the dependency is brought in locally.
    import logging

    log = logging.getLogger(__name__)

    for item in json.loads(response.text)['data']:
        try:
            article = Article(
                group_id=item['group_id'],
                comment_count=item['comment_count'],
                title=item['title'],
                article_url=item['article_url'],
                offset=0,
                media_name=item['media_name'],
                datetime=formatTime(item['datetime']),
                abstract=item['abstract'],
                publish_time=formatTimestamp(item['publish_time']),
                behot_time=formatTimestamp(item['behot_time']),
            )
        except Exception as exc:
            # Deliberately broad: skipping is best-effort, as in the original,
            # but the reason is now recorded rather than lost.
            log.debug("Skipping search result that could not be parsed: %r", exc)
            continue
        # Yield outside the ``try`` so that an exception thrown into the
        # generator by its consumer is not swallowed by the handler above.
        yield article
def parse(self, response):
    """Yield one ``Article`` per well-formed search entry and flag the cutoff.

    The response body is decoded as JSON and its ``'data'`` list is walked.
    For every entry whose formatted ``'datetime'`` compares less than
    ``self.lastTime``, both values are printed and ``self.runFlag`` is set to
    ``False``. An entry whose fields cannot be extracted is skipped, and the
    skip is reported at DEBUG level instead of being discarded silently.

    NOTE(review): an entry older than ``self.lastTime`` is still yielded after
    the flag is cleared. That matches the original behaviour; confirm it is
    intended.

    :param response: response object exposing the JSON body as ``.text``.
    :returns: generator of ``Article`` items, each created with ``offset=0``.
    :raises ValueError: if the body is not valid JSON (from ``json.loads``).
    :raises KeyError: if the decoded document has no ``'data'`` key.
    """
    # Function-scope import: the file's import block is not visible from this
    # block, so the dependency is brought in locally.
    import logging

    log = logging.getLogger(__name__)

    for item in json.loads(response.text)['data']:
        try:
            group_id = item['group_id']
            comment_count = item['comment_count']
            title = item['title']
            article_url = item['article_url']
            media_name = item['media_name']
            # Named ``published`` so the local does not shadow ``datetime``.
            published = formatTime(item['datetime'])
            abstract = item['abstract']
            publish_time = formatTimestamp(item['publish_time'])
            behot_time = formatTimestamp(item['behot_time'])
            if published < self.lastTime:
                # Single-argument print calls emit the same text under both
                # Python 2 and Python 3.
                print("datetime: %s" % (published,))
                print("lastTime: %s" % (self.lastTime,))
                self.runFlag = False
        except Exception as exc:
            # Deliberately broad: skipping is best-effort, as in the original,
            # but the reason is now recorded rather than lost.
            log.debug("Skipping search result that could not be parsed: %r", exc)
            continue
        # Item construction stays outside the ``try``, as in the original, so
        # errors raised by ``Article`` itself still propagate.
        yield Article(group_id=group_id, comment_count=comment_count,
                      title=title, article_url=article_url, offset=0,
                      media_name=media_name, datetime=published,
                      abstract=abstract, publish_time=publish_time,
                      behot_time=behot_time)
def parse(self, response):
    """Yield a ``Comment`` for every entry of a JSON comment-list response.

    ``article_url`` and ``title`` are read from ``response.meta`` and copied
    onto each yielded item. When the decoded ``'data'`` list holds fewer than
    20 entries, a record with its length, the article URL, the title and
    ``response.meta['url']`` is appended to ``self.urls``.

    :param response: response object exposing ``.text`` (JSON body) and
        ``.meta`` (mapping with ``'article_url'``, ``'title'`` and, for short
        pages, ``'url'``).
    :returns: generator of ``Comment`` items.
    """
    entries = json.loads(response.text)['data']
    article_url = response.meta['article_url']
    title = response.meta['title']

    if len(entries) < 20:
        page_url = response.meta['url']
        self.urls.append({
            'len': len(entries),
            'article_url': article_url,
            'title': title,
            'url': page_url,
        })

    for entry in entries:
        comment = entry['comment']
        comment_id = comment['id']
        replies = comment['reply_count']
        diggs = comment['digg_count']
        created = formatTimestamp(comment['create_time'])
        score = comment['score']
        author_id = comment['user_id']
        author_name = comment['user_name']
        body = comment['text']
        yield Comment(user_name=author_name, text=body,
                      article_url=article_url, title=title, id=comment_id,
                      reply_count=replies, digg_count=diggs,
                      create_time=created, score=score, user_id=author_id)
def parse(self, response):
    """Yield a ``CommentReply`` for every entry of a JSON reply-list response.

    The parent comment's ``'id'``, ``'title'`` and ``'text'`` are taken from
    ``response.meta`` and attached to each reply. The replies themselves are
    read from the nested ``['data']['data']`` list of the decoded body.

    :param response: response object exposing ``.text`` (JSON body) and
        ``.meta`` (mapping with ``'id'``, ``'title'`` and ``'text'``).
    :returns: generator of ``CommentReply`` items.
    """
    meta = response.meta
    parent_id = meta['id']
    parent_title = meta['title']
    parent_text = meta['text']
    replies = json.loads(response.text)['data']['data']

    for reply in replies:
        diggs = reply['digg_count']
        body = reply['content']
        created = formatTimestamp(reply['create_time'])
        author = reply['user']['name']
        reply_id = reply['id']
        yield CommentReply(comment_id=parent_id, text=parent_text,
                           digg_count=diggs, content=body,
                           create_time=created, name=author, id=reply_id,
                           title=parent_title)