def deal_comment(self, response):
    """Parse the chengdu comment JSON and save the news item together with its comments."""
    datajson = json.loads(response.body)
    datajson_comments = datajson['comments']
    data = response.meta
    comments_data = []
    if not datajson_comments:
        # No comments: save the news item as-is.
        Save_result(plantform='chengdu',
                    date_time=data['publish_time'],
                    urlOruid=data['url'],
                    newsidOrtid=data['id'],
                    datatype='news',
                    full_data=data)
        return
    for someone_comment in datajson_comments:
        content = someone_comment['content']                          # comment text
        id = someone_comment['comment_id']                            # comment id
        publish_user_photo = someone_comment['passport']['img_url']   # commenter avatar
        publish_user = someone_comment['passport']['nickname']        # commenter nickname
        publish_user_id = someone_comment['passport']['user_id']      # commenter id
        create_time = someone_comment['create_time']                  # publish time
        spider_time = time.time()                                     # crawl timestamp
        thiscomments = {
            'content': content,
            'id': id,
            'publish_user_photo': publish_user_photo,
            'publish_user': publish_user,
            'publish_user_id': publish_user_id,
            'create_time': create_time,
            'spider_time': spider_time
        }
        comments_data.append(thiscomments)
    data['reply_node'] = comments_data
    Save_result(plantform='chengdu',
                date_time=data['publish_time'],
                urlOruid=data['url'],
                newsidOrtid=data['id'],
                datatype='news',
                full_data=data)
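# For reference, a minimal sketch (not in the original code) of the comment JSON
# shape that deal_comment above appears to expect; the field names come from the
# keys it accesses, while the sample values are made up for illustration.
_EXAMPLE_CHENGDU_COMMENT = {
    'comment_id': 12345,
    'content': 'example comment text',
    'create_time': 1500000000,
    'passport': {
        'img_url': 'http://example.com/avatar.jpg',
        'nickname': 'example_user',
        'user_id': 67890,
    },
}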
def deal_comment2(self, response):
    # Handle comments returned by URLs like https://apiv2.sohu.com/api/comment/list?
    try:
        data_json = json.loads(response.body)
        if data_json['jsonObject']['error_code']:
            # The API reported an error; fall back to the topic/load endpoint.
            return scrapy.Request(
                url='http://apiv2.sohu.com/api/topic/load?page_size=10&topic_source_id=502873239&page_no=1&hot_size=5',
                meta={
                    'plant_form': 'None',
                    'download_timeout': 3,
                    'isIndex_request': False
                })
    except Exception:
        pass
    try:
        # The body may be JSONP; strip the callback wrapper before parsing.
        data_json = json.loads(response.body.split('(')[1].split(')')[0])
    except Exception:
        # Could not parse the comments; save the news item without them.
        Save_result(plantform='sohu',
                    date_time=response.meta['publish_time'],
                    urlOruid=response.meta['url'],
                    newsidOrtid=response.meta['newsid'],
                    datatype='news',
                    full_data={'data': response.meta['data']})
        return
    print data_json
    for comment in data_json['jsonObject']['comments']:
        publish_user = comment['passport']['nickname']
        publish_user_id = comment['passport']['user_id']
        publish_time = comment['create_time']
        publish_user_photo = comment['passport']['img_url']
        content = comment['content']
        reply_count = comment['reply_count']
        url = response.url
        id = comment['comment_id']
        child_node = {
            'publish_user': publish_user,
            'publish_user_id': publish_user_id,
            'publish_time': publish_time,
            'publish_user_photo': publish_user_photo,
            'content': content,
            'reply_count': reply_count,
            'url': url,
            'id': id
        }
        response.meta['data']['reply_nodes'].append(child_node)
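# A minimal sketch (not in the original code) of a slightly more robust JSONP
# unwrapper than split('(')[1].split(')')[0]: take everything between the first
# '(' and the last ')'. Assumes the body really is callback(...) JSONP.
def _strip_jsonp(body):
    start = body.find('(')
    end = body.rfind(')')
    if start == -1 or end <= start:
        return body  # not wrapped; return unchanged
    return body[start + 1:end]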
def deal_comment(self, response):
    # Toutiao comments: only this many can be fetched.
    thismeta_data = response.meta['data']
    datajson_comment = json.loads(response.body)
    datajson_comment_data_comment = datajson_comment['data']['comments']
    for one_comment in datajson_comment_data_comment:
        content = one_comment['text']                               # comment text
        like_count = one_comment['digg_count']                      # like count
        publish_time = one_comment['create_time']                   # publish time
        id = one_comment['id']                                      # comment id
        publish_user = one_comment['user']['name']                  # commenter nickname
        publish_user_photo = one_comment['user']['avatar_url']      # commenter avatar
        publish_user_id = one_comment['user']['user_id']            # commenter id
        reply_count = one_comment['reply_count']                    # reply count
        url = response.url
        reply_nodes = []
        time_format = '%Y-%m-%d'
        spider_time = time.strftime(time_format, time.localtime())  # crawl date
        one_nodes = {
            'content': content,
            'like_count': like_count,
            'publish_time': publish_time,
            'id': id,
            'reply_count': reply_count,
            'publish_user': publish_user,
            'publish_user_photo': publish_user_photo,
            'publish_user_id': publish_user_id,
            'url': url,
            'reply_nodes': reply_nodes,
            'spider_time': spider_time
        }
        thismeta_data['reply_nodes'].append(one_nodes)
    Save_result(plantform='toutiao',
                date_time=response.meta['data']['publish_time'],
                urlOruid=response.meta['data']['url'],
                newsidOrtid=response.meta['data']['id'],
                datatype='news',
                full_data=response.meta['data'])
    print datajson_comment['data']['total']
def deal_comment3(self, response):
    # Fallback comment handler: used only when none of the other handlers can
    # process the response. Note that three different comment source URLs have
    # been found so far.
    try:
        thiscommentList = []
        data_json = json.loads(response.body)
        if not data_json['jsonObject']['comments']:
            print 'no information in comment3'
            return
        for comment in data_json['jsonObject']['comments']:
            comment['like_count'] = comment['support_count']   # upvote count
            print comment['comments']                          # whether there are child comments
            comment['id'] = comment['comment_id']              # comment id
            comment['publish_time'] = comment['create_time']
            comment['publish_user'] = comment['passport']['nickname']
            comment['url'] = response.url
            comment['sonid'] = comment['reply_id']             # parent post id
            thiscommentList.append(comment)
        data = response.meta
        # The original appended `data` to its own reply_nodes; attach the parsed comments instead.
        data['reply_nodes'].extend(thiscommentList)
        # http://apiv2.sohu.com/api/topic/load?page_size=10&topic_source_id=502873239&page_no=1&hot_size=5
        url_this_comment = response.url.split('page_no=')
        url_next_comment = (url_this_comment[0] + 'page_no=' +
                            str(int(url_this_comment[1].split('&')[0]) + 1) +
                            '&' + url_this_comment[1].split('&')[1])
        print url_next_comment
        yield scrapy.Request(url=url_next_comment,
                             meta={
                                 'data': data,
                                 'plant_form': 'None',
                                 'download_timeout': 3,
                                 'isIndex_request': False
                             })
    except Exception as e:
        print e
        Save_result(plantform='sohu',
                    date_time=response.meta['publish_time'],
                    urlOruid=response.meta['url'],
                    newsidOrtid=response.meta['newsid'],
                    datatype='news',
                    full_data={'data': response.meta['data']})
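# A minimal sketch (not part of the original spiders) of building the next
# comment-page URL with the standard urlparse module instead of string
# splitting; 'page_no' matches the sohu topic/load API used above.
from urlparse import urlparse, urlunparse, parse_qs
from urllib import urlencode

def _next_page_url(url, param='page_no'):
    parts = urlparse(url)
    query = parse_qs(parts.query)
    query[param] = [str(int(query[param][0]) + 1)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))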
def deal_comment(self, response):
    # Carry cookies and headers over to the next comment-page request.
    if response.request.cookies:
        cookies = response.request.cookies
    else:
        cookies = {}
    headers = response.request.headers
    if 'Set-Cookie' in response.headers:
        print response.headers['Set-Cookie']
    for headers_key in response.headers.keys():
        if 'Set-Cookie' in headers_key:
            set_cookie = response.headers[headers_key]
            cookies_name = set_cookie.split(';')[0].split('=')
            cookies[cookies_name[0]] = cookies_name[1]
        else:
            headers[headers_key] = response.headers[headers_key]
    thismeta = response.meta
    # The body is JSONP; strip the callback wrapper before parsing.
    datajson_original = response.body.split('(')[1].split(')')[0]
    datajson = json.loads(datajson_original)
    for one_comment in datajson['contentAll']:
        id = one_comment['commentId']                    # comment id
        publish_user_photo = one_comment['userImgUrl']   # commenter avatar
        publish_user = one_comment['nickName']           # commenter nickname
        publish_time = one_comment['commentTime']        # publish time
        content = one_comment['content']                 # comment text
        publish_user_id = one_comment['userId']          # commenter id
        like_count = one_comment['upAmount']             # like count
        reply_count = len(one_comment['parent'])         # reply count
        url = response.url
        reply_nodes = []
        video_urls = one_comment['videoUrl']             # video urls
        for reply_one_node in one_comment['parent']:
            reply_node_id = reply_one_node['commentId']
            reply_node_publish_user_photo = reply_one_node['userImgUrl']
            reply_node_publish_user = reply_one_node['nickName']
            reply_node_publish_time = reply_one_node['commentTime']
            reply_node_content = reply_one_node['content']
            reply_node_publish_user_id = reply_one_node['userId']
            reply_node_like_count = reply_one_node['upAmount']
            reply_node_url = response.url
            reply_node_video_urls = reply_one_node['videoUrl']
            reply_node_reply_nodes = []
            thisreply_node = {
                'id': reply_node_id,  # original mistakenly used the outer comment's id here
                'publish_user_photo': reply_node_publish_user_photo,
                'publish_user': reply_node_publish_user,
                'publish_time': reply_node_publish_time,
                'content': reply_node_content,
                'publish_user_id': reply_node_publish_user_id,
                'like_count': reply_node_like_count,
                'url': reply_node_url,
                'video_urls': reply_node_video_urls,
                'reply_nodes': reply_node_reply_nodes
            }
            reply_nodes.append(thisreply_node)
        thiscomment = {
            'id': id,
            'publish_user_photo': publish_user_photo,
            'publish_user': publish_user,
            'publish_time': publish_time,
            'content': content,
            'publish_user_id': publish_user_id,
            'like_count': like_count,
            'reply_count': reply_count,
            'url': url,
            'video_urls': video_urls,
            'reply_nodes': reply_nodes
        }
        thismeta['data']['reply_nodes'].append(thiscomment)
    if len(datajson['contentAll']) > 9:
        # A full page of comments: request the next page by bumping the pid parameter.
        thisurl = response.url
        thisurl_split = thisurl.split('pid=')
        next_url = thisurl_split[0] + 'pid=' + str(int(thisurl_split[1]) + 1)
        yield scrapy.Request(url=next_url,
                             meta=thismeta,
                             cookies=cookies,
                             headers=headers)
    else:
        Save_result(plantform='xinhuanet',
                    date_time=response.meta['data']['publish_time'],
                    urlOruid=response.meta['data']['url'],
                    newsidOrtid=response.meta['data']['id'],
                    datatype='news',
                    full_data={'data': thismeta['data']})
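# A minimal sketch (not in the original code) of parsing a Set-Cookie header
# with the standard-library Cookie module instead of split(';')/split('='):
import Cookie

def _cookies_from_set_cookie(set_cookie_value):
    jar = Cookie.SimpleCookie()
    jar.load(set_cookie_value)
    return dict((name, morsel.value) for name, morsel in jar.items())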
def commentDeal(self, response):
    contentdict = {}
    preCommentDict = response.meta['data']
    newsid = response.meta['newsid']
    thiscommentList = []

    # The reply structure here is in reverse order.
    def getOneComment(comment):
        # Flatten one reply into a plain dict.
        thiscommentdict = {}
        thiscommentdict['content'] = comment['content']
        thiscommentdict['like_count'] = comment['support_count']   # upvote count
        print comment['comments']                                   # whether there are child comments
        thiscommentdict['id'] = comment['comment_id']               # comment id
        thiscommentdict['publish_time'] = comment['create_time']
        thiscommentdict['publish_user'] = comment['passport']['nickname']
        thiscommentdict['url'] = response.url
        thiscommentdict['sonid'] = comment['reply_id']              # parent post id
        thiscommentList.append(thiscommentdict)

    # Fix encoding: the body is GBK.
    dataunicode = unicode(response.body, encoding='GBK', errors='ignore')
    dataunicode = dataunicode.encode('utf-8')
    datajson = json.loads(dataunicode)
    if datajson['data']['comments']:
        # Comments are present; if the list were empty, crawling is probably
        # finished and we would fall through to the save branch below.
        for jsoncomments in datajson['data']['comments']:
            getOneComment(jsoncomments)
        if len(thiscommentList) > 1:
            contentdict['data'] = thiscommentList.pop()
            contentdict['data']['reply_nodes'] = []
            contentdict['data']['reply_count'] = len(thiscommentList) + 1
            # A nested reply_nodes structure was built here originally but later dropped per requirements.
        elif len(thiscommentList) == 1:
            # Each pass handles a single comment; they all end up in one list.
            contentdict['data'] = thiscommentList.pop()
            contentdict['data']['reply_nodes'] = []
        else:
            contentdict['data'] = []
        thiscommentList = []
        preCommentDict['reply_nodes'].append(contentdict['data'])
        preCommentDict['newsid'] = newsid
        try:
            # Cursor-style paging: the last comment id becomes the next preCursor.
            thiscommentLastId = datajson['data']['comments'][-1]['comment_id']
            print response.url
            # https://m.sohu.com/reply/api/comment/list/cursor?newsId=498936235&pageSize=15&preCursor=0&isLogin=true
            commenturlnowSplit = response.url.split('preCursor=')
            commenturlnext = (commenturlnowSplit[0] + 'preCursor=' +
                              str(thiscommentLastId) + '&' +
                              commenturlnowSplit[1].split('&')[1])
            yield scrapy.Request(url=commenturlnext,
                                 meta={
                                     'data': preCommentDict,
                                     'newsid': newsid,
                                     'plant_form': 'None'
                                 })
        except Exception as e:
            print e, 'wrong1'
    else:
        # No comments left; save the accumulated result.
        Save_result(plantform='sohu',
                    date_time=response.meta['publish_time'],
                    urlOruid=response.meta['url'],
                    newsidOrtid=response.meta['newsid'],
                    datatype='news',
                    full_data={'data': response.meta['data']})
        print '----------------------------------------'
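# Worked example of the preCursor pagination in commentDeal above, using the
# sample URL from its comment (the cursor value 12345 is made up):
#   in : https://m.sohu.com/reply/api/comment/list/cursor?newsId=498936235&pageSize=15&preCursor=0&isLogin=true
#   out: https://m.sohu.com/reply/api/comment/list/cursor?newsId=498936235&pageSize=15&preCursor=12345&isLogin=true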
def deal_comment(self, response):
    # Reuse the response headers for the follow-up comment-page request.
    headers = response.request.headers
    for i in response.headers:
        headers[i] = response.headers[i]
    print response.body
    print '--------'
    print response.meta
    print '--------'
    json_in_comment = json.loads(response.body)
    json_in_comment_data = json_in_comment['data']
    if json_in_comment_data:
        json_in_comment_data_postreply = json_in_comment_data['postreply']
        for post in json_in_comment_data_postreply:
            content = post['content']                    # comment text
            like_count = post['favtimes']                # like count
            publish_user_id = post['uid']                # commenter id
            id = post['tid']                             # comment id
            url = response.url
            publish_user = post['author']['nickname']    # commenter nickname
            # May be the default no-avatar image: http://panda.qq.com/static/images/noheadimg.png
            publish_user_photo = post['author']['headimgurl']
            time_format = '%Y-%m-%d'
            spider_time = time.strftime(time_format, time.localtime())   # crawl date
            # pubtime is an epoch-seconds string.
            publish_time_stamp_9 = time.localtime(float(post['pubtime'] + '.00'))
            publish_time = time.strftime(time_format, publish_time_stamp_9)
            comment_only_one = {
                'content': content,
                'publish_user_id': publish_user_id,
                'like_count': like_count,
                'id': id,
                'url': url,
                'publish_user': publish_user,
                'publish_user_photo': publish_user_photo,
                'spider_time': spider_time,
                'publish_time': publish_time,
            }
            response.meta['reply_nodes'].append(comment_only_one)
        response.meta['download_timeout'] = 3
        response.meta['isIndex_request'] = True
        # Request the next comment page by bumping the page parameter.
        this_comment_url = response.url.split('&page=')
        next_comment_url = (this_comment_url[0] + '&page=' +
                            str(int(this_comment_url[1].split('&')[0]) + 1) +
                            '&sort=time&size=20')
        # Added on 7-18: the comment URLs all look alike, so duplicate filtering would cause problems.
        response.meta['plant_form'] = 'None'
        yield scrapy.Request(url=next_comment_url, headers=headers, meta=response.meta)
    else:
        # All comment pages fetched: assemble and save the result.
        resultdict = {
            'data': {
                'like_count': response.meta['like_count'],
                'content': response.meta['content'],
                'id': response.meta['id'],
                'img_urls': response.meta['img_urls'],
                'publish_time': response.meta['publish_time'],
                'publish_user_id': response.meta['publish_user_id'],
                'publish_user_photo': response.meta['publish_user_photo'],
                'publish_user': response.meta['publish_user'],
                'read_count': response.meta['read_count'],
                'reply_count': response.meta['reply_count'],
                'title': response.meta['title'],
                'reply_nodes': response.meta['reply_nodes'],
                'url': response.meta['url'],
                'reproduce_count': response.meta['reproduce_count'],
                'spider_time': response.meta['spider_time']
            }
        }
        result_json = json.dumps(resultdict)
        Save_result(plantform='mycdqq',
                    date_time=response.meta['publish_time'],
                    urlOruid=response.meta['url'],
                    newsidOrtid=response.meta['id'],
                    datatype='news',
                    full_data=resultdict)
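# A minimal sketch of the pubtime conversion done in deal_comment above: an
# epoch-seconds value (string or number) formatted as YYYY-MM-DD. Assumes
# pubtime really is in seconds, as the original code does.
import time

def _format_pubtime(pubtime, time_format='%Y-%m-%d'):
    return time.strftime(time_format, time.localtime(float(pubtime)))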