Beispiel #1
0
    def deal_comment(self, response):
        """Attach the comments found in ``response`` to the record and store it.

        ``response.body`` is parsed as JSON and its ``'comments'`` list is read.
        When that list is non-empty, one dict per comment is collected and put
        under ``response.meta['reply_node']``.  Whether or not there were
        comments, ``response.meta`` is handed to ``Save_result`` exactly once.
        """
        raw_comments = json.loads(response.body)['comments']
        record = response.meta

        if raw_comments:
            # Values are looked up in the same order as the keys are listed, and
            # the crawl timestamp is taken once per comment.
            record['reply_node'] = [
                {
                    'content': entry['content'],
                    'id': entry['comment_id'],
                    'publish_user_photo': entry['passport']['img_url'],
                    'publish_user': entry['passport']['nickname'],
                    'publish_user_id': entry['passport']['user_id'],
                    'create_time': entry['create_time'],
                    'spider_time': time.time(),
                }
                for entry in raw_comments
            ]

        Save_result(plantform='chengdu',
                    date_time=record['publish_time'],
                    urlOruid=record['url'],
                    newsidOrtid=record['id'],
                    datatype='news',
                    full_data=record)
Beispiel #2
0
    def deal_comment2(self, response):
        """Handle comments returned by ``apiv2.sohu.com/api/comment/list?`` URLs.

        The body is parsed as plain JSON first.  If that payload carries a
        truthy ``error_code``, a request for the ``topic/load`` endpoint is
        returned instead of processing comments.  If the plain-JSON handling
        raises, the body is treated as JSONP (``callback({...})``) and the text
        between the first ``(`` and the last ``)`` is parsed; if that fails as
        well, the values in ``response.meta`` are passed to ``Save_result`` and
        the method returns.

        Every entry of ``jsonObject.comments`` is appended, as a normalized
        dict, to ``response.meta['data']['reply_nodes']``.

        Returns:
            A ``scrapy.Request`` when the payload reports an error code,
            otherwise ``None``.
        """
        try:
            data_json = json.loads(response.body)
            if data_json['jsonObject']['error_code']:
                # NOTE(review): the fallback URL hard-codes topic_source_id;
                # confirm that this is intended.
                return scrapy.Request(
                    url=
                    'http://apiv2.sohu.com/api/topic/load?page_size=10&topic_source_id=502873239&page_no=1&hot_size=5',
                    meta={
                        'plant_form': 'None',
                        'download_timeout': 3,
                        'isIndex_request': False
                    })
        except Exception:
            try:
                # Slice between the first "(" and the last ")" so that
                # parentheses inside comment text do not truncate the payload.
                body = response.body
                data_json = json.loads(
                    body[body.index('(') + 1:body.rindex(')')])
            except Exception:
                Save_result(plantform='sohu',
                            date_time=response.meta['publish_time'],
                            urlOruid=response.meta['url'],
                            newsidOrtid=response.meta['newsid'],
                            datatype='news',
                            full_data={'data': response.meta['data']})
                return

        print(data_json)
        for comment in data_json['jsonObject']['comments']:
            passport = comment['passport']
            child_node = {
                'publish_user': passport['nickname'],
                'publish_user_id': passport['user_id'],
                'publish_time': comment['create_time'],
                'publish_user_photo': passport['img_url'],
                'content': comment['content'],
                'reply_count': comment['reply_count'],
                'url': response.url,
                'id': comment['comment_id']
            }
            response.meta['data']['reply_nodes'].append(child_node)
Beispiel #3
0
    def deal_comment(self, response):
        """Append the comments in ``response`` to the article and store it.

        ``response.body`` is parsed as JSON; every entry of ``data.comments``
        becomes one dict appended to ``response.meta['data']['reply_nodes']``.
        Afterwards ``response.meta['data']`` is passed to ``Save_result`` and
        the payload's ``data.total`` value is printed.
        """
        # Only this many Toutiao comments can be fetched.
        article = response.meta['data']
        payload = json.loads(response.body)

        for entry in payload['data']['comments']:
            text = entry['text']
            diggs = entry['digg_count']
            created = entry['create_time']
            comment_id = entry['id']
            author = entry['user']
            author_name = author['name']
            author_avatar = author['avatar_url']
            author_id = author['user_id']
            replies = entry['reply_count']
            node = {
                'content': text,
                'like_count': diggs,
                'publish_time': created,
                'id': comment_id,
                'reply_count': replies,
                'publish_user': author_name,
                'publish_user_photo': author_avatar,
                'publish_user_id': author_id,
                'url': response.url,
                'reply_nodes': [],
                # crawl date, taken once per comment
                'spider_time': time.strftime('%Y-%m-%d', time.localtime()),
            }
            article['reply_nodes'].append(node)

        Save_result(plantform='toutiao',
                    date_time=article['publish_time'],
                    urlOruid=article['url'],
                    newsidOrtid=article['id'],
                    datatype='news',
                    full_data=article)

        print(payload['data']['total'])
Beispiel #4
0
    def deal_comment3(self, response):
        """Fallback comment handler for ``topic/load``-style payloads.

        This handler is the last resort: it is only meant to run when none of
        the other comment handlers can process the response.  Three different
        comment source URLs have been observed so far; the one handled here
        looks like
        http://apiv2.sohu.com/api/topic/load?page_size=10&topic_source_id=502873239&page_no=1&hot_size=5

        Every entry of ``jsonObject.comments`` is annotated in place with the
        normalized keys (``like_count``, ``id``, ``publish_time``,
        ``reply_count``, ``publish_user``, ``url``, ``sonid``) and the entries
        are added to ``response.meta['data']['reply_nodes']``.

        Yields:
            A ``scrapy.Request`` for the same URL with ``page_no`` increased by
            one.  Nothing is yielded for an empty page or after an error.

        On any exception the error is printed and the values in
        ``response.meta`` are passed to ``Save_result``.
        """
        try:
            data_json = json.loads(response.body)
            page_comments = data_json['jsonObject']['comments']
            if not page_comments:
                # NOTE(review): nothing is handed to Save_result on this path;
                # confirm that this is intended for an empty page.
                print('no informathion in comment3')
                return

            for comment in page_comments:
                # 'content' is already stored under its final key.
                comment['like_count'] = comment['support_count']  # upvotes
                print(comment['comments'])  # nested replies, if any
                comment['id'] = comment['comment_id']  # comment id
                comment['publish_time'] = comment['create_time']
                comment['reply_count'] = len(comment['comments'])
                comment['publish_user'] = comment['passport']['nickname']
                comment['url'] = response.url
                comment['sonid'] = comment['reply_id']  # id of the parent post

            # Store the comments in the record that the except-branch below
            # saves (the previous code appended the meta dict to itself and
            # dropped the comments).
            record = response.meta['data']
            record['reply_nodes'].extend(page_comments)

            # Bump page_no while keeping every other query parameter, whether
            # or not page_no is the last one.
            prefix, marker, rest = response.url.partition('page_no=')
            page_no, separator, trailing = rest.partition('&')
            url_next_comment = (prefix + marker + str(int(page_no) + 1) +
                                separator + trailing)
            print(url_next_comment)

            next_meta = {
                'data': record,
                'plant_form': 'None',
                'download_timeout': 3,
                'isIndex_request': False
            }
            # The save call below reads these keys from response.meta, so pass
            # them on; presumably the follow-up response is handled by code that
            # needs them as well -- TODO confirm the routing.
            for key in ('publish_time', 'url', 'newsid'):
                if key in response.meta:
                    next_meta[key] = response.meta[key]

            yield scrapy.Request(url=url_next_comment, meta=next_meta)
        except Exception as e:
            print(e)
            Save_result(plantform='sohu',
                        date_time=response.meta['publish_time'],
                        urlOruid=response.meta['url'],
                        newsidOrtid=response.meta['newsid'],
                        datatype='news',
                        full_data={'data': response.meta['data']})
Beispiel #5
0
    def deal_comment(self, response):
        """Collect one page of JSONP-wrapped comments, then page on or save.

        The body is expected to look like ``callback({...})``; the JSON between
        the first ``(`` and the last ``)`` is parsed.  Every entry of its
        ``contentAll`` list is appended to
        ``response.meta['data']['reply_nodes']``; the entries of a comment's
        ``parent`` list become that comment's own ``reply_nodes``.

        Yields:
            A ``scrapy.Request`` for the next ``pid`` when the page holds more
            than nine comments.  Otherwise nothing is yielded and
            ``response.meta['data']`` is passed to ``Save_result``.
        """
        cookies = response.request.cookies or {}
        headers = response.request.headers
        # NOTE(review): this test looks at the *request* headers, while the
        # body below reads Set-Cookie from the *response* headers; confirm
        # which one was intended before changing it.
        if 'Set-Cookie' in headers.keys():
            print(response.headers['Set-Cookie'])
            for headers_key in response.headers.keys():
                if 'Set-Cookie' in headers_key:
                    set_cookie = response.headers[headers_key]
                    cookies_name = set_cookie.split(';')[0].split('=')
                    cookies[cookies_name[0]] = cookies_name[1]
                else:
                    headers[headers_key] = response.headers[headers_key]

        thismeta = response.meta

        # Slice between the first "(" and the last ")" so that parentheses
        # inside comment text do not truncate the JSON payload.
        body = response.body
        datajson = json.loads(body[body.index('(') + 1:body.rindex(')')])

        for one_comment in datajson['contentAll']:
            reply_nodes = []
            for reply_one_node in one_comment['parent']:
                reply_nodes.append({
                    # the reply's own id (previously the parent's id was
                    # stored here by mistake)
                    'id': reply_one_node['commentId'],
                    'publish_user_photo': reply_one_node['userImgUrl'],
                    'publish_user': reply_one_node['nickName'],
                    'publish_time': reply_one_node['commentTime'],
                    'content': reply_one_node['content'],
                    'publish_user_id': reply_one_node['userId'],
                    'like_count': reply_one_node['upAmount'],
                    'url': response.url,
                    'video_urls': reply_one_node['videoUrl'],
                    'reply_nodes': []
                })

            thiscomment = {
                'id': one_comment['commentId'],
                'publish_user_photo': one_comment['userImgUrl'],
                'publish_user': one_comment['nickName'],
                'publish_time': one_comment['commentTime'],
                'content': one_comment['content'],
                'publish_user_id': one_comment['userId'],
                'like_count': one_comment['upAmount'],
                'reply_count': len(one_comment['parent']),
                'url': response.url,
                'video_urls': one_comment['videoUrl'],
                'reply_nodes': reply_nodes
            }
            thismeta['data']['reply_nodes'].append(thiscomment)

        if len(datajson['contentAll']) > 9:
            # Bump pid while keeping any query parameters that follow it.
            prefix, marker, rest = response.url.partition('pid=')
            pid, separator, trailing = rest.partition('&')
            next_url = prefix + marker + str(int(pid) + 1) + separator + trailing
            yield scrapy.Request(url=next_url,
                                 meta=thismeta,
                                 cookies=cookies,
                                 headers=headers)
        else:
            Save_result(plantform='xinhuanet',
                        date_time=response.meta['data']['publish_time'],
                        urlOruid=response.meta['data']['url'],
                        newsidOrtid=response.meta['data']['id'],
                        datatype='news',
                        full_data={'data': thismeta['data']})
Beispiel #6
0
    def commentDeal(self, response):
        """Collect one page of cursor-paged comments, then page on or save.

        The body is decoded as GBK (undecodable bytes are ignored), re-encoded
        as UTF-8 and parsed as JSON.  Every entry of ``data.comments`` is
        appended, as a normalized dict with an empty ``reply_nodes`` list, to
        ``response.meta['data']['reply_nodes']``.  Nested replies are not
        expanded; only their count is recorded (a later requirement dropped the
        nested structure).

        Yields:
            A ``scrapy.Request`` for the same URL with ``preCursor`` set to the
            ``comment_id`` of the last comment on this page.  If that URL
            cannot be built the error is printed and nothing is yielded.

        When the page has no comments, ``response.meta['data']`` is passed to
        ``Save_result`` instead.
        """
        preCommentDict = response.meta['data']
        newsid = response.meta['newsid']

        # handle the encoding
        dataunicode = unicode(response.body, encoding='GBK', errors='ignore')
        datajson = json.loads(dataunicode.encode('utf-8'))
        comments = datajson['data']['comments']

        if comments:
            for comment in comments:
                print(comment['comments'])  # nested replies, if any
                preCommentDict['reply_nodes'].append({
                    'content': comment['content'],
                    'like_count': comment['support_count'],  # upvotes
                    'id': comment['comment_id'],  # comment id
                    'publish_time': comment['create_time'],
                    'reply_count': len(comment['comments']),
                    'publish_user': comment['passport']['nickname'],
                    'url': response.url,
                    'sonid': comment['reply_id'],  # id of the parent post
                    'reply_nodes': []
                })
                preCommentDict['newsid'] = newsid

            try:
                thiscommentLastId = comments[-1]['comment_id']
                print(response.url)
                # e.g. https://m.sohu.com/reply/api/comment/list/cursor?newsId=498936235&pageSize=15&preCursor=0&isLogin=true
                prefix, marker, rest = response.url.partition('preCursor=')
                if not marker:
                    raise ValueError('preCursor= not found in ' + response.url)
                # Replace the cursor value and keep whatever follows it (the
                # old code called the split list like a function here, so the
                # next page was never requested).
                trailing = rest.partition('&')[2]
                commenturlnext = prefix + marker + str(thiscommentLastId)
                if trailing:
                    commenturlnext += '&' + trailing

                next_meta = {
                    'data': preCommentDict,
                    'newsid': newsid,
                    'plant_form': 'None'
                }
                # The save branch below reads these keys from response.meta, so
                # they have to travel with the follow-up request.
                for key in ('publish_time', 'url'):
                    if key in response.meta:
                        next_meta[key] = response.meta[key]

                yield scrapy.Request(url=commenturlnext, meta=next_meta)
            except Exception as e:
                print('%s wrong1' % (e,))

        else:
            # An empty page most likely means everything has been crawled, so
            # hand the collected data over for storage.
            Save_result(plantform='sohu',
                        date_time=response.meta['publish_time'],
                        urlOruid=response.meta['url'],
                        newsidOrtid=response.meta['newsid'],
                        datatype='news',
                        full_data={'data': response.meta['data']})
        print('----------------------------------------')
Beispiel #7
0
    def deal_comment(self, response):
        """Collect one page of replies, then request the next page or save.

        All response headers are first copied onto the request's header object,
        which is reused for the follow-up request.  The body is parsed as JSON:

        * When ``data`` is truthy, each entry of ``data.postreply`` is appended
          as a dict to ``response.meta['reply_nodes']`` and a request for the
          next ``page`` is yielded with ``response.meta`` as its meta.
        * When ``data`` is falsy (request finished), the article fields are
          read from ``response.meta``, wrapped as ``{'data': {...}}`` and passed
          to ``Save_result``.
        """
        request_headers = response.request.headers
        for header_name in response.headers:
            request_headers[header_name] = response.headers[header_name]

        print(response.body)
        print('--------')
        print(response.meta)
        print('--------')

        page = json.loads(response.body)['data']

        if not page:
            # request finished
            meta = response.meta
            field_names = (
                'like_count', 'content', 'id', 'img_urls', 'publish_time',
                'publish_user_id', 'publish_user_photo', 'publish_user',
                'read_count', 'reply_count', 'title', 'reply_nodes', 'url',
                'reproduce_count', 'spider_time',
            )
            resultdict = {'data': {name: meta[name] for name in field_names}}
            # The serialized text is not used afterwards; the call is kept so
            # that a payload json cannot serialize still raises at this point.
            json.dumps(resultdict)
            Save_result(plantform='mycdqq',
                        date_time=meta['publish_time'],
                        urlOruid=meta['url'],
                        newsidOrtid=meta['id'],
                        datatype='news',
                        full_data=resultdict)
            return

        day_format = '%Y-%m-%d'
        for post in page['postreply']:
            text = post['content']
            likes = post['favtimes']
            author_id = post['uid']
            post_id = post['tid']
            author = post['author']
            nickname = author['nickname']
            # may be the "no avatar" placeholder:
            # http://panda.qq.com/static/images/noheadimg.png
            avatar = author['headimgurl']
            crawled_on = time.strftime(day_format, time.localtime())
            published_at = time.localtime(float(post['pubtime'] + '.00'))
            node = {
                'content': text,
                'publish_user_id': author_id,
                'like_count': likes,
                'id': post_id,
                'url': response.url,
                'publish_user': nickname,
                'publish_user_photo': avatar,
                'spider_time': crawled_on,
                'publish_time': time.strftime(day_format, published_at),
            }
            response.meta['reply_nodes'].append(node)
            response.meta['download_timeout'] = 3
            response.meta['isIndex_request'] = True

        url_parts = response.url.split('&page=')
        next_comment_url = '%s&page=%s&sort=time&size=20' % (
            url_parts[0], str(int(url_parts[1].split('&')[0]) + 1))
        # Added on 7-18: the comment URLs all look alike, so running the check
        # on them would cause problems.
        response.meta['plant_form'] = 'None'
        yield scrapy.Request(url=next_comment_url,
                             headers=request_headers,
                             meta=response.meta)