Example #1
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''
        commentList = list()
        comment_url = 'http://120.55.151.61/Treehole/V4/Message/getDetail.action'
        data = 'platform=1&phoneBrand=HONOR&phoneVersion=24&versionNumber=9.2.0&phoneModel=BLN-AL10&channel=huaweiMarket&plateId=1&messageId=' + str(
            article.tid) + '&'
        data_comment = self.s.post(url=comment_url,
                                   data=data,
                                   headers=self.headers)
        data0 = data_comment.text
        # Parse the JSON response directly instead of eval()-ing it and
        # round-tripping through json.dumps/simplejson.loads.
        data_dict0 = json.loads(data0)
        try:
            comment_list = data_dict0['data']['commentListBO']['commentBOs']
        except:
            return (commentList, False)
        if len(comment_list) == 0:
            return (commentList, False)

        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        for comment in comment_list:
            cid = comment['commentId']
            commentTimeStamp = comment['commentTime'] / 1000
            datetime_struct = datetime.datetime.fromtimestamp(commentTimeStamp)
            commentTime = datetime_struct.strftime('%Y-%m-%d %H:%M:%S')
            commentContent = comment['content']
            user_id = comment['student']['id']
            user_name = comment['student']['nickName']

            location_coutry = 'CN'

            commentList.append(
                Comment(
                    article.tid,
                    self.channel.channel_id,
                    cid,
                    add_datetime,
                    commentTime,
                    None,
                    location_coutry,
                    None,
                    None,  # ip_address is not implemented yet
                    user_id,
                    user_name,
                    commentContent,
                    None,
                    None,
                    None,
                    None,
                    dislike_count=None))

        return (commentList, False)
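Every example on this page follows the same contract described in the docstrings: crawlComment(article) returns a (commentList, hasnext) tuple. A minimal, illustrative driver could consume that contract as sketched below; the crawler and article objects and the helper name collect_all_comments are assumptions, not part of the examples.

def collect_all_comments(crawler, article, max_batches=100):
    # Keep calling crawlComment() until hasnext is False or a batch limit is reached.
    all_comments = []
    for _ in range(max_batches):
        comment_list, hasnext = crawler.crawlComment(article)
        all_comments.extend(comment_list)
        if not hasnext:
            break
    return all_comments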
Example #2
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''
        html = self.session.download(article.url, encoding='utf-8', data=False, timeout=10, retry=3, addr=True)
        channel = re.findall(r"channel: '(.*)',", html['html'])[0]
        newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        commentList = list()
        page = 1
        self.logger.info(article.url)
        try:
            while page < 30:
                data ={
                    'channel': channel,
                    'newsid': newsid,
                    'group': '',
                    'compress': '1',
                    'ie': 'gbk',
                    'oe': 'gbk',
                    'page': page,
                    'page_size': '20'
                }
                re_url = 'http://comment5.news.sina.com.cn/page/info'
                html1 = self.session.download(url=re_url, encoding='utf-8', data=data, timeout=10, retry=3, addr=True)
                html1 = html1["html"]
                html1 = re.sub(r'(.*=)\{', '{', html1)
                html1 = json.loads(html1)
                totalcount = html1['result']['count']['show']
                if totalcount == 0:
                    break
                cmntlist = html1["result"]["cmntlist"]
                for i in cmntlist:
                    cid = i["mid"]
                    user_id = i["uid"]
                    user_name = i["nick"]
                    user_ip = i["ip"]
                    publish_datetime = i["time"]
                    like_count = i["agree"]
                    content = i["content"]
                    commentList.append(Comment(article.tid, self.channel.channel_id, cid,add_datetime, publish_datetime, user_ip, None, None, None,user_id, user_name,content,None, None, like_count, None, None))

                totalpage = math.ceil(totalcount / 20.0)

                if totalpage < page:
                    break
                page = page + 1
        except:
            self.logger.error('Fail to parse comment: %s', traceback.format_exc())
        finally:
            return (commentList, False)
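The Sina comment endpoint used above returns a JSONP-style payload ("var data={...}"), which the example strips with re.sub before json.loads. A standalone sketch of that unwrapping step, using an invented sample payload, might look like this:

import json
import re

def unwrap_jsonp(payload):
    # Drop everything up to the opening brace of the assignment-style wrapper,
    # mirroring the re.sub(r'(.*=)\{', '{', html1) call in the example above.
    return json.loads(re.sub(r'(.*=)\{', '{', payload, count=1))

sample = 'var data={"result": {"count": {"show": 40}, "cmntlist": []}}'
print(unwrap_jsonp(sample)['result']['count']['show'])  # -> 40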
Example #3
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''
        #self.logger.debug('Article:%s', article)
        comment_url = 'http://comment5.news.sina.com.cn/page/info?channel=blog&newsid='
        comment_url = comment_url + article.meta_info
        self.logger.error(comment_url)
        html = self.session.get(comment_url)
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        commentList = list()
        main_comment = json.loads(html)['result']['cmntlist']
        print '8989'
        if len(main_comment) == 0:
            print '12212'
            return (commentList, False)

        for comment in main_comment:
            cid = comment['mid']
            publish_datetime = comment['time']
            user_id = comment['uid']
            user_name = comment['nick']
            content = comment['content']
            location = ','.join(jieba.cut(comment['area']))
            location_region = location[:location.find(',')]
            location_city = location[location.find(',') + 1:]
            print location_city
            location_coutry = 'CN'

            commentList.append(
                Comment(article.tid,
                        self.channel.channel_id,
                        cid,
                        add_datetime,
                        publish_datetime,
                        None,
                        location_coutry,
                        location_region,
                        location_city,
                        user_id,
                        user_name,
                        content,
                        None,
                        None,
                        None,
                        None,
                        dislike_count=None))

        return (commentList, False)  # during testing, article[0][222].content retrieves the content of comment #222
Example #4
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''
        #self.logger.debug('Article:%s', article)

        html = self.session.get(article.url, headers=self.headers)
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        commentList = list()
        main_comment = BeautifulSoup(html,'lxml')
        try:
            commentList_html = main_comment.find('ul',attrs={'id':"comments"})
            commentList_html_li = commentList_html.findAll('li')
        except:
            self.logger.error(u'This page has been deleted or has an unexpected format: %s', article.url)
            return (commentList, False)

        for comment in commentList_html_li:
            cid = comment.attrs['data-cid']
            comment_main_wrap = comment.find('div',attrs={'class':"reply-doc content"})
            user_id = comment_main_wrap.find('div',attrs={'class':"operation_div"}).attrs['id']
            user_info_wrap = comment_main_wrap.find('div',attrs={'class':"bg-img-green"})
            user_name = user_info_wrap.find('a').text.strip()
            publish_datetime = user_info_wrap.find('span').text.strip()
            content = comment_main_wrap.find('p').text.strip()
            reply_user_wrap = comment_main_wrap.find('div',attrs={'class':"reply-quote"})
            if reply_user_wrap:
                reply_userid_wrap=reply_user_wrap.find('span',attrs={'class':"pubdate"})
                reply_userid = reply_userid_wrap.find('a').attrs['href']
                reply_userid = re.sub(r'\D', "", reply_userid)
            else:
                reply_userid = ''
            # like_count_wrap = comment_main_wrap.find('div',attrs={'class':"operation_div"})
            # like_count = like_count_wrap.findAll('a')[1].text
            # like_count = re.sub(r'\D', "", like_count)  # the like count is hard to get; it is not embedded in the page
            like_count = None
            reply_count = None
            location_coutry = 'CN'

            commentList.append(Comment(article.tid,  self.channel.channel_id, cid,
                                   add_datetime, publish_datetime,
                                    None,location_coutry,None,None,
                                   user_id, user_name, content, reply_userid,
                                   None, like_count, reply_count, dislike_count=None
                                   ))

        return (commentList, False)  # during testing, article[0][222].content retrieves the content of comment #222
Example #5
File: zhihu1.py  Project: magicgis/outfile
 def crawlComment(self, article):
     '''
     Crawl the comments of the given article and return the comment list.
     @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
     '''
     if self.nextCommentUrl is None:
         curl = COMMENT_URL_TEMPLATE % (article.tid, COMMENT_PAGE_SIZE, 0)
         curl = curl.replace('#', '%')
     else:
         curl = self.nextCommentUrl
     self.session.randomSleep()
     result = self.session.get(curl, headers=CRAWL_COMMENT_HEADERS)
     jo = json.loads(result)
     paging = jo['paging']
     hasnext = not paging['is_end']
     self.nextCommentUrl = paging['next']
     dataList = jo['data']
     add_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     commentList = list()
     for data in dataList:
         #self.logger.debug('[Zhihu]Comment data keys:%s', data.keys())
         #self.logger.debug('[ZHIHU]Comment url for %s:%s', article.title, data['url'])
         publish_datetime = time.strftime(
             '%Y-%m-%d %H:%M:%S', time.localtime(data['updated_time']))
         comment = Comment(article.tid,
                           article.channel_id,
                           data['id'],
                           add_datetime,
                           publish_datetime,
                           ip_address=None,
                           location_country=None,
                           location_region=None,
                           location_city=None,
                           author_id=data['author']['id'],
                           author_name=data['author']['name'],
                           content=data['content'],
                           reply_author_id=None,
                           read_count=None,
                           like_count=data['voteup_count'],
                           reply_count=data['comment_count'],
                           dislike_count=None)
         commentList.append(comment)
     return (commentList, hasnext)
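Example #5 pages through Zhihu comments by following paging['next'] until paging['is_end'] is true. A self-contained sketch of that cursor-style loop, using requests in place of the crawler's session (the response field names come from the example; the starting URL is whatever COMMENT_URL_TEMPLATE expands to), could be:

import requests

def iter_zhihu_comments(first_url, headers=None):
    # Yield raw comment dicts, following the 'next' cursor until 'is_end' is reported.
    url = first_url
    while url:
        jo = requests.get(url, headers=headers).json()
        for data in jo.get('data', []):
            yield data
        paging = jo.get('paging', {})
        if paging.get('is_end', True):
            break
        url = paging.get('next')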
Example #6
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''
        # return (list(), False)
        commentList = list()

        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'weibo.com',
            'Referer': '%s' % article.url[:article.url.find('?') + 1],
            'refer_flag': '1001030103_&type=comment',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
        }
        data1 = {'ajwvr': '6', 'id': '%s' % article.tid, 'from': 'singleWeiBo'}

        re_url = 'https://weibo.com/aj/v6/comment/big'

        html = self.session.get(re_url,
                                params=data1,
                                headers=headers,
                                cookies=self.__load_cookies_from_lwp(
                                    self.user_name))

        jo = json.loads(html)
        data = jo['data']['html']
        # self.logger.error( data)
        soup = BeautifulSoup(data, 'lxml')
        if soup.find('div', {'class': "WB_empty"}) is None:

            commentbox = soup.find('div', {'class': "list_box"})
            root_commentlist = commentbox.find('div',
                                               {'node-type': "comment_list"})
            root_commentlist2 = root_commentlist.find_all(
                'div', {'node-type': "root_comment"})
            count_comment = 1
            self.logger.debug('root_commentlist:%d   %s',
                              len(root_commentlist2), article.url)
            for root_comment in root_commentlist2:

                if count_comment > 20:
                    break  # only crawl the first 20 comments
                self.logger.error('count_comment:%d', count_comment)
                comment_id = root_comment.attrs['comment_id']  # id of the top-level comment
                list_con = root_comment.find('div', {'class': "list_con"})
                firstcomentwrap = list_con.find('div', {'class': "WB_text"})
                firstcoment = firstcomentwrap.text.strip()  # content of the top-level comment

                useridwrap = firstcomentwrap.find('a')
                user_id = useridwrap.attrs['usercard']  # id of the top-level commenter
                user_id = re.findall(r'id=(\d+)', user_id)[0]

                user_name = useridwrap.text  # name of the top-level commenter
                if not user_name:
                    user_name = ' '
                publish_timediv = list_con.find('div',
                                                {'class': "WB_func clearfix"})
                try:
                    publish_time = self.parseDateTime(
                        publish_timediv.findAll('div')[1].text)  # publish time of the top-level comment
                except:
                    continue
                # self.logger.error(publish_time)
                # if publish_time < datetime.datetime.now()-datetime.timedelta(hours=48):
                #     break

                like_count_div = publish_timediv.findAll('div')[0]
                try:
                    like_count_li = like_count_div.findAll('li')[3]
                    like_count = like_count_li.findAll('em')[1].text
                except:
                    like_count = 0
                if u'赞' == like_count:
                    like_count = 0

                commentList.append(
                    Comment(article.tid,
                            self.channel.channel_id,
                            comment_id,
                            add_datetime,
                            publish_time,
                            None,
                            None,
                            None,
                            None,
                            user_id,
                            user_name,
                            firstcoment,
                            None,
                            None,
                            like_count,
                            None,
                            dislike_count=None))
                count_comment += 1  # comment counter

        return (commentList, False)
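The Weibo endpoint in Example #6 returns JSON whose data['html'] field is an HTML fragment, so the comment fields have to be scraped out of it; the numeric user id, for instance, lives in the anchor's usercard attribute. A tiny standalone sketch of that extraction (the sample fragment is invented for illustration):

import re
from bs4 import BeautifulSoup

fragment = '<div class="WB_text"><a usercard="id=1234567890">some user</a>: nice post</div>'
anchor = BeautifulSoup(fragment, 'html.parser').find('a')
user_id = re.findall(r'id=(\d+)', anchor.attrs['usercard'])[0]
user_name = anchor.text.strip()
print(user_id + ' ' + user_name)  # 1234567890 some user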
Example #7
File: zhihu.py  Project: magicgis/outfile
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        :param article: the article whose comments should be crawled
        :return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''
        try:
            commentList = list()
            count = 0
            if (article.url.find('question') != -1):
                if self.nextCommentUrl is None:
                    curl = COMMENT_URL_TEMPLATE % (article.tid,
                                                   COMMENT_PAGE_SIZE, 0)
                    curl = curl.replace('#', '%')
                else:
                    curl = self.nextCommentUrl
                self.lastCrawlTime = time.time()
                self.randomSleep()
                result = self.session.get(curl, headers=CRAWL_COMMENT_HEADERS)
                jo = json.loads(result.text)
                paging = jo['paging']
                hasnext = not paging['is_end']
                self.nextCommentUrl = paging['next']
                dataList = jo['data']
                add_datetime = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                for data in dataList:
                    publish_datetime = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(data['updated_time']))
                    dr = re.compile(r'<[^>]+>', re.S)
                    content = r"'%s'" % (str(dr.sub('', data['content'])))
                    comment = Comment(article.tid,
                                      article.channel_id,
                                      data['id'],
                                      add_datetime,
                                      publish_datetime,
                                      ip_address=None,
                                      location_country=None,
                                      location_region=None,
                                      location_city=None,
                                      author_id=data['author']['id'],
                                      author_name=data['author']['name'],
                                      content=content,
                                      reply_author_id=None,
                                      read_count=None,
                                      like_count=data['voteup_count'],
                                      reply_count=data['comment_count'],
                                      dislike_count=None)
                    count += 1
                    print count
                    commentList.append(comment)
                    if count > 500:
                        print count
                        hasnext = False
                        break
                return (commentList, hasnext)

            if (article.url.find('zhuanlan') != -1):
                offset = 0

                comment_url = 'https://zhuanlan.zhihu.com/api/posts/%s/comments?limit=10&offset=%d' % (
                    article.tid, offset)
                self.lastCrawlTime = time.time()
                self.randomSleep()
                response = self.session.get(comment_url,
                                            headers=CRAWL_COMMENT_HEADERS)
                if (response.text == '[]' or count > 500):
                    return (list(), False)
                dataList = json.loads(response.text)
                add_datetime = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                for data in dataList:
                    publish_datetime = str(data['createdTime'])[0:19].replace(
                        'T', ' ')
                    content = r"'%s'" % str(data['content'])
                    comment = Comment(article.tid,
                                      article.channel_id,
                                      data['id'],
                                      add_datetime,
                                      publish_datetime,
                                      ip_address=None,
                                      location_country=None,
                                      location_region=None,
                                      location_city=None,
                                      author_id=data['author']['uid'],
                                      author_name=data['author']['name'],
                                      content=content,
                                      reply_author_id=None,
                                      read_count=None,
                                      like_count=data['likesCount'],
                                      reply_count=None,
                                      dislike_count=data['dislikesCount'])
                    commentList.append(comment)
                    print count
                    count += 1
                    if count > 500:
                        print count
                        break
                offset += 10
                return (commentList, False)
            # Fallback when the url is neither a question nor a zhuanlan post.
            return (commentList, False)
        except:
            self.logger.debug(traceback.format_exc())
            return (list(), False)  # do not return None; sacrawler.py would raise "'NoneType' object has no len()"
Example #8
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''
        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        page = 1
        while page <= 30:
            data = {'pn': page}
            html = self.session.download(article.url,
                                         encoding='utf-8',
                                         data=data,
                                         isJson=False,
                                         timeout=10,
                                         retry=3,
                                         addr=True)
            article_url = article.url
            # print article_url
            soup = BeautifulSoup(html['html'])

            try:
                main = soup.find('div', attrs={'class': "left_section"})
                main = main.find('div', attrs={'id': "j_p_postlist"})
            except:
                self.logger.warn(u'Sorry, this post has been deleted. %s', article_url)
                return (commentList, False)

            sectionsite = main.find_all('div', attrs={'class': "l_post"})
            # self.logger.error(len(sectionsite))

            index = 0

            if main:

                com_all = main.find_all('div', attrs={'data-field': True})

                for i in sectionsite[2:]:
                    # self.logger.warn(i)

                    index = index + 1

                    if com_all[index].attrs['data-field']:

                        try:
                            data_field = i.attrs['data-field'].strip()
                        except:
                            self.logger.error(u'data-field attribute not found')
                            self.logger.error(article_url)
                            continue

                        data_field = json.loads(data_field)
                        if 'content' in data_field.keys():
                            # self.logger.warn(u'this should not fail here (2)')
                            cid = data_field['content']['post_id']
                            user_id = data_field['author']['user_id']
                            user_name = data_field['author']['user_name']
                            # user_ip = ''
                            # ip_address = ''
                            # user_head = ''
                            if 'date' in data_field['content'].keys():
                                # self.logger.warn(u'no problem here %s', article_url)
                                cpublish_datetime = data_field['content']['date']
                            else:
                                # self.logger.warn(u'problem case %s', article_url)
                                cpublish_datetime = i.findAll('span')
                                cpublish_datetime = cpublish_datetime[-1].text.strip()
                                if u'广告' in cpublish_datetime:  # skip advertisement entries
                                    continue

                            # Group 4 is 'HH:MM' or 'HH:MM:SS'; group 5 holds the optional ':SS' part.
                            cpublish_datetime = re.findall(
                                r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)',
                                cpublish_datetime)[0]

                            date_part = '-'.join(cpublish_datetime[0:3])
                            time_part = cpublish_datetime[3]
                            if len(cpublish_datetime[4]) <= 1:
                                time_part += ':00'  # pad missing seconds
                            cpublish_datetime = date_part + ' ' + time_part
                            # reply_userid = ''
                            # like_count =
                            # unlike_count = -1
                            # read_count = -1
                            reply_count = data_field['content']['comment_num']
                            source_url = article_url
                            content = i.find('cc').text.strip()
                            location_coutry = 'CN'
                            # channeltype = 'tieba'
                            # channel = self.site
                            # heat = 0

                            commentList.append(
                                Comment(article.tid,
                                        self.channel.channel_id,
                                        cid,
                                        add_datetime,
                                        cpublish_datetime,
                                        None,
                                        location_coutry,
                                        None,
                                        None,
                                        user_id,
                                        user_name,
                                        content,
                                        None,
                                        None,
                                        None,
                                        reply_count,
                                        dislike_count=None))

            # pagination control
            pages = soup.find('li', attrs={'class': "l_pager"}).find_all('a')
            pageList = []
            for p in pages:
                pa = p.text.strip()
                pageList.append(pa)
            if str(page + 1) in pageList:
                page += 1
            else:
                break
        return (commentList, False)
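The floor timestamps scraped in Example #8 come back as either 'YYYY-MM-DD HH:MM' or 'YYYY-MM-DD HH:MM:SS' (with varying separators), which is why the code normalizes them with a regex before storing. A standalone version of that normalization, with the same pattern and made-up sample inputs, might be:

import re

_DT_RE = re.compile(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)')

def normalize_datetime(text):
    # Return 'YYYY-MM-DD HH:MM:SS', padding the seconds when they are missing.
    y, mo, d, hms, secs = _DT_RE.findall(text)[0]
    if len(secs) <= 1:
        hms += ':00'
    return '%s-%s-%s %s' % (y, mo, d, hms)

print(normalize_datetime('2017-04-15 09:30'))     # 2017-04-15 09:30:00
print(normalize_datetime('2017/04/15 09:30:12'))  # 2017-04-15 09:30:12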
Example #9
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''

        cookies = '_HY_lvt_c5928ed57343443aa175434a09ea2804=1492582419784; _HY_CTK_c5928ed57343443aa175434a09ea2804=0acc9a79b115c4ca1931c41e303bec28; BAIDU_SSP_lcr=https://www.baidu.com/link?url=NszYD2w_HgkPWqrzDQ3WKApYldw_9MpUVun9r-R09M7r0dh09MUwTHzG087WaJrhBwMCY-7pDfds4xjtWArRf2xh01DHOWWWd9DpBnHwZ03&wd=&eqid=84d5edfe0003dbdd0000000658f6c280; ALLYESID4=0DB901C6E627D980; sso_c=0; wdcid=5838509dcecc0a53; _people_ip_new_code=510000; UM_distinctid=15b802cdbd364d-02e7f218c26ae6-4e45042e-100200-15b802cdbd49ce; wdses=62d3f0f698d07532; sfr=1; CNZZDATA1260954200=1457761991-1492499618-null%7C1492580619; CNZZDATA1260954203=33096124-1492503288-null%7C1492578888; CNZZDATA1256327855=1768205365-1492503342-null%7C1492578947; wdlast=1492582420'
        html = self.session.download(article.url,
                                     encoding='gbk',
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     cookies=cookies)
        article_url = article.url

        soup = BeautifulSoup(html['html'], 'html.parser')
        try:
            sid = soup.find('meta', attrs={
                'name': "contentid"
            }).attrs['content']
        except:
            return (list(), False)
        sid = re.sub(r'\D', '', sid)
        bbs = 'http://bbs1.people.com.cn/postLink.do?nid=' + sid
        # bbs = soup.find('div', attrs={'class': "message"})
        # if bbs:
        # bbs = bbs.find('a')
        # if bbs:
        # bbs = bbs.attrs['href']
        # else:
        # bbs = 'http://bbs1.people.com.cn/postLink.do?nid='
        # print bbs
        # else:
        # return None

        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        html1 = self.session.download(bbs,
                                      encoding='gbk',
                                      data=None,
                                      isJson=False,
                                      timeout=10,
                                      retry=3)
        soup1 = BeautifulSoup(html1, 'html.parser')
        id = soup1.find('meta', attrs={'name': "contentid"})
        if id:
            id = id.attrs['content']
            id = re.sub(r'\D', '', id)
            re_url = 'http://bbs1.people.com.cn/api/postApi.do'
            page = 1
            while page < 30:
                data1 = {
                    'action': 'postDetailByParentId',
                    'replayPostId': id,
                    'pageNo': page
                }
                html2 = self.session.download(re_url,
                                              encoding='utf-8',
                                              data=data1,
                                              isJson=False,
                                              timeout=10,
                                              retry=3)
                html2 = re.sub(r'\\\\\\', '', html2)
                html2 = re.sub(r'"\[\\"', '[', html2)
                html2 = re.sub(r'\\"\]"', ']', html2)
                html2 = re.sub(r'\\",\\"', ',', html2)
                html2 = json.loads(html2)
                totalCount = html2['totalCount']
                if totalCount == 0:
                    break
                replayPosts = html2['replayPosts']
                if replayPosts:
                    for i in replayPosts:
                        cid = i['id']
                        user_id = i['userId']
                        user_name = i['userNick']
                        user_ip = i['userIP']
                        # ip_address = get_ip_address(str(user_ip))
                        # ip_address = ''
                        user_head = ''
                        publish_datetime = time.strftime(
                            '%Y-%m-%d %H:%M:%S',
                            time.localtime(float(i['createTime']) / 1000))
                        reply_userid = i['parentId']
                        like_count = i['vote_yes']
                        unlike_count = i['vote_no']
                        read_count = i['readCount']
                        reply_count = i['replyCount']
                        source_url = article_url
                        content = i['contentText']
                        heat = 0
                        location_coutry = 'CN'
                        # print cid, user_id, user_name, user_ip, ip_address, user_head, publish_datetime, reply_userid
                        # print like_count,unlike_count,read_count,reply_count,source_url
                        commentList.append(
                            Comment(
                                article.tid,
                                self.channel.channel_id,
                                cid,
                                add_datetime,
                                publish_datetime,
                                user_ip,
                                location_coutry,
                                None,
                                None,  # ip_address is not implemented yet
                                user_id,
                                user_name,
                                content,
                                reply_userid,
                                None,
                                like_count,
                                reply_count,
                                dislike_count=None))
                    pageCount = html2['pageCount']  # total number of comment pages
                    if pageCount == page:
                        break
                    page = page + 1  # move to the next comment page
                else:
                    break
        return (commentList, False)
Example #10
    def crawlComment(self, article):
        # Fetch the article's comments.
        meta_info = article.meta_info
        if meta_info is None:
            return (list(), False)
        jo = json.loads(meta_info)
        if "commentid" not in jo:
            return (list(), False)
        commentid = jo["commentid"]
        cookies = 'pac_uid=0_58ec8106620c1; gj_mpvid=80515918; ad_play_index=97; dsp_cookiemapping0=1492586667155; pgv_info=ssid=s9259450720; ts_last=news.qq.com/a/20170415/002007.htm; ts_refer=www.baidu.com/link; pgv_pvid=1281052383; ts_uid=1143064466; ptag=www_baidu_com|'
        re_url = 'http://coral.qq.com/article/' + commentid + '/comment'
        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        data1 = {
            'commentid':
            self.lastCommentId if self.lastCommentId is not None else '0',
            'reqnum': '50'
        }
        html = self.session.download(re_url,
                                     encoding='utf-8',
                                     cookies=cookies,
                                     data=data1,
                                     timeout=10,
                                     retry=3)
        jo = json.loads(html)
        if jo['errCode'] != 0:
            return ([], False)
        if jo['data']['retnum'] == 0:
            return ([], False)
        self.lastCommentId = jo['data']['last']
        for i in jo['data']['commentid']:
            cid = i['id']
            user_id = i['userinfo']['userid']
            user_name = i['userinfo']['nick']
            user_ip = ''
            location = i['userinfo']['region'].replace(u'市', '').replace(u'自治', '') \
                .replace(u'新区', '').replace(u'区', '').replace(u'洲', '')
            location_list = location.split(':')

            location_country = location_list[0]
            if len(location_list) > 1:
                location_region = location_list[1]
            else:
                location_region = ''
            if len(location_list) > 2:
                location_city = location_list[2]
            else:
                location_city = ''
            #user_head = i['userinfo']['head']

            publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(i['time']))
            reply_userid = str(i['replyuserid'])
            like_count = i['up']
            reply_count = i['rep']
            content = i['content']
            # print cid, user_id, user_name, user_ip, ip_address, user_head, publish_datetime, reply_userid
            # print like_count,unlike_count,read_count,reply_count,source_url
            commentList.append(
                Comment(article.tid, self.channel.channel_id, cid,
                        add_datetime, publish_datetime, user_ip,
                        location_country, location_region, location_city,
                        user_id, user_name, content, reply_userid, None,
                        like_count, reply_count, None))
        return (commentList, jo['data']['hasnext'])
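Examples #10 and #13 both turn a colon-separated region string (roughly '中国:广东:深圳') into country/region/city fields after stripping suffixes such as 市 and 区 (Example #13 replaces the suffixes with ':' instead of dropping them). A standalone sketch of Example #10's variant, with an invented sample value, might be:

# -*- coding: utf-8 -*-

def split_region(region):
    # Strip common administrative suffixes, then split into up to three parts;
    # missing parts default to ''.
    for suffix in (u'市', u'自治', u'新区', u'区', u'洲'):
        region = region.replace(suffix, '')
    parts = region.split(':')
    country = parts[0]
    state = parts[1] if len(parts) > 1 else ''
    city = parts[2] if len(parts) > 2 else ''
    return country, state, city

print(u'/'.join(split_region(u'中国:广东:深圳市')))  # 中国/广东/深圳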
Example #11
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''
        #self.logger.debug('Article:%s', article)

        html = self.session.download(article.url,
                                     encoding='utf-8',
                                     data=None,
                                     timeout=10,
                                     retry=3,
                                     addr=True,
                                     isJson=False)
        # meta_info = article.meta_info
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # add_datetime = time.mktime(time.strptime('','%Y-%m-%d'))
        commentList = list()
        page = 1
        while page < 30:
            doc_url = re.findall(r'"commentUrl":"(.*)",', html['html'])
            if doc_url:
                doc_url = doc_url[0]
            else:
                doc_url = article.url

            data1 = {
                'callback': 'newCommentListCallBack',
                'orderby': '',
                'docUrl': doc_url,
                'job': '1',
                'p': page
            }
            re_url = 'http://comment.ifeng.com/get.php'
            html1 = self.session.download(re_url,
                                          encoding='gbk',
                                          data=data1,
                                          timeout=10,
                                          retry=3,
                                          isJson=True)
            totalcount = html1['count']  # total comment count
            if totalcount == 0:
                break
            comments = html1['comments']
            if comments:
                for comment in comments:
                    cid = comment['comment_id']
                    user_id = comment['user_id']
                    user_name = comment['uname']
                    user_ip = comment['client_ip']
                    # ip_address = get_ip_address(self, str(user_ip))  # never actually returned a value
                    # if ip_address is '':
                    try:
                        ip_address = comment['ip_from']
                    except:
                        ip_address = None
                    # ip_address = comment['ip_from']
                    user_head = comment['user_url']
                    publish_datetime = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(float(comment['create_time'])))
                    reply_userid = comment['parent']  # the comment being replied to
                    if reply_userid:
                        reply_userid = comment['parent'][0]['user_id']
                    else:
                        reply_userid = ''
                    like_count = comment['uptimes']
                    unlike_count = None
                    read_count = None
                    reply_count = None
                    source_url = article.url
                    content = comment['comment_contents']
                    heat = 0
                    location_coutry = 'CN'

                    if ip_address is None:
                        commentList.append(
                            Comment(
                                article.tid,
                                self.channel.channel_id,
                                cid,
                                add_datetime,
                                publish_datetime,
                                user_ip,
                                None,
                                None,
                                None,  # ip_address is not implemented yet
                                user_id,
                                user_name,
                                content,
                                reply_userid,
                                None,
                                like_count,
                                reply_count,
                                dislike_count=None))
                    else:
                        try:
                            location_region = ip_address[:ip_address.find(u'省') + 1]
                            location_city = ip_address[ip_address.find(u'省') + 1:]
                        except:
                            location_region = None
                            location_city = None
                        commentList.append(
                            Comment(article.tid,
                                    self.channel.channel_id,
                                    cid,
                                    add_datetime,
                                    publish_datetime,
                                    user_ip,
                                    location_coutry,
                                    location_region,
                                    location_city,
                                    user_id,
                                    user_name,
                                    content,
                                    reply_userid,
                                    None,
                                    like_count,
                                    reply_count,
                                    dislike_count=None))

                page = page + 1
                totalpage = math.ceil(totalcount / 20.0)  # total number of comment pages, rounded up
                if totalpage < page:
                    break
            else:
                break
        return (commentList, False)  # during testing, article[0][222].content retrieves the content of comment #222
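Example #11 derives location_region and location_city by cutting the ip_from string (for instance u'广东省深圳市') at the first u'省'. A minimal standalone helper with an invented sample input:

# -*- coding: utf-8 -*-

def split_ip_from(ip_from):
    # Split at the first u'省' into (region, city); if there is no u'省',
    # the whole string ends up in the city part.
    idx = ip_from.find(u'省')
    return ip_from[:idx + 1], ip_from[idx + 1:]

print(u'|'.join(split_ip_from(u'广东省深圳市')))  # 广东省|深圳市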
Example #12
File: jianshu.py  Project: magicgis/outfile
    def crawlComment(self, article):
        '''
        :param article: the Article instance whose comments should be crawled
        :return: a (commentList, hasnext) tuple; commentList is the article's comments and hasnext indicates whether crawling should continue
        '''
        html = self.session.get(article.url,
                                headers=self.headers,
                                verify=False)
        soup = BeautifulSoup(html, 'html.parser')
        commentList = list()
        add_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        try:

            div = soup.find('div', attrs={'class': 'normal-comment-list'})
            divs = div.findAll('div', attrs={'class': 'comment'})
        except:
            self.logger.error(u'This page has already been deleted: %s', article.url)
            # print('the article has no comments')
            return (commentList, False)
        for c in divs:
            pre_cid = c.attrs['id']
            cid = re.sub(r'\D', '', pre_cid)  # comment id
            user_id = None
            name_div = c.find('div', attrs={'class': 'info'})
            user_name = name_div.find('a').text  # commenter's name
            pre_time = name_div.find('div', attrs={'class': 'meta'})
            publish_time = pre_time.find('span').text.strip()  # comment time
            publish_time = publish_time[-16:]
            pre_content = c.find('div', attrs={'class': 'comment-wrap'})
            content = pre_content.find('p').text
            reply_user_wrap = c.find('div',
                                     attrs={'class': 'sub-comment-list'})
            if reply_user_wrap:
                pre_reply_user_id = reply_user_wrap.find(
                    'div', attrs={'class': 'sub-comment'})
                # Take the digits of the sub-comment div's id attribute (mirroring how cid is
                # derived above); passing the Tag itself to re.sub would raise a TypeError.
                reply_user_id = re.sub(r'\D', '', pre_reply_user_id.attrs.get('id', ''))
            else:
                reply_user_id = ' '
            like_count = None
            reply_count = None
            location_coutry = 'CN'
            commentList.append(
                Comment(article.tid,
                        self.channel.channel_id,
                        cid,
                        add_time,
                        publish_time,
                        None,
                        location_coutry,
                        None,
                        None,
                        user_id,
                        user_name,
                        content,
                        reply_user_id,
                        None,
                        like_count,
                        reply_count,
                        dislike_count=None))
        return (commentList, False)  # during testing, article[0][222].content retrieves the content of comment #222
Example #13
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''
        cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056'
        if len(article.tid) != 16:
            articleid = article.tid[3:len(article.tid) - 2]
        else:
            articleid = article.tid
        re_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid + '/comments/newList'
        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        page = 0
        while page < 750:
            data1 = {
                'offset': page,
                'limit': 30,
                'showLevelThreshold': 72,
                'headLimit': 1,
                'tailLimit': 2,
                'ibc': 'newspc'
            }
            try:
                html1 = self.session.download(url=re_url,
                                              encoding='utf-8',
                                              cookies=cookies,
                                              data=data1,
                                              timeout=10,
                                              retry=3)
                html1 = json.loads(html1)
                totalcount = html1["newListSize"]

                if totalcount == 0:
                    break
                for i in html1['comments'].itervalues():
                    cid = i['commentId']
                    user_id = i['user']['userId']
                    if user_id == 0:
                        user_name = ''
                    else:
                        user_name = i['user']['nickname']
                    user_ip = ''
                    location = i['user']['location'].replace(u'市', ':').replace(u'自治', ':') \
                        .replace(u'新区', ':').replace(u'区', ':').replace(u'洲', ':')
                    location_list = location.split(':')

                    location_country = location_list[0]
                    if len(location_list) > 1:
                        location_region = location_list[1]
                    else:
                        location_region = ''
                    if len(location_list) > 2:
                        location_city = location_list[2]
                    else:
                        location_city = ''
                    publish_datetime = i['createTime']
                    like_count = i['vote']
                    unlike_count = i['against']
                    content = i['content']
                    dr = re.compile(r'<[^>]+>', re.S)
                    content = dr.sub('', i['content'])
                    commentList.append(
                        Comment(articleid, self.channel.channel_id, cid,
                                add_datetime, publish_datetime, user_ip,
                                location_country, location_region,
                                location_city, user_id, user_name, content,
                                None, None, like_count, None, unlike_count))
                # print page, totalcount
                if page > int(totalcount):
                    break
                page = page + 30
            except:
                self.logger.error('[NeteaseComment]url:' + article.url +
                                  ', tid:' + article.tid + ', ' +
                                  traceback.format_exc())
                break
        return (commentList, False)
Example #14
File: tianya.py  Project: magicgis/outfile
    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list.
        @return: a (commentList, hasnext) tuple; commentList is the list of comments (each element is a Comment instance), and hasnext indicates whether there are more comments left to crawl.
        '''

        commentList = list()

        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        html = self.session.download(article.url,
                                     encoding='utf-8',
                                     data=None,
                                     isJson=False,
                                     timeout=10,
                                     retry=3)
        article_url = article.url
        soup = BeautifulSoup(html, 'html.parser')
        comments = soup.find_all(
            lambda tag: tag.name == 'div' and tag.get('class') == ['atl-item'])
        for i in comments:
            cid = i.attrs['replyid']
            user_id = i.attrs['_hostid']
            user_name = i.attrs['_host']
            # user_head = i.find('div', attrs={'class': "atl-info"}).find('a').attrs['href']  # OP name
            cpublish_datetime = i.attrs['js_restime']
            reply_userid = ''  # parent comment id
            like_count = i.find('a', attrs={'class': "zan"}).attrs['_count']

            reply_count = i.find('div', attrs={
                'class': "atl-reply"
            }).find('a', attrs={
                'title': "插入评论"
            }).text.strip()
            reply_count = re.findall(r'\d+', reply_count)
            if reply_count:
                reply_count = reply_count[0]
            else:
                reply_count = 0

            content = i.find('div', attrs={
                'class': "bbs-content"
            }).text.strip()
            location_coutry = 'CN'
            commentList.append(
                Comment(article.tid,
                        self.channel.channel_id,
                        cid,
                        add_datetime,
                        cpublish_datetime,
                        None,
                        location_coutry,
                        None,
                        None,
                        user_id,
                        user_name,
                        content,
                        reply_userid,
                        None,
                        like_count,
                        reply_count,
                        dislike_count=None))

        return (commentList, False)