Ejemplo n.º 1
0
    def step1(self, params):
        """Step 1: record the page-view count and queue the first comments-page URL.

        Extracts the comic id from the original mkzhan.com URL and stores the
        comments API URL for STEP_2.  If no click count is stored yet, it is
        parsed from the page HTML first.
        """
        Logger.getlogging().info("MkzhanComments.STEP_1")
        # Store the page-view count once, if we do not have it yet.
        if NewsStorage.getclicknum(params.originalurl) <= 0:
            if self.r.search('<span>人气:\s<b>(.*?)<\/b>', params.content):
                clicknum = self.r.parse('<span>人气:\s<b>(.*?)<\/b>',
                                        params.content)[0]
                NewsStorage.setclicknum(params.originalurl, clicknum)

        # Extract the comic id from the original URL.  Guard against a
        # non-matching URL: the original code indexed [0] unconditionally and
        # raised IndexError when the pattern did not match.
        matched_ids = self.r.parse(r'^http[s]?://www\.mkzhan\.com/(\d+)/.*',
                                   params.originalurl)
        if not matched_ids:
            return
        comic_id = int(matched_ids[0])
        if not comic_id:
            return
        # Build the comments API URL (page 1) and queue it for STEP_2.
        comments_url = MkzhanComments.COMMENTS_URL % (comic_id, 1,
                                                      self.PAGE_SIZE)
        self.storeurl(comments_url, params.originalurl, MkzhanComments.STEP_2,
                      {'comic_id': comic_id})
Ejemplo n.º 2
0
    def common_step1(self, params):
        """Step 1 (shared): queue the comments URL and, if needed, the click-count URL.

        Derives the 163.com channel (``field``), story id (``sid``) and
        ``productKey`` from the original URL / page content and queues the
        comments API URL for STEP_2.  Then, when no click count is stored yet,
        it resolves ``productKey``/``docId`` again (from ``params.url`` and the
        page content) and queues the click-count API URL for STEP_4.
        """
        Logger.getlogging().info(params.originalurl)
        try:
            #field = self.r.parse('^http://(\w+)\.163.com?', params.originalurl)[0]
            # Channel name is the sub-domain just before ".163",
            # e.g. "news" from http://news.163.com/...
            field = params.originalurl.split('//')[-1].split('.163')[0].split('.')[-1]
            # Story id is the URL's file name without its extension.
            sid = params.originalurl.split('/')[-1].split('.')[0]
            #productkey = self.r.parse('\"productKey\" : \"(\w+)\"',params.content)[0]
            productkey = self.r.getid('productKey',params.content)
            commentinfo_url = Comments163.COMMENTS_URL.format(field=field, sid=sid, productkey=productkey, itemnum=0, itemlimit=self.limit)
            self.storeurl(commentinfo_url, params.originalurl, Comments163.STEP_2,{'field': field,
                                                                                    'sid': sid,
                                                                                    'productkey': productkey})             
        except:
            # Best effort: log and fall through so the click-count lookup
            # below still runs even when the comments URL cannot be built.
            Logger.printexception()

        # Only fetch the click count when it is not stored yet.
        if NewsStorage.getclicknum(params.originalurl) <= 0:
            # Re-derive the channel from params.url, which may be a
            # comment.<channel>.163.com URL rather than the article URL.
            if self.r.search('^http[s]{0,1}://comment\.(\w+)\..*', params.url):
                field = self.r.parse('^http[s]{0,1}://comment\.(\w+)\..*', params.url)[0]
            else:
                field = self.r.parse('^http[s]{0,1}://(\w+)\..*', params.url)[0]
            # Several sub-channels share the "news" / "sports" backends.
            if field == 'discovery' or field == 'data' or field == 'view':
                field = 'news'
            if field == 'cai':
                field = 'sports'

            if field != 'gongyi':
                # Regular channels require both productKey and docId to be
                # present in the page content; abort with a warning otherwise.
                if not self.r.search('[\s\'\"]{1}productKey[\'\"]{0,1}\s*:\s*[\'\"]{0,1}(\w+)[,\'\"]{1}',
                                     params.content):
                    Logger.getlogging().warning('{url} Errorcode:40000 No productKey'.format(url=params.url))
                    return
                productKey = \
                self.r.parse('[\s\'\"]{1}productKey[\'\"]{0,1}\s*:\s*[\'\"]{0,1}(\w+)[,\'\"]{1}', params.content)[0]
                if not self.r.search('[\s\'\"]{1}docId[\'\"]{0,1}\s*:\s*[\'\"]{0,1}(\w+)[,\'\"]{1}', params.content):
                    Logger.getlogging().warning('{url} Errorcode:40000 No docId'.format(url=params.url))
                    return
                docId = self.r.parse('[\s\'\"]{1}docId[\'\"]{0,1}\s*:\s*[\'\"]{0,1}(\w+)[,\'\"]{1}', params.content)[0]
            else:
                # The "gongyi" channel has fall-backs: a hard-coded productKey
                # and a docId extracted from the original URL.
                if self.r.search('[\s\'\"]{1}productKey[\'\"]{0,1}\s*:\s*[\'\"]{0,1}(\w+)[,\'\"]{1}', params.content):
                    productKey = \
                    self.r.parse('[\s\'\"]{1}productKey[\'\"]{0,1}\s*:\s*[\'\"]{0,1}(\w+)[,\'\"]{1}', params.content)[0]
                else:
                    Logger.getlogging().warning('{url}:40000 No productKey'.format(url=params.url))
                    productKey = 'a2869674571f77b5a0867c3d71db5856'
                if self.r.search('[\s\'\"]{1}docId[\'\"]{0,1}\s*:\s*[\'\"]{0,1}(\w+)[,\'\"]{1}', params.content):
                    docId = \
                    self.r.parse('[\s\'\"]{1}docId[\'\"]{0,1}\s*:\s*[\'\"]{0,1}(\w+)[,\'\"]{1}', params.content)[0]
                elif self.r.search('^http[s]{0,1}://.*\.163\.com/.*/(\w+).html', params.originalurl):
                    docId = self.r.parse('/(\w+).html', params.originalurl)[0]
                else:
                    Logger.getlogging().warning('{url}:40000 No docId'.format(url=params.url))
                    return

            clickurl = self.CLICKNUM_URL.format(key=productKey, docid=docId)
            self.storeurl(clickurl, params.originalurl, self.STEP_4)
Ejemplo n.º 3
0
    def step2(self, params):
        """Step 2: read comment counts from the JSON reply and queue comment pages.

        Stores the publish date (video pages) or click count (news pages),
        records the comment total, then queues one comments URL per page that
        still needs fetching, capped at ``self.maxpages``.
        """
        news_id = params.customized['newsid']
        chan = params.customized['channel']
        grp = params.customized['group']
        data = json.loads(params.content)
        if not self.isvalid(data):
            Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
            return
        if self.r.search('http[s]{0,1}://.*video\.sina\.com.*', params.originalurl):
            # Video pages: the publish date travels inside the payload.
            NewsStorage.setpublishdate(
                params.originalurl,
                TimeUtility.getuniformtime(data['result']['news']['time']))
        elif self.r.search('http[s]{0,1}://.*\.sina\.com.*', params.originalurl):
            # News pages: store the click count once, best effort.
            if NewsStorage.getclicknum(params.originalurl) <= 0:
                try:
                    NewsStorage.setclicknum(params.originalurl,
                                            data['result']['count']['total'])
                except:
                    Logger.printexception()
        total_shown = int(data['result']['count']['show'])
        # Record the comment total for this article.
        NewsStorage.setcmtnum(params.originalurl, total_shown)

        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total_shown:
            Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
            return
        # Only fetch the pages holding comments we do not have yet.
        page_total = int(math.ceil(float(total_shown - stored) / self.DEFAULT_PAGE_SIE))
        if page_total >= self.maxpages:
            page_total = self.maxpages
        for pageno in range(1, page_total + 1):
            if pageno == 1:
                # Page 1 is already in params.content; parse it directly.
                self.step3(params)
            else:
                pageurl = CommonComments.SINA_COMMENTS_URL.format(channel=chan, newsid=news_id, pn=pageno,
                                                                  ps=SinaComments.DEFAULT_PAGE_SIE)
                if grp:
                    pageurl = pageurl + '&group=' + grp
                self.storeurl(pageurl, params.originalurl, SinaComments.STEP_COMMENT_NEXT_PAGE)
Ejemplo n.º 4
0
    def process(self, params):
        Logger.getlogging().info(params.url)
        try:
            if params.step is poocgNewsComments.STEP_1:
                #Step1: 通过得到docurl,得到获取评论的首页url参数。
                articleId = self.r.parse('^http[s]?://www\.poocg\.com/works/view/(\d+)', params.originalurl)[0]
                # 取得总件数
                comment_count = float(self.r.parse(ur'<p><strong>(\d+)</strong><span>评论</span></p>', params.content)[0])
                NewsStorage.setcmtnum(params.originalurl, int(comment_count))
                if comment_count == 0:
                    return

                # 判断增量
                cmtnum = CMTStorage.getcount(params.originalurl, True)
                if cmtnum >= comment_count:
                    return


                # 获取页数
                page_num = int(math.ceil(float(comment_count - cmtnum) / poocgNewsComments.PAGE_SIZE))
                if page_num >= self.maxpages:
                    page_num = self.maxpages
                # 获得url列表
                for page in range(1, page_num + 1, 1):
                    url = poocgNewsComments.COMMENT_URL % (articleId, page)
                    self.storeurl(url, params.originalurl, poocgNewsComments.STEP_3)

                if NewsStorage.getclicknum(params.originalurl) <= 0:
                   clicknum = int(self.r.parse(ur'<p><strong>(\d+)</strong><span>浏览</span></p>', params.content)[0])
                   NewsStorage.setpublishdate(params.originalurl, clicknum)
                if NewsStorage.getfansnum(params.originalurl) <= 0:
                    fansnum = int(self.r.parse(ur'<p><strong>(\d+)</strong><span>喜欢</span></p>', params.content)[0])
                    NewsStorage.setpublishdate(params.originalurl, fansnum)
                publishdate = str(self.r.parse(ur'<p.*class="signed">(.*?)</p>', params.content)[0])
                NewsStorage.setpublishdate(params.originalurl, TimeUtility.getuniformtime(publishdate))

            elif params.step == poocgNewsComments.STEP_3:
                # Step3: 通过Step2设置的url,得到所有评论,抽取评论
                Logger.getlogging().info("params.step == 3")
                xparser = XPathUtility(params.content)
                # 取得所有评论
                soup = BeautifulSoup(params.content, 'html.parser')
                comments = soup.select('.p2')
                nicks = soup.select('.name')
                # 取得所有评论时间
                times = soup.select('.contentbox .time')

                commentsInfo = []
                # 取得所有评论
                for index in range(0, int(len(comments)), 1):
                    # 提取时间
                    # year = TimeUtility.getcurrentdate()[0:4]
                    # publictime= year + '年' + commenttimes[index].text
                    try:
                        if len(times)>0:
                            publictime = times[index].get_text()
                            curtime = TimeUtility.getuniformtime(publictime)
                        else:
                            curtime = ''
                    except:
                        curtime =''
                    content = comments[index].get_text()
                    try:
                        nick = str(nicks[index].get_text())
                    except:
                        nick = 'nickname'
                    if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content, curtime, nick)
                        #     if URLStorage.storeupdatetime(params.originalurl, tm):
                #         cmti = CommentInfo()
                #         cmti.content = comments[index].get_text()
                #         commentsInfo.append(cmti)
                #
                #     # 保存获取的评论
                # if len(commentsInfo) > 0:
                #     self.commentstorage.store(params.originalurl, commentsInfo)
            else:
                Logger.getlogging().error('proparam.step == {step}'.format(step = params.step))
        except Exception,e:
            traceback.print_exc()
    def step1(self, params):
        """Build and queue the first comments-page URL for a sohu.com page.

        Resolves the ``topic_source_id`` differently per URL family
        (tv.sohu.com article pages, my.tv.sohu.com video pages, other
        sohu.com news/group pages), queues the comments URL, and — when no
        click count is stored yet — queues a click-count API URL as well.
        """
        try:
            comment_source_url = ''
            if self.r.search('http[s]{0,1}://.*tv\.sohu\.com.*',
                             params.originalurl):
                # The topic_source_id comes from a different page field for
                # films, TV series, and Sohu games.
                if self.r.search('^http://tv\.sohu\.com/\d{8}/n\d+\.shtml',
                                 params.originalurl):
                    # Article-style URL: id is the page's "var vid" value.
                    topic_source_id = self.r.parse(
                        'var[\s]*vid[\s]*=[\s]*\"(.+?)\"', params.content)
                    if topic_source_id:
                        topic_source_id = topic_source_id[0]
                    else:
                        Logger.log(params.originalurl,
                                   constant.ERRORCODE_SITE_NOGET_COMMNETS)
                        return
                elif self.r.search('^http://my\.tv\.sohu\.com/.*.shtml',
                                   params.originalurl):
                    # User-video URL: id is the last digit run in the URL,
                    # with the "bk" prefix.
                    topic_source_id = self.r.parse('\d{1,}',
                                                   params.originalurl)[-1]
                    topic_source_id = 'bk' + topic_source_id
                else:
                    # Other tv.sohu.com pages: playlist id with "vp" prefix,
                    # trying both spellings found in page content.
                    topic_source_id = self.r.getid('PLAYLIST_ID',
                                                   params.content)
                    if not topic_source_id:
                        topic_source_id = self.r.getid('playlistId',
                                                       params.content)
                    if not topic_source_id:
                        Logger.log(params.originalurl,
                                   constant.ERRORCODE_SITE_NOGET_COMMNETS)
                        return
                    topic_source_id = 'vp' + topic_source_id
                comment_source_url = self.TV_COMMENTS_SOURCE_URL.format(
                    self.tv_client_id, params.originalurl, topic_source_id,
                    self.tv_page_size)

            else:
                # Non-TV sohu.com pages: group pages vs. regular news pages.
                if self.r.parse('group', params.originalurl):
                    topic_source_id = \
                    self.r.parse('http[s]{0,1}://.*\.sohu\.com/group-(\d+)\.shtml.*', params.originalurl)[0]
                    comment_source_url = self.COMMENTS_SOURCE_URL.format(
                        self.client_id, self.group_mark + topic_source_id,
                        self.page_size)
                else:
                    topic_source_id = \
                    self.r.parse('http[s]{0,1}://.*\.sohu\.com/\d{8}/n(\d+)\.shtml.*', params.originalurl)[0]
                    comment_source_url = self.COMMENTS_SOURCE_URL.format(
                        self.client_id, topic_source_id, self.page_size)

            self.storeurl(comment_source_url, params.originalurl,
                          self.STEP_COMMENT_FIRST_PAGE)
            #http://tv.sohu.com/20170831/n600133376.shtml
            #http://tv.sohu.com/s2015/newslist/?vid=4016103  (currently not retrievable)
            # Check the play count: when the XPath extraction did not get it,
            # fetch it through the click-count API instead.
            if NewsStorage.getclicknum(params.originalurl) <= 0:
                if re.search('^http://tv\.sohu\.com/\d{8}/n\d+\.shtml',
                             params.originalurl):
                    vid = self.r.getid('vid', params.content, split='=')
                    clickurl = self.TVCLICKURL.format(vid=vid)
                    self.storeurl(clickurl, params.originalurl,
                                  self.STEP_TVCLICK)
                elif re.search('^http://tv\.sohu\.com/.*vid=(\d+)',
                               params.originalurl):
                    vid = self.r.parse('^http://tv\.sohu\.com/.*vid=(\d+)',
                                       params.originalurl)[0]
                    clickurl = self.TVCLICKURL.format(vid=vid)
                    self.storeurl(clickurl, params.originalurl,
                                  self.STEP_TVCLICK)
                elif re.search('^http[s]{0,1}://my\.tv\.sohu\.com.*\.shtml$',
                               params.originalurl):
                    # my.tv URLs: the vid is the file name in the URL itself.
                    clickurl = self.MYTVCLICKURL.format(
                        vid=params.originalurl.split('/')[-1].split('.')[0])
                    self.storeurl(clickurl, params.originalurl,
                                  self.STEP_MYTVCLICK)
            # my.tv pages also carry their upload time in the page content.
            if re.search('^http[s]{0,1}://my\.tv\.sohu\.com.*\.shtml$',
                         params.originalurl):
                if not params.content:
                    Logger.getlogging().debug("no params.content")
                if not self.r.search('uploadTime: \'(.*)?\'', params.content):
                    Logger.getlogging().debug("no params.content uploadTime")
                if self.r.search('uploadTime: \'(.*)?\'', params.content):
                    publishdate = self.r.parse('uploadTime: \'(.*)?\'',
                                               params.content)[0]
                    NewsStorage.setpublishdate(
                        params.originalurl,
                        TimeUtility.getuniformtime(publishdate))
        except:
            Logger.printexception()
            Logger.getlogging().error(
                'extract comment error from {site}'.format(site=params.url))