Beispiel #1
0
 def __storeurllist__(self, urllist, type=constant.SPIDER_S2_WEBSITE_VIDEO, referlist=None):
     """Register every URL of ``urllist`` that is not already in cold storage.

     For each URL a ``PageBasicInfo`` carrying the URL and ``type`` is built
     and passed to ``NewsStorage.seturlinfos`` unless
     ``NewsStorage.exist_cold`` reports the URL as already known.

     Args:
         urllist: iterable of URLs to register.
         type: value copied to ``PageBasicInfo.type``. The name shadows the
             builtin but is kept so keyword callers keep working.
         referlist: accepted for backward compatibility only; it is not
             used. The default is ``None`` instead of a shared mutable list.
     """
     for url in urllist:
         info = PageBasicInfo()
         info.url = url
         info.type = type
         # Only URLs missing from the cold store are written (per the
         # original comment, seturlinfos inserts into the "hot" database).
         if not NewsStorage.exist_cold(url):
             NewsStorage.seturlinfos(info)
 def setclick(self, params):
     """Store the play count and the up-vote count found in the page content.

     Both values are looked up with ``self.r.getid`` (keys ``'play_count'``
     and ``'up'``); each one is written to ``NewsStorage`` only when truthy.
     """
     extracted = (
         (self.r.getid('play_count', params.content), NewsStorage.setclicknum),
         (self.r.getid('up', params.content), NewsStorage.setvotenum),
     )
     for value, store in extracted:
         if value:
             store(params.originalurl, value)
 def ifengnews_step2(self, params):
     """Record click/comment counts and queue the comment pages still missing.

     ``params.content`` is parsed as JSON. ``join_count`` (when positive) is
     stored as the click count and ``count`` as the comment count. If the
     site reports more comments than ``CMTStorage`` already holds, the
     number of pages needed for the difference is computed (capped at
     ``self.maxpages``): page 1 is handed to ``ifengnews_step3`` with the
     current ``params``; every further page URL is queued via ``storeurl``.

     Any ``Exception`` is logged with ``Logger.printexception`` and
     swallowed; ``KeyboardInterrupt``/``SystemExit`` now propagate.
     """
     try:
         oriurl = params.customized['oriurl']
         jsoncontent = json.loads(params.content)
         clicknum = float(jsoncontent.get('join_count', '-1'))
         if clicknum > 0:
             NewsStorage.setclicknum(params.originalurl, clicknum)
         curcmtnum = float(jsoncontent['count'])
         NewsStorage.setcmtnum(params.originalurl, curcmtnum)
         dbcmtnum = CMTStorage.getcount(params.originalurl, True)
         if dbcmtnum >= curcmtnum:
             # Nothing new since the last crawl.
             return
         # Pages needed to cover only the comments not stored yet.
         pages = int(math.ceil(
             float(curcmtnum - dbcmtnum) / self.page_size))
         if pages >= self.maxpages:
             pages = self.maxpages
         for index in range(1, pages + 1, 1):
             if index == 1:
                 # The first page is processed directly from these params.
                 self.ifengnews_step3(params)
                 continue
             # Build the URL of every page after the first one.
             commentinfo_url = IfengNewsComments.COMMENTS_URL.format(
                 oriurl=oriurl, pg=index, ps=self.page_size)
             self.storeurl(commentinfo_url, params.originalurl,
                           IfengNewsComments.IFENG_NEWS_NEXT_PAGE)
     except Exception:
         Logger.printexception()
Beispiel #4
0
    def step2(self, params):
        """Store the comment total and queue one URL per comment page.

        Reads ``totalRows`` and ``totalPage`` from the JSON in
        ``params.content``. When the site reports more comments than
        ``CMTStorage`` holds, pages ``1..min(totalPage, self.maxpages)`` are
        queued with ``storeurl`` for ``xinhuaNewsComments.STEP_3``.

        Any ``Exception`` is logged with ``Logger.printexception`` and
        swallowed.
        """
        try:
            Logger.getlogging().info("xinhuaComments.STEP_2")
            # newsId was attached to the request by the previous step.
            newsId = params.customized['newsId']
            comments_info = json.loads(params.content)
            comments_count = comments_info['totalRows']
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            page_count = comments_info['totalPage']

            # Incremental check: stop when no new comments were posted.
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if cmtnum >= comments_count:
                return

            # Convert before capping so the comparison is numeric even if
            # the API delivers the page count as a string.
            page_count = int(page_count)
            if page_count >= self.maxpages:
                page_count = self.maxpages

            for pid in range(1, int(page_count) + 1):
                commentinfo_url = xinhuaNewsComments.COMMENTS_URL_NEWS.format(
                    newsId=newsId, pid=pid)
                self.storeurl(commentinfo_url, params.originalurl,
                              xinhuaNewsComments.STEP_3)
        except Exception:
            Logger.printexception()
    def setp_3(self, params):
        """Store the comment total and fetch the pages not yet collected.

        The ``total`` field of the JSON in ``params.content`` is saved as
        the comment count. If it is non-zero and larger than what
        ``CMTStorage`` holds, the number of missing pages (capped at
        ``self.maxpages``) is derived from ``self.PAGE_SIZE``; page 1 is
        handled by ``self.getcomments(params)`` and each later page is
        queued for ``LeComments.STEP_4``.
        """
        total = float(json.loads(params.content)['total'])
        NewsStorage.setcmtnum(params.originalurl, total)
        if int(total) == 0:
            return

        # Incremental check against the comments already stored.
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total:
            return

        missing_pages = int(
            math.ceil(float(total - stored) / self.PAGE_SIZE))
        if missing_pages >= self.maxpages:
            missing_pages = self.maxpages

        # Parameters needed to build the comment URLs.
        cid = params.customized['cid']
        xid = params.customized['xid']
        pid = params.customized['pid']

        for page in range(1, missing_pages + 1, 1):
            if page != 1:
                url = LeComments.COMMENTS_URL % (cid, page, xid, pid)
                self.storeurl(url, params.originalurl, LeComments.STEP_4)
            else:
                self.getcomments(params)
Beispiel #6
0
 def setclick(self, params):
     """Extract a play count from the page HTML and store it as click count.

     Two layouts are handled:

     * Episode list (``.mod_episode > .item`` present): the number in
       ``#mod_cover_playnum`` is divided by the number of episode items.
     * Otherwise: the ``figure(s)_list`` items are scanned for the entry
       whose link carries the same id as ``params.originalurl`` (last path
       segment before the file extension) and its play-count is stored.

     Items without a usable link, and a missing ``#mod_cover_playnum``
     node, are skipped instead of raising.
     """
     soup = BeautifulSoup(params.content, 'html5lib')
     # TV-series layout.
     episodes = soup.select('.mod_episode > .item')
     if episodes:
         playnum = soup.select_one('#mod_cover_playnum')
         if playnum is None:
             # No total available, so there is nothing to store.
             return
         total = self.str2num(playnum.get_text())
         # NOTE(review): under Python 2 this is floor division when
         # str2num returns an int - confirm that is intended.
         clicknum = total / len(episodes)
         NewsStorage.setclicknum(params.originalurl, clicknum)
         return
     # Any other layout.
     parentid = params.originalurl.split('.')[-2].split('/')[-1]
     # Compile once instead of on every loop iteration.
     list_class = re.compile('^figures?_list$')
     item_class = re.compile('list_item')
     num_class = re.compile('num _video_playnum|figure_num')
     for fitem in soup.find_all(attrs={'class': list_class}):
         for item in fitem.find_all(attrs={'class': item_class}):
             link = item.select_one('a')
             childurl = link.get('href', None) if link is not None else None
             if not childurl:
                 continue
             parts = childurl.split('.')
             if len(parts) < 2:
                 # href without an extension cannot carry a video id.
                 continue
             childid = parts[-2].split('/')[-1]
             if childid != parentid:
                 continue
             numobj = item.find(attrs={'class': num_class})
             if not numobj:
                 continue
             clicknum = self.str2num(numobj.get_text())
             NewsStorage.setclicknum(params.originalurl, clicknum)
             return
Beispiel #7
0
 def step1(self, params):
     """Record the comment count and queue the comment pages of a video.

     ``params.content`` is trimmed to its outermost ``{...}`` and parsed as
     JSON. ``data.page.acount`` is stored as the comment count; if it
     exceeds what ``CMTStorage`` holds, page URLs built from
     ``self.pageUrl`` are queued for ``self.STEP_CMTS``.

     Any ``Exception`` is logged with ``Logger.printexception`` and
     swallowed.
     """
     try:
         url = params.originalurl
         videoId = params.customized['videoId']
         # Keep only the text between the first '{' and the last '}'.
         params.content = params.content[params.content.index('{'):params.content.rindex('}') + 1]
         jsonData = json.loads(params.content)['data']
         hasCmts = jsonData['page']['count']
         # No comments at all.
         if not hasCmts:
             return
         # Incremental check: current total vs. total from the last crawl.
         currCmtsCount = jsonData['page']['acount']
         NewsStorage.setcmtnum(url, currCmtsCount)
         prevCmtsCount = int(CMTStorage.getcount(url))
         if prevCmtsCount >= currCmtsCount:
             return
         # float() forces true division; with Python 2 integer operands the
         # quotient was floored before ceil() and the last page was lost.
         # NOTE(review): pages are derived from page.count while the
         # incremental check uses page.acount - confirm this is intended.
         pageNum = int(math.ceil(float(hasCmts - prevCmtsCount) / self.pageSize))
         # Build the page URLs and hand them to the common module.
         for page in range(1, pageNum + 1):
             if page == 1:
                 # NOTE(review): page 1 is processed here and still queued
                 # below (no `continue`) - kept as in the original.
                 self.step2(params)
             pageUrl = self.pageUrl.format(page=page, videoId=videoId)
             self.storeurl(pageUrl, url, self.STEP_CMTS)
     except Exception:
         Logger.printexception()
    def getcomments_step2(self, params):
        """Queue POST requests for the comment pages not yet collected.

        The page count and comment total are read from the ``pagenum`` and
        ``total`` attributes of ``//div[@class="page"]``. When new comments
        exist, the total is stored and one POST per missing page (capped at
        ``self.maxpages``) is queued for ``PubComments.STEP_3``.
        """
        bookId = params.customized['bookId']
        doc = XPathUtility(html=params.content)
        total_pages = int(doc.xpath('//div[@class="page"]/@pagenum')[0])
        total_comments = int(doc.xpath('//div[@class="page"]/@total')[0])
        Logger.getlogging().debug(total_comments)
        if total_pages == 0:
            return

        # Incremental check against the comments already stored.
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total_comments:
            return

        wanted = int(
            math.ceil(float(total_comments - stored) / self.PAGE_SIZE))
        if wanted >= self.maxpages:
            wanted = self.maxpages
        NewsStorage.setcmtnum(params.originalurl, total_comments)

        for page in range(1, wanted + 1, 1):
            post_data = {'bookId': bookId, 'pageNum': page}
            self.storeposturl(PubComments.COMMENTS_URL, params.originalurl,
                              PubComments.STEP_3, post_data)
Beispiel #9
0
 def step1(self, params):
     """Store the comment total of a Douban movie page and queue new pages.

     The article id comes from ``params.url``; the comment total is the
     first run of digits in the text of the comments-section header link.
     When more comments exist than ``CMTStorage`` holds, one URL per
     missing page (capped at ``self.maxpages``) is queued for
     ``doubanComments.STEP_2``.
     """
     # Id embedded in the page URL.
     articleId = self.r.parse(r'^https://movie\.douban\.com/\w+/(\d+)',
                              params.url)[0]
     # Comment total as shown on the page.
     header_text = XPathUtility(params.content).getstring(
         xpath='//*[@id="comments-section"]//h2/*[@class="pl"]/a')
     digits = self.r.parse('\d+', header_text)
     if not digits:
         return
     site_total = float(digits[0])
     NewsStorage.setcmtnum(params.originalurl, site_total)
     stored = CMTStorage.getcount(params.originalurl, True)
     if stored >= site_total:
         return
     # One URL per page of comments that is still missing.
     page_total = int(math.ceil(float(site_total - stored) / self.PAGE_SIZE))
     if page_total >= self.maxpages:
         page_total = self.maxpages
     for page in range(1, page_total + 1, 1):
         first_item = (page - 1) * self.PAGE_SIZE
         url = doubanComments.COMMENTS_URL.format(articleId=articleId,
                                                  start=first_item,
                                                  pagesize=self.PAGE_SIZE)
         self.storeurl(url, params.originalurl, doubanComments.STEP_2)
    def setclick(self, params):
        """Store comment, click, vote and fan numbers from a JSON list.

        ``params.content`` is parsed as a JSON list. Per the original
        comment its layout is
        ``[play count, comments, X, X, danmaku, favourites, coins, X]``.
        Falsy entries are stored as ``0``.

        Any ``Exception`` is logged with ``Logger.printexception`` and
        swallowed.
        """
        try:
            content = json.loads(params.content)
            cmtnum = content[1] or 0
            clicknum = content[0] or 0
            # Second-to-last and third-to-last slots of the list.
            votenum = content[-2] or 0
            fansnum = content[-3] or 0
            NewsStorage.seturlinfo(params.originalurl,
                                   data={
                                       SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: cmtnum,
                                       SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM:
                                       clicknum,
                                       SQLDAO.SPIDER_TABLE_NEWS_VOTENUM:
                                       votenum,
                                       SQLDAO.SPIDER_TABLE_NEWS_FANSNUM:
                                       fansnum
                                   })
        except Exception:
            Logger.printexception()
    def step2(self, params):
        """Store the comment total and queue the comment pages still missing.

        The total is read from ``data.count`` of the JSON response. For each
        missing page (capped at ``self.maxpages``) a URL is built from
        ``COMMENTS_URL1`` when ``qitanid`` is a non-zero integer, otherwise
        from ``COMMENTS_URL2``, and queued for ``IqiyiComments.STEP_3``.
        """
        qitanid = params.customized['qitanid']
        tvid = params.customized['tvid']
        payload = json.loads(params.content)
        site_total = float(payload['data']['count'])
        NewsStorage.setcmtnum(params.originalurl, site_total)
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= site_total:
            return

        # Pages needed to cover only the comments not stored yet.
        page_total = int(
            math.ceil(float(site_total - stored) / self.DEFAULT_PAGE_SIZE))
        if page_total >= self.maxpages:
            page_total = self.maxpages

        for page in range(1, page_total + 1, 1):
            if int(qitanid):
                template = IqiyiComments.COMMENTS_URL1
                fields = {
                    'pageno': page,
                    'pagesize': IqiyiComments.DEFAULT_PAGE_SIZE,
                    'qitanid': qitanid,
                    'tvid': tvid,
                }
            else:
                template = IqiyiComments.COMMENTS_URL2
                fields = {
                    'pageno': page,
                    'pagesize': IqiyiComments.DEFAULT_PAGE_SIZE,
                    'tvid': tvid,
                }
            url = template.format(**fields)
            self.storeurl(url, params.originalurl, IqiyiComments.STEP_3)
    def process(self, proparam):
        """Dispatch one crawl step for a jiemian.com article.

        * ``STEP_1``: extract the article id from the URL, store the click
          data and the comment total, then queue one URL per missing
          comment page (capped at ``self.maxpages``) for ``STEP_3``.
        * ``STEP_3``: store the vote count and every comment found in the
          response that ``CMTStorage`` does not hold yet.
        * Any other step is logged as an error.

        Any ``Exception`` is printed with ``traceback.print_exc`` and
        swallowed.
        """
        Logger.getlogging().info(proparam.url)
        try:
            # Compare by value; `is` only worked by accident of interning.
            if proparam.step == jiemianComments.STEP_1:
                # Id embedded in the article URL.
                articleId = re.findall(r'^http://www\.jiemian\.com/\w+/(\d+)', proparam.url)[0]
                # Store the click count.
                self.setclick(proparam)
                # Comment total shown on the page.
                comments_count = float(re.findall(r'"comment_count">(\d+)</span>', proparam.content)[0])
                if not comments_count:
                    return
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)

                # Incremental check against the comments already stored.
                cmtnum = CMTStorage.getcount(proparam.originalurl, True)
                if cmtnum >= comments_count:
                    return
                page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
                if page_num >= self.maxpages:
                    page_num = self.maxpages
                # Queue one URL per missing comment page.
                for page in range(1, page_num + 1, 1):
                    url = jiemianComments.COMMENTS_URL % (articleId, page)
                    self.storeurl(url, proparam.originalurl, jiemianComments.STEP_3)
            elif proparam.step == jiemianComments.STEP_3:
                # Vote ("ding") count.
                votenum = self.r.getid('ding', proparam.content)
                if votenum == '':
                    Logger.getlogging().debug("Unable to get votenum")
                else:
                    NewsStorage.setvotenum(proparam.originalurl, votenum)
                # The response is escaped HTML inside JSON, hence the
                # literal backslashes in the patterns.
                comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
                ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>', proparam.content)
                nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>', proparam.content)

                # zip() stops at the shortest list instead of raising
                # IndexError when the three lists differ in length.
                for raw_comment, raw_time, raw_nick in zip(comments, ctime, nicks):
                    curtime = TimeUtility.getuniformtime(raw_time.replace('\\', ''))
                    # SECURITY(review): eval() runs on text scraped from a
                    # remote site; it is used to decode \uXXXX escapes.
                    # Replace with a non-executing decoder once verified.
                    content = eval('u"' + raw_comment + '"').encode('utf-8')
                    nick = eval('u"' + raw_nick + '"').encode('utf-8')
                    if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
            else:
                Logger.getlogging().error("proparam.step == %d", proparam.step)

        except Exception:
            traceback.print_exc()
Beispiel #13
0
 def step1(self, params):
     """Store the comment total of a Huxiu article and queue new pages.

     The article id is taken from ``params.originalurl``; URLs that do not
     match are logged with ``ERRORCODE_SITE_NOGET_TEMPLATE``. The comment
     total is read from the page via XPath; a falsy total is logged with
     ``ERRORCODE_SITE_NOGET_COMMNETS``. One URL per missing page (capped at
     ``self.maxpages``) is queued for ``HuxiupostComments.EACH``.
     """
     pattern = 'https://www.huxiu.com/article/(\d+).html'
     if not self.r.search(pattern, params.originalurl):
         Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
         return
     object_id = self.r.parse(pattern, params.originalurl)[0]

     site_total = XPathUtility(
         params.content).getnumber('//*[@class="article-pl pull-left"]')
     if not site_total:
         Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     NewsStorage.setcmtnum(params.originalurl, site_total)

     stored = CMTStorage.getcount(params.originalurl, True)
     if stored >= site_total:
         return

     # Pages needed to cover only the comments not stored yet.
     page_total = int(math.ceil(float(site_total - stored) / self.page_size))
     if page_total >= self.maxpages:
         page_total = self.maxpages
     for page in range(1, page_total + 1):
         self.storeurl(
             self.COMMONURL.format(object_id=object_id, page=page),
             params.originalurl, HuxiupostComments.EACH)
    def step2(self, params):
        """Store the comment total and queue the remaining comment-page URLs.

        ``topic_id`` and ``cmt_sum`` are read from the JSON response. When
        new comments exist, one URL per missing page (capped at
        ``self.maxpages``) is queued for ``self.STEP_COMMENT_NEXT_PAGE``.
        URLs on a ``tv.sohu.com`` host use ``self.tv_client_id`` and
        ``self.tv_page_size``; all others use ``self.client_id`` and
        ``self.page_size``.

        Any ``Exception`` is logged and additionally reported with
        ``ERRORCODE_SITE_NOGET_SITE``.
        """
        try:
            comments = json.loads(params.content)
            topic_id = comments['topic_id']
            curcmtnum = float(comments.get('cmt_sum', -1))
            NewsStorage.setcmtnum(params.originalurl, curcmtnum)

            dbcmtnum = CMTStorage.getcount(params.originalurl, True)
            if dbcmtnum >= curcmtnum:
                return
            page_num = int(
                math.ceil(float(curcmtnum - dbcmtnum) / self.page_size))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            # The host check does not depend on the page, so do it once.
            if self.r.search(r'http[s]{0,1}://.*tv\.sohu.com/.*',
                             params.originalurl):
                client_id, page_size = self.tv_client_id, self.tv_page_size
            else:
                client_id, page_size = self.client_id, self.page_size
            for page in range(1, page_num + 1):
                url = self.COMMENTS_URL.format(client_id, topic_id,
                                               page, page_size)
                self.storeurl(url, params.originalurl,
                              self.STEP_COMMENT_NEXT_PAGE)
        except Exception:
            Logger.printexception()
            Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
Beispiel #15
0
    def step2_ebook(self, params):
        """Queue the comment-page URLs of an e-book.

        Per the original comment this path only applies to the QQ Reading
        part of the site. ``data.total`` and ``data.pageCount`` are read
        from the JSON response; a response without ``data`` is logged with
        ``ERRORCODE_SITE_NOGET_COMMNETS``. When new comments exist, pages
        ``1..min(pageCount, self.maxpages)`` are queued for
        ``self.STEP_COMMENT_NEXT_PAGE``.

        Any ``Exception`` is logged with ``Logger.printexception`` and
        swallowed.
        """
        try:
            bid = params.customized['bid']
            jsoncontent = json.loads(params.content)
            # `in` replaces dict.has_key, which no longer exists in Python 3.
            if 'data' not in jsoncontent:
                Logger.log(params.originalurl,
                           constant.ERRORCODE_SITE_NOGET_COMMNETS)
                return
            comments_count = jsoncontent['data']['total']
            page_count = jsoncontent['data']['pageCount']
            # Incremental check against the comments already stored.
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            if cmtnum >= comments_count:
                return

            # Cap the page count; keep it an int so range() accepts it even
            # when the API delivers the value as a string.
            page_count = min(int(page_count), self.maxpages)

            for page in range(1, page_count + 1, 1):
                commentinfo_url = self.EBOOK_COMMENTS_URL.format(site='intro',
                                                                 bid=bid,
                                                                 page=page)
                self.storeurl(commentinfo_url, params.originalurl,
                              self.STEP_COMMENT_NEXT_PAGE)
        except Exception:
            Logger.printexception()
Beispiel #16
0
 def step2bbs(self, params):
     """Store the BBS reply total and queue the reply pages still missing.

     The total is read from ``result.mainreplys.page.totalRows`` of the
     JSON response; if that fails, a warning with error code 40000 is
     logged and the method returns. Otherwise one URL per missing page
     (capped at ``self.maxpages``) is queued for ``JoyComments.STEP_3_BBS``.
     """
     Logger.getlogging().info("JoyComments.STEP_2")
     topic_id = params.customized['topic_id']
     domain = params.customized['domain']
     try:
         commentsinfo = json.loads(params.content)
         comments_count = commentsinfo['result']['mainreplys']['page'][
             'totalRows']
         NewsStorage.setcmtnum(params.originalurl, comments_count)
     except Exception:
         # Malformed or unexpected payload: report and give up on this URL.
         Logger.getlogging().warning(
             '{url} Errorcode:40000'.format(url=params.originalurl))
         return
     # Incremental check against the comments already stored.
     cmtnum = CMTStorage.getcount(params.originalurl, True)
     if cmtnum >= comments_count:
         return
     page_num = int(
         math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
     if page_num >= self.maxpages:
         page_num = self.maxpages
     for index in range(1, page_num + 1, 1):
         commentinfo_url = JoyComments.COMMENT_URL.format(topic_id=topic_id,
                                                          domain=domain,
                                                          page=index)
         self.storeurl(commentinfo_url, params.originalurl,
                       JoyComments.STEP_3_BBS)
Beispiel #17
0
    def get_url_id(self, params):
        """Queue the comment-id lookup for a Tencent Video page.

        Only applies to the Tencent Video part of the site. The first URL
        pattern that matches ``params.originalurl`` decides whether the
        lookup is made by cid or by vid. Afterwards the publish date (from
        the page content, falling back to an XPath lookup) is stored when
        available, and ``self.setclick`` is called.
        """
        # cid identifies a TV series / collection / movie; vid a single episode.
        CID_PATTERN = 'https?://v\.qq\.com/x/cover/(\w+).html'
        CID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid={cid}'
        VID_PATTERN1 = 'https?://v\.qq\.com/x/cover/\w+/(\w+).html'
        VID_PATTERN2 = 'https?://v\.qq\.com/x/page/(\w+)\.html'
        VID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&vid={vid}'

        # Tried in order; the first matching pattern wins.
        rules = (
            (CID_PATTERN, CID_URL, 'cid'),
            (VID_PATTERN1, VID_URL, 'vid'),
            (VID_PATTERN2, VID_URL, 'vid'),
        )
        for pattern, template, key in rules:
            if not self.r.search(pattern, params.originalurl):
                continue
            ident = self.r.parse(pattern, params.originalurl)[0]
            url = template.format(**{key: ident})
            self.storeurl(url, params.originalurl,
                          self.STEP_COMMENT_FIRST_PAGE)
            break

        # Publish date: value embedded in the content first, XPath second.
        publish_date = self.r.getid('publish_date', params.content, split=':')
        if not publish_date:
            date_text = XPathUtility(params.content).getstring(
                '//*[@class="video_tags"]/span|//*[@class="date"]|//*[@class="tag_item"]'
            )
            publish_date = TimeUtility.getuniformtime(date_text)
        if publish_date:
            NewsStorage.setpublishdate(params.originalurl, publish_date)
        self.setclick(params)
Beispiel #18
0
    def step2(self, params):
        """Store the comment total and queue the comment pages still missing.

        ``tcount`` is read from the JSON response and stored; if the
        payload cannot be read, a warning with code 30000 is logged and the
        method returns. When the site reports more comments than
        ``CMTStorage`` holds, the number of missing pages (capped at
        ``self.maxpages``) is derived from ``VComments.limit``: page 1 is
        handled by ``self.step3(params)`` and each later page is queued for
        ``VComments.V_STEP_3``.

        The original had the count lookup and the page computation
        indented after ``return`` statements (unreachable) and tested an
        undefined name ``page``, so it always failed with ``NameError``.
        """
        try:
            threadId = params.customized['threadId']
            jsondata = json.loads(params.content)
            comment_totalnum = jsondata['tcount']
            NewsStorage.setcmtnum(params.originalurl, comment_totalnum)
        except Exception:
            Logger.getlogging().warning('{}:30000 No comments'.format(
                params.originalurl))
            return
        comment_totalnum = int(comment_totalnum)
        # Incremental check against the comments already stored.
        cmtnum = CMTStorage.getcount(params.originalurl, True)
        if cmtnum >= comment_totalnum:
            return

        pages = int(
            math.ceil(float(comment_totalnum - cmtnum) / VComments.limit))
        if pages > self.maxpages:
            pages = self.maxpages
        for page in range(1, pages + 1, 1):
            if page == 1:
                # The first page is processed directly from these params.
                self.step3(params)
                continue
            # NOTE(review): the offset is page * limit as in the original,
            # so page 2 starts at 2 * limit - confirm against the API
            # whether (page - 1) * limit was intended.
            comment_url = VComments.COMMENT_URL.format(threadId=threadId,
                                                       limit=VComments.limit,
                                                       offset=page *
                                                       VComments.limit)
            self.storeurl(comment_url, params.originalurl, VComments.V_STEP_3,
                          {'threadId': threadId})
Beispiel #19
0
    def step1(self, params):
        """Store the publish date and queue the first comment request.

        The group id is the first path segment of ``params.originalurl``
        with its first character dropped. The publish date is looked up in
        the content under ``time`` and then ``publish_time``. Without a
        group id an error with code 30000 is logged and nothing is queued.

        A URL that does not match the pattern now reaches that error path
        instead of raising ``IndexError`` on the empty match list.
        """
        Logger.getlogging().info("ToutiaoNewsComments.STEP_1")
        matches = self.r.parse(r'http://www.toutiao.com/(\w+)/.*', params.originalurl)
        group_id = matches[0] if matches else ''
        if group_id:
            # Drop the leading character of the path segment.
            group_id = group_id[1:]

        try:
            publishdate = self.r.getid("time", params.content)
            if not publishdate:
                publishdate = self.r.getid("publish_time", params.content)
            if publishdate:
                NewsStorage.setpublishdate(params.originalurl, publishdate)
        except Exception:
            Logger.getlogging().error('{0}:30000'.format(params.originalurl))
        if not group_id:
            Logger.getlogging().error('{0}:30000'.format(params.originalurl))
            return

        commentinfo_url = ToutiaoNewsComments.COMMENTS_URL.format(group_id, 0, self.page_size)
        self.storeurl(commentinfo_url, params.originalurl, ToutiaoNewsComments.STEP_2, {'group_id': group_id})
 def getComments(self, params, url):
     """Store the posts of one forum page as comments.

     ``url`` is used as the pattern whose first match in ``params.url`` is
     the current page number. On page ``'1'`` the first post is skipped
     (it is the article body). Every remaining post with a
     ``postmessage defaultpost`` block is stored, with nick ``'none'``,
     unless ``CMTStorage`` already has it; finally the stored count is
     written as the comment number.
     """
     # Current comment page number.
     page_no = self.r.parse(url, params.url)[0]
     soup = BeautifulSoup(params.content, 'html5lib')
     # One cell per post.
     posts = soup.select('tr > td.postcontent')
     # The first post of page 1 is the article body, not a comment.
     skip = 1 if page_no == '1' else 0
     for post in posts[skip:]:
         message = post.select_one('div[class="postmessage defaultpost"]')
         if not message:
             continue
         content = message.get_text()\
             .replace('\t', '').replace('\n', '').replace(' ', '').strip()
         # Drop the 4-character prefix of the post-info text, append ':00'.
         stamp = post.select_one(
             'div.postinfo > font').get_text().strip()[4:] + ':00'
         curtime = getuniformtime(stamp)
         nick = 'none'
         already_stored = CMTStorage.exist(params.originalurl, content,
                                           curtime, nick)
         if not already_stored:
             CMTStorage.storecmt(params.originalurl, content, curtime,
                                 nick)
     stored_total = CMTStorage.getcount(params.originalurl)
     NewsStorage.setcmtnum(params.originalurl, stored_total)
 def step2(self, params):
     """Store the comment total and queue the comment pages still missing.

     The total is ``pageTurn.rowCount`` of the JSON response. For each
     missing page (capped at ``self.maxpages``) a URL is queued for
     ``Comments.STEP_3``; on page 1 ``self.step3(params)`` is called as
     well, and that page is still queued afterwards.
     """
     operaId = params.customized['operaId']
     contentId = params.customized['contentId']
     # Parsed JSON response of the comment API.
     payload = json.loads(params.content)
     site_total = int(payload['pageTurn']['rowCount'])
     NewsStorage.setcmtnum(params.originalurl, site_total)
     stored = CMTStorage.getcount(params.originalurl, True)
     if stored >= site_total:
         return
     # Pages needed to cover only the comments not stored yet.
     page_total = int(math.ceil(float(site_total - stored) / self.PAGE_SIZE))
     if page_total >= self.maxpages:
         page_total = self.maxpages
     # Build each page URL and hand it to the download platform.
     for page in range(1, page_total + 1, 1):
         if page == 1:
             self.step3(params)
         page_url = Comments.COMMENTS_URL % (operaId, contentId, page,
                                             Comments.PAGE_SIZE)
         extra = {'operaId': operaId, 'contentId': contentId}
         self.storeurl(page_url, params.originalurl, Comments.STEP_3, extra)
Beispiel #22
0
 def process(self, params):
     """Extract page info with the URL's templates and store it.

     In the S2 channel, requests without ``SPIDER_S2_WEBSITE_TYPE`` in
     ``params.customized`` are skipped and ``True`` is returned. Otherwise
     every template from ``TemplateManager.getxpaths`` is applied in turn
     (the result of the last one is kept), missing title/body/pubtime are
     filled from ``params``, and the result goes to
     ``NewsStorage.seturlinfos``. That path returns ``None``.
     """
     # S2 query process.
     if (SPIDER_CHANNEL_S2 == SpiderConfigure.getinstance().getchannel()
             and SPIDER_S2_WEBSITE_TYPE not in params.customized):
         return True
     xparser = XPathUtility(params.content)
     pageinfo = PageBasicInfo()
     template = None
     for template in TemplateManager.getxpaths(params.url):
         Logger.getlogging().debug('URL_TEMPLATE {url}\t{template}'.format(
             url=params.url,
             template=template[TemplateManager.XPATH_KEY_URL_TEMPLATE]))
         pageinfo, _ = self.parsefromcontent(params, template, xparser)
         if constant.SPIDER_S2_WEBSITE_TYPE in params.customized:
             pageinfo.type = params.customized[
                 constant.SPIDER_S2_WEBSITE_TYPE]
     if template is None:
         # No template was available for this URL.
         Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
     # Fill in whatever the templates did not provide.
     pageinfo.url = params.url
     fallbacks = (
         ('title', 'page_title'),
         ('body', 'page_body'),
         ('pubtime', 'html_time'),
     )
     for field, source in fallbacks:
         if not getattr(pageinfo, field):
             setattr(pageinfo, field, getattr(params, source))
     NewsStorage.seturlinfos(pageinfo)
    def setp_2(self, params):
        """Store the comment total and fetch the pages not yet collected.

        The ``total`` field of the JSON response is saved as the comment
        count. If it is non-zero and larger than what ``CMTStorage`` holds,
        page 1 is handled by ``self.geturlcomments(params)`` and each later
        page (capped at ``self.maxpages``) is queued for
        ``LeComments.STEP_4``. URLs on ``zongyi.le.com`` use
        ``COMMENTS_URL_ZONGYI1``; all others use ``COMMENTS_URL_TV``.
        """
        total = float(json.loads(params.content)['total'])
        NewsStorage.setcmtnum(params.originalurl, total)
        if int(total) == 0:
            return

        # Incremental check against the comments already stored.
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total:
            return
        missing_pages = int(
            math.ceil(float(total - stored) / self.PAGE_SIZE))
        if missing_pages >= self.maxpages:
            missing_pages = self.maxpages

        # Parameter needed to build the comment URLs.
        pid = params.customized['pid']

        # Variety-show pages use their own URL template.
        is_zongyi = re.match(r'^http://zongyi\.le\.com/.*', params.url)
        for page in range(1, missing_pages + 1, 1):
            if page == 1:
                self.geturlcomments(params)
                continue
            if is_zongyi:
                template = LeComments.COMMENTS_URL_ZONGYI1
            else:
                template = LeComments.COMMENTS_URL_TV
            url = template % (page, pid)
            self.storeurl(url, params.originalurl, LeComments.STEP_4)
Beispiel #24
0
 def step2(self, params):
     """Store the comment total and queue the comment pages still missing.

     A response without ``thread``, or with a total of zero, is logged
     with ``ERRORCODE_SITE_NOGET_COMMNETS``. Otherwise ``cursor.total`` is
     stored and one URL per missing page (capped at ``self.maxpages``) is
     queued for ``NarutomVideoComments.STEP_3``.
     """
     payload = json.loads(params.content)
     if 'thread' not in payload:
         Logger.log(params.originalurl,
                    constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     threadid = payload['thread']['thread_id']
     site_total = int(payload['cursor']['total'])
     # No comments at all: report and stop.
     if site_total == 0:
         Logger.log(params.originalurl,
                    constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     # Store the new total, then stop unless it grew since the last crawl.
     NewsStorage.setcmtnum(params.originalurl, site_total)
     stored = CMTStorage.getcount(params.originalurl, True)
     if stored >= site_total:
         return
     page_total = int(
         math.ceil(float(site_total - stored) / self.DEFAULT_PAGE_SIZE))
     if page_total >= self.maxpages:
         page_total = self.maxpages
     for page in range(1, page_total + 1, 1):
         page_url = NarutomVideoComments.COMMENTS_URL.format(
             threadid=threadid,
             limit=NarutomVideoComments.DEFAULT_PAGE_SIZE,
             page=page)
         self.storeurl(page_url, params.originalurl,
                       NarutomVideoComments.STEP_3)
    def getclick(self, params):
        """Queue the play-count request and fill in a missing publish date.

        When ``params.originalurl`` matches the le.com video pattern, the
        play-count URL for the video id is queued for
        ``LeComments.STEP_PALY``.

        The publish date is then set only when the stored value equals
        ``TimeUtility.getintformtime(0)`` (presumably the "unset"
        placeholder - confirm): sports pages take it from the
        ``live-vedio-infor`` element, other pages from an 8-digit run in
        the video title.

        The sports branch previously converted an unassigned ``publishdate``
        instead of the extracted ``pubTime`` and always raised.
        """
        pattern = r'https?://\w+\.le\.com.*/\w+/(\d+)\.html'
        if (re.search(pattern, params.originalurl)
                and self.r.search(pattern, params.originalurl)):
            vid = self.r.parse(pattern, params.originalurl)[0]
            playcount_url = self.PALYCOUNT_URL.format(vid=vid)
            self.storeurl(playcount_url, params.originalurl,
                          LeComments.STEP_PALY)

        if NewsStorage.getpublishdate(
                params.originalurl) == TimeUtility.getintformtime(0):
            if self.r.search(r'https?://sports\.le\.com/video/\d+\.html',
                             params.originalurl):
                # Sports channel only: date text from the info element.
                pubTime = XPathUtility(
                    params.content).getstring('//*[@class="live-vedio-infor"]')
                if pubTime:
                    publishdate = TimeUtility.getuniformtime(pubTime)
                    NewsStorage.setpublishdate(params.originalurl, publishdate)
            else:
                # Variety-show channel only: date embedded in the title.
                title = XPathUtility(params.content).getstring(
                    '//h1[@class="j-video-name video-name"]')
                if title:
                    dates = re.findall(r'\d{8}', title)
                    if dates:
                        NewsStorage.setpublishdate(params.originalurl,
                                                   dates[0])
    def step2(self, params):
        """Queue the Changyan comment-page URLs that still need to be fetched.

        ``params.content`` is parsed as JSON; its ``cmt_sum`` field is stored
        as the current comment count.  When that count exceeds what
        ``CMTStorage`` already holds, one URL per missing page (capped at
        ``self.maxpages``) is built from ``ChangyanComments.COMMENTS_URL`` and
        stored for ``ChangyanComments.STEP_3``.

        :param params: page descriptor; ``params.customized`` must provide
            ``client_id`` and ``commentsApi``.
        :raises KeyError: if one of those two keys is missing.
        """
        # Values handed over by the previous step (only these two are used).
        client_id = params.customized['client_id']
        comments_api = params.customized['commentsApi']

        # Current number of comments reported by the API.
        content = json.loads(params.content)
        curcmtnum = float(content.get('cmt_sum', 0))
        NewsStorage.setcmtnum(params.originalurl, curcmtnum)
        dbcmtnum = CMTStorage.getcount(params.originalurl, True)
        if dbcmtnum >= curcmtnum:
            # Nothing new since the last crawl.
            return

        # The topic id is required to address the comment pages.
        topic_id = content.get('topic_id', '')
        if not topic_id:
            Logger.log(params.originalurl,
                       constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
            return

        # Number of pages needed to cover the not-yet-stored comments.
        page_size = ChangyanComments.PAGE_SIZE
        pages = int(math.ceil(float(curcmtnum - dbcmtnum) / page_size))
        if pages >= self.maxpages:
            pages = self.maxpages

        for page in range(1, pages + 1):
            url = ChangyanComments.COMMENTS_URL.format(
                commentsApi=comments_api,
                client_id=client_id,
                page_no=page,
                page_size=page_size,
                topic_id=topic_id,
            )
            self.storeurl(url, params.originalurl, ChangyanComments.STEP_3)
    def step2_ifeng_xiaobg(self, params):
        """Record click/comment counts and queue the missing comment pages.

        ``params.content`` is parsed as JSON.  A positive ``join_count`` is
        stored as the click number and ``count`` as the comment number.  When
        more comments exist than ``CMTStorage`` holds, page 1 is handled by
        calling ``self.ifengnews_step3(params)`` and every further page (up to
        ``self.maxpages``) is queued as a POST request for
        ``self.IFENG_NEWS_NEXT_PAGE``.

        Any exception is logged through ``Logger.printexception`` and
        swallowed.

        :param params: page descriptor providing ``content`` and
            ``originalurl``.
        """
        try:
            jsoncontent = json.loads(params.content)
            clicknum = float(jsoncontent.get('join_count', '-1'))
            if clicknum > 0:
                NewsStorage.setclicknum(params.originalurl, clicknum)

            # Coerce to a number before comparing/subtracting: an uncast
            # string count would break the arithmetic below.
            curcmtnum = int(jsoncontent['count'])
            NewsStorage.setcmtnum(params.originalurl, curcmtnum)
            dbcmtnum = CMTStorage.getcount(params.originalurl, True)
            if dbcmtnum >= curcmtnum:
                return

            # Number of pages needed to cover the not-yet-stored comments.
            pages = int(math.ceil(
                float(curcmtnum - dbcmtnum) / self.page_size))
            if pages >= self.maxpages:
                pages = self.maxpages

            for index in range(1, pages + 1):
                if index == 1:
                    self.ifengnews_step3(params)
                    continue
                # NOTE(review): the page number is written through
                # self.post_data while IfengNewsComments.post_data is what gets
                # passed on -- this only works if both are the same dict, and
                # the same object is reused for every page.  Confirm that
                # storeposturl serialises it immediately.
                self.post_data['p'] = index
                self.storeposturl(self.post_url, params.originalurl,
                                  self.IFENG_NEWS_NEXT_PAGE,
                                  IfengNewsComments.post_data)
        except Exception:
            # Best effort: a malformed response must not stop the crawl.
            Logger.printexception()
Beispiel #28
0
    def step2_ac(self, params):
        """Queue the comment-page URLs for Tencent comic (ac) video pages.

        The total comment count is read from the pagination node of
        ``params.content`` and stored; the page size is the number of
        ``comment-content-detail`` nodes found on the page.  When the total
        exceeds what ``CMTStorage`` holds, one URL per page (capped at
        ``self.maxpages``) is built from ``self.AC_COMMENTS_URL`` and stored
        for ``self.STEP_COMMENT_NEXT_PAGE``.

        :param params: page descriptor; ``params.customized['url_id']`` is
            required.
        """
        url_id = params.customized['url_id']
        xhtml = etree.HTML(params.content)

        # Original author's note: retrieving the comment count frequently
        # fails with a parameter error, hence the explicit check.
        count_nodes = xhtml.xpath(
            '//*[@id="pagination-node"]/span/em/text()')
        if not count_nodes:
            Logger.log(params.originalurl,
                       constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return
        comments_count = int(count_nodes[0])

        page_size = len(xhtml.xpath('//*[@class="comment-content-detail"]'))

        # Incremental check: stop when nothing new has been posted.
        cmtnum = CMTStorage.getcount(params.originalurl, True)
        NewsStorage.setcmtnum(params.originalurl, comments_count)
        if cmtnum >= comments_count:
            return

        if page_size == 0:
            # No comment nodes on the page: the page size is unknown and the
            # division below would raise ZeroDivisionError.
            Logger.log(params.originalurl,
                       constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return

        page_num = int(math.ceil(float(comments_count) / page_size))
        if page_num >= self.maxpages:
            page_num = self.maxpages
        for page in range(1, page_num + 1):
            url = self.AC_COMMENTS_URL.format(url_id, page)
            self.storeurl(url, params.originalurl, self.STEP_COMMENT_NEXT_PAGE)
    def step2news(self, params):
        """Store the comment total and queue the news comment pages to fetch.

        The total, page count, thread id and object id are extracted from
        ``params.content`` with ``self.r.parse``.  The total is stored; when
        it exceeds the count held by ``CMTStorage``, page 1 is handled by
        calling ``self.step3news(params)`` and each further page (up to
        ``self.maxpages``) is queued from
        ``U17NewsComments.COMMENT_URL_NEWS`` for ``U17NewsComments.STEP_3``.

        :param params: page descriptor providing ``content`` and
            ``originalurl``.
        """
        body = params.content
        total = self.r.parse("\"total\"\:(\d+)", body)[0]
        # Extracted like the other fields but not used further down.
        page_count = self.r.parse("\"total_page\"\:(\d+)", body)[0]
        thread_id = self.r.parse("\"thread_id\"\:\"(\d+)\"", body)[0]
        object_id = self.r.parse("\"object_id\"\:\"(\d+)\"", body)[0]

        total = int(total)
        NewsStorage.setcmtnum(params.originalurl, total)
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= total:
            return

        # Pages needed for the comments not yet stored, capped at maxpages.
        needed = int(math.ceil(float(total - stored) / self.page_size))
        last_page = self.maxpages if needed >= self.maxpages else needed
        if last_page < 1:
            return

        # Page 1 is processed directly from the current params.
        self.step3news(params)
        for page_no in range(2, last_page + 1):
            comment_url = U17NewsComments.COMMENT_URL_NEWS.format(
                threadid=thread_id,
                objectid=object_id,
                page=page_no,
                pagesize=self.page_size,
                comicid=object_id)
            self.storeurl(comment_url, params.originalurl,
                          U17NewsComments.STEP_3)
Beispiel #30
0
    def step1(self, params):
        """Record the click count if missing and queue the first comment page.

        When no positive click number is stored for ``params.originalurl``,
        the value inside the page's popularity ``<span>`` is stored as the
        click number.  The numeric comic id is then taken from the URL and the
        first comment page (built from ``MkzhanComments.COMMENTS_URL`` with
        ``self.PAGE_SIZE``) is stored for ``MkzhanComments.STEP_2`` with the
        comic id attached.

        Returns without queuing anything when the URL carries no comic id.

        :param params: page descriptor providing ``content`` and
            ``originalurl``.
        """
        Logger.getlogging().info("MkzhanComments.STEP_1")

        if NewsStorage.getclicknum(params.originalurl) <= 0:
            click_pattern = r'<span>人气:\s<b>(.*?)<\/b>'
            if self.r.search(click_pattern, params.content):
                clicknum = self.r.parse(click_pattern, params.content)[0]
                NewsStorage.setclicknum(params.originalurl, clicknum)

        # Extract the comic id; check for a match before indexing so that a
        # non-matching URL returns quietly instead of raising IndexError.
        id_matches = self.r.parse(r'^http[s]?://www\.mkzhan\.com/(\d+)/.*',
                                  params.originalurl)
        if not id_matches:
            return
        comic_id = int(id_matches[0])
        if not comic_id:
            return

        # Build the URL of the first comment page.
        comments_url = MkzhanComments.COMMENTS_URL % (comic_id, 1,
                                                      self.PAGE_SIZE)
        self.storeurl(comments_url, params.originalurl, MkzhanComments.STEP_2,
                      {'comic_id': comic_id})