Beispiel #1
0
    def getpagecomments(self, params):
        """Pick matching, recent results off a search page and store their URLs.

        A result is kept when ``Common.checktitle`` accepts its title for the
        query in ``params.customized['query']`` and its publish date lies no
        more than ``self.querylastdays`` days before today. Each kept link is
        passed through ``self.preprocess``; links for which that returns
        ``None`` are dropped. The surviving URLs are stored in one batch under
        ``SPIDER_S2_WEBSITE_TIEBA``.
        """
        keyword = params.customized['query']

        page = XPathUtility(html=params.content)
        links = page.xpath('//*[@class="sosResult"]/strong/a/@href')
        headings = page.getlist('//*[@class="sosResult"]/strong/a')
        datenodes = page.xpath('//*[@class="sosResult"]/span/cite[3]')

        current_day = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        accepted = []
        for pos, heading in enumerate(headings):
            # Skip results whose title fails the keyword check.
            if not Common.checktitle(keyword, heading):
                continue
            # Only the date part (text before the first space) is compared.
            day_text = TimeUtility.getuniformtime(
                datenodes[pos].text).split(' ')[0]
            published = datetime.datetime.strptime(
                day_text, TimeUtility.DATE_FORMAT_DEFAULT).date()
            age = current_day - published
            # Skip results that fall outside the configured period.
            if age.days > int(self.querylastdays):
                continue
            candidate = self.preprocess(links[pos])
            if candidate is not None:
                accepted.append(candidate)

        if accepted:
            self.__storeurllist__(accepted, SPIDER_S2_WEBSITE_TIEBA)
Beispiel #2
0
    def process(self, params):
        """Run one step of the two-step comment crawl for an article.

        STEP_1 extracts the numeric id from the page content and queues the
        first comment page. STEP_2 parses one comment page, stores every
        top-level comment and child reply that is not stored yet, then queues
        the following page. A comment page without matching entries ends the
        crawl.

        Exceptions are logged through ``Logger.printexception`` and do not
        propagate to the caller.
        """
        try:
            # Step markers are compared by value so the check does not depend
            # on object identity.
            if params.step == AllComments.STEP_1:
                try:
                    action = self.r.parse(r'"id":(\d+)', params.content)[0]
                except Exception:
                    # Best effort: without an id there is no comment URL to build.
                    return
                comments_url = AllComments.COMMENTS_URL.format(id=action, page=1)
                self.storeurl(comments_url, params.originalurl,
                              AllComments.STEP_2,
                              {'action': action, 'page': 1})
            elif params.step == AllComments.STEP_2:
                soup = BeautifulSoup(params.content, 'html5lib')
                divs = soup.find_all(attrs={
                    'id': re.compile('comment'),
                    'class': re.compile('note-comment')
                })
                if not divs:
                    return
                for div in divs:
                    content = div.select_one('div.content > p').get_text()
                    pubtime = div.select_one(
                        'div.content > .meta-top > .reply-time > a').get_text()
                    curtime = TimeUtility.getuniformtime(pubtime)
                    if not CMTStorage.exist(params.originalurl, content, curtime, ''):
                        CMTStorage.storecmt(params.originalurl, content, curtime, '')

                    # Replies nested under this comment; their text is stored
                    # with all whitespace removed.
                    for item in div.select(
                            '.content > .child-comment-list > .child-comment'):
                        comment = item.select_one('p').get_text()
                        content = ''.join(comment.split())
                        replytime = item.select_one('.reply-time > a').get_text()
                        curtime = TimeUtility.getuniformtime(replytime)
                        if not CMTStorage.exist(params.originalurl, content, curtime, ''):
                            CMTStorage.storecmt(params.originalurl, content, curtime, '')

                # Queue the next comment page; the crawl stops once a page
                # yields no comment entries (see the early return above).
                action = params.customized['action']
                next_page = int(params.customized['page']) + 1
                comments_url = AllComments.COMMENTS_URL.format(id=action,
                                                               page=next_page)
                self.storeurl(comments_url, params.originalurl,
                              AllComments.STEP_2,
                              {'action': action, 'page': next_page})
        except Exception:
            Logger.printexception()
          
                    
                                
                                
                                
                            
                            
                
                
                
Beispiel #3
0
    def step2(self, params):
        """Store the first comment page and queue the remaining pages.

        Stores each comment in ``data.list`` that is not stored yet, records
        the total from ``data.count`` via ``NewsStorage.setcmtnum``, and then
        queues STEP_3 downloads for pages 2 onwards, capped at
        ``self.maxpages``. Nothing is queued when the number of comments
        stored before this call already reaches the reported total.
        """
        Logger.getlogging().info("MkzhanComments.STEP_2")
        comic_id = params.customized['comic_id']
        payload = json.loads(params.content)
        total = int(payload['data']['count'])
        # Read the stored count before this page's comments are added.
        stored = CMTStorage.getcount(params.originalurl, True)

        # Persist the comments delivered with the first page.
        for entry in payload['data']['list']:
            text = entry['content']
            posted = TimeUtility.getuniformtime(entry['create_time'])
            author = entry['username']
            if CMTStorage.exist(params.originalurl, text, posted, author):
                continue
            CMTStorage.storecmt(params.originalurl, text, posted, author)

        # Record the comment total reported by the response.
        NewsStorage.setcmtnum(params.originalurl, total)

        if stored >= total:
            Logger.getlogging().warning(
                '{url}:30000 No comments'.format(url=params.originalurl))
            return

        missing = total - stored
        last_page = int(math.ceil(float(missing) / self.PAGE_SIZE))
        if last_page >= self.maxpages:
            last_page = self.maxpages

        # Page 1 is the response handled above, so queueing starts at page 2.
        for page in range(2, last_page + 1):
            page_url = MkzhanComments.COMMENTS_URL % (comic_id, page,
                                                      self.PAGE_SIZE)
            self.storeurl(page_url, params.originalurl, MkzhanComments.STEP_3)
Beispiel #4
0
 def setclicknum(self, params):
     """Record the click count and publish date found in a JSON response.

     Reads ``cmtVote`` and ``createTime`` from ``params.content`` and stores
     them for ``params.originalurl``: the former as the click count, the
     latter (after ``TimeUtility.getuniformtime``) as the publish date.
     Any failure is logged via ``Logger.printexception`` instead of raised.
     """
     try:
         payload = json.loads(params.content)
         # Both fields are read before anything is stored.
         votes, created = payload['cmtVote'], payload['createTime']
         NewsStorage.setclicknum(params.originalurl, votes)
         uniform_created = TimeUtility.getuniformtime(created)
         NewsStorage.setpublishdate(params.originalurl, uniform_created)
     except:
         Logger.printexception()
Beispiel #5
0
 def step3(self, params):
     """Store the comments carried by one follow-up comment page.

     Every entry of ``data.list`` in the JSON response is stored for
     ``params.originalurl`` unless ``CMTStorage.exist`` reports it already.
     """
     Logger.getlogging().info("MkzhanComments.STEP_3")
     payload = json.loads(params.content)
     for entry in payload['data']['list']:
         text = entry['content']
         posted = TimeUtility.getuniformtime(entry['create_time'])
         author = entry['username']
         if CMTStorage.exist(params.originalurl, text, posted, author):
             continue
         CMTStorage.storecmt(params.originalurl, text, posted, author)
    def process(self, params):
        """Run one step of the Pcauto comment crawl.

        * ``params.step is None``: queue the first comment page as STEP_2.
        * STEP_2: read the comment total, record it, and queue one STEP_3
          download per page that may hold comments not stored yet (capped at
          ``self.maxpages``).
        * STEP_3: store every comment of the page that is not stored yet.

        Exceptions are printed with ``traceback.print_exc`` and swallowed.
        """
        try:
            if params.step is None:
                # Build the URL of the first comment page and queue it.
                comments_url = PcautoComments.COMMENTS_URL % (
                    params.originalurl, 1, PcautoComments.PAGE_SIZE)
                self.storeurl(comments_url, params.originalurl,
                              PcautoComments.STEP_2)

            elif params.step == PcautoComments.STEP_2:
                comments = json.loads(params.content)
                comments_count = int(comments['total'])
                NewsStorage.setcmtnum(params.originalurl, comments_count)
                if comments_count == 0:
                    return
                # Incremental crawl: stop when nothing new has been posted.
                cmtnum = CMTStorage.getcount(params.originalurl, True)
                if cmtnum >= comments_count:
                    return
                page_num = int(
                    math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
                if page_num >= self.maxpages:
                    page_num = self.maxpages
                # Queue every comment page that may contain new comments.
                for page in range(1, page_num + 1):
                    comment_url = PcautoComments.COMMENTS_URL % (
                        params.originalurl, page, PcautoComments.PAGE_SIZE)
                    self.storeurl(comment_url, params.originalurl,
                                  PcautoComments.STEP_3)

            elif params.step == PcautoComments.STEP_3:
                commentsinfo = json.loads(params.content)
                for comment in commentsinfo['data']:
                    updatetime = comment['createTime']
                    content = comment['content']
                    curtime = TimeUtility.getuniformtime(updatetime)
                    # Comments without a nickname are stored as 'anonymous'.
                    try:
                        nick = comment['nickName']
                    except KeyError:
                        nick = 'anonymous'

                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)

        except Exception:
            traceback.print_exc()
 def geturlcomments(self, params):
     """Store each comment listed under ``data`` in a JSON response.

     Does nothing when ``data`` is empty or otherwise falsy. Comments that
     ``CMTStorage.exist`` already reports are skipped. Any failure is logged
     via ``Logger.printexception`` instead of being raised.
     """
     try:
         entries = json.loads(params.content)['data']
         if not entries:
             return
         for entry in entries:
             text = entry['content']
             posted = TimeUtility.getuniformtime(entry['createTime'])
             author = entry['nickName']
             if CMTStorage.exist(params.originalurl, text, posted, author):
                 continue
             CMTStorage.storecmt(params.originalurl, text, posted, author)
     except:
         Logger.printexception()
 def getinfo(self, params):
     """Store the article statistics found in a JSON response.

     Reads ``readnum``, ``praisenum``, ``favoritenum``, ``publishtime`` and
     ``title`` from the ``article`` object and passes them to
     ``NewsStorage.seturlinfo`` as ``clicknum``, ``votenum``, ``fansnum``,
     ``publishdate`` and ``title``. Any failure is logged via
     ``Logger.printexception`` instead of being raised.
     """
     try:
         article = json.loads(params.content)['article']
         reads = article['readnum']
         praises = article['praisenum']
         favorites = article['favoritenum']
         published = TimeUtility.getuniformtime(article['publishtime'])
         heading = article['title']
         NewsStorage.seturlinfo(params.originalurl, '', '', {
             "title": heading,
             "clicknum": reads,
             "votenum": praises,
             "fansnum": favorites,
             "publishdate": published
         })
     except:
         Logger.printexception()
    def getyueduurlcomment(self, proparam):
        """Store each comment listed under ``data`` in a JSON response.

        The ``posttime`` of every entry is stripped and converted with
        ``TimeUtility.getuniformtime``; entries already reported by
        ``CMTStorage.exist`` are skipped. On any failure the exception is
        logged, followed by an error line naming ``proparam.url``.
        """
        try:
            payload = json.loads(proparam.content)
            for entry in payload['data']:
                posted = TimeUtility.getuniformtime(entry['posttime'].strip())
                text = entry['text']
                author = entry['username']
                if CMTStorage.exist(proparam.originalurl, text, posted, author):
                    continue
                CMTStorage.storecmt(proparam.originalurl, text, posted, author)
        except:
            Logger.printexception()
            log = Logger.getlogging()
            log.error(
                'extract comment error from {site}'.format(site=proparam.url))
 def step2(self, params):
     """Store the comments of one page and report on their age.

     Every element whose id matches ``readfloor_<digits>`` is treated as a
     comment container. Its text and publish time are stored, with the
     time converted by ``TimeUtility.getuniformtime``, unless
     ``CMTStorage.exist`` already reports the comment.

     Returns:
         True when ``self.isnewesttime`` rejects the earliest comment time
         on the page, False otherwise (including a page without comments).
         None when an exception occurred; it is logged, not raised.
     """
     try:
         soup = bs(params.content, 'html5lib')
         containers = soup.find_all(
             attrs={'id': re.compile(r'readfloor_\d+')})
         timelist = []
         for container in containers:
             content = container.find(attrs={
                 'class': 'f14 mb10'
             }).get_text()
             pubdate = container.find(attrs={
                 'class': 'tipTop s6'
             }).get_text()
             # Store the uniform time rather than the raw page text, so the
             # stored value is the one that is also compared below.
             curtime = TimeUtility.getuniformtime(pubdate)
             if not CMTStorage.exist(params.originalurl, content, curtime,
                                     ''):
                 CMTStorage.storecmt(params.originalurl, content, curtime,
                                     '')
             timelist.append(curtime)
         if not timelist:
             # min() raises on an empty list; a page without comments
             # yields a falsy result without logging a spurious error.
             return False
         if not self.isnewesttime(params.originalurl, min(timelist)):
             return True
         return False
     except Exception:
         Logger.printexception()
    def step3tt(self, params):
        """Store the comments of one JSON response and report on their age.

        Comments are read from ``resultDO.entityList``. Each ``replyTime``
        (for example ``Jul 3, 2017 4:46:30 PM``) is converted to
        ``%Y-%m-%d %H:%M:%S`` before the comment is stored.

        Returns:
            False when ``self.isnewesttime`` rejects the earliest time seen
            (the current time is always part of that comparison), True
            otherwise. None when the response is empty, equals
            ``"ERROR_PARAMETER"``, or could not be decoded.
        """
        try:
            jsondata = json.loads(params.content)
            if jsondata:
                publishlist = [
                    TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT)
                ]
                try:
                    if jsondata == "ERROR_PARAMETER":
                        return
                    entitylist = jsondata['resultDO'].get('entityList', [])
                    for comment in entitylist:
                        content = self.strfilter(comment['body'])
                        # The source uses a 12-hour clock with an AM/PM
                        # marker. strptime only honours %p when the hour is
                        # parsed with %I; with %H every PM time was read as
                        # AM.
                        # NOTE(review): %b and %p are locale dependent.
                        curtime = time.strftime(
                            '%Y-%m-%d %H:%M:%S',
                            time.strptime(comment['replyTime'],
                                          '%b %d, %Y %I:%M:%S %p'))
                        nick = comment['userName']
                        publishlist.append(curtime)
                        if not CMTStorage.exist(params.originalurl, content,
                                                curtime, nick):
                            CMTStorage.storecmt(params.originalurl, content,
                                                curtime, nick)
                except Exception:
                    # A broken entry stops extraction for this response, but
                    # the age check below still runs on what was collected.
                    Logger.printexception()
                    Logger.getlogging().error(
                        'extract no comment  from {site}'.format(
                            site=params.url))
                if not self.isnewesttime(params.originalurl, min(publishlist)):
                    return False
                return True

        except Exception:
            Logger.printexception()
            Logger.getlogging().error(
                'extract comment error from {site}'.format(site=params.url))
Beispiel #12
0
    def step3(self, params):
        """Store the comments found under ``comments`` in a JSON response.

        ``comments`` is looked up by key for every key it yields
        (presumably a dict of comment objects; verify against the API).
        When the response cannot be decoded or has no ``comments`` entry, a
        warning is logged and nothing is stored. A comment without a
        ``user.nickname`` is stored with the nickname ``'anonymous'``.
        """
        try:
            jsondata = json.loads(params.content)
            comments = jsondata['comments']
        except Exception:
            Logger.getlogging().warning('{}:30000 No comments'.format(
                params.originalurl))
            return

        for key in comments:
            comment = comments[key]
            try:
                nickname = comment['user']['nickname']
            except Exception:
                nickname = 'anonymous'
            # Convert the creation time to the uniform date format.
            curtime = TimeUtility.getuniformtime(str(comment['createTime']))
            content = comment['content']
            if not CMTStorage.exist(params.originalurl, content, curtime,
                                    nickname):
                CMTStorage.storecmt(params.originalurl, content, curtime,
                                    nickname)
Beispiel #13
0
    def process(self, params):
        """Run one crawl step for an Mtime news or people page.

        URLs under ``http://news.mtime.com/`` use the JSON comment API;
        URLs under ``http://people.mtime.com/`` are scraped from HTML.
        Other URLs are ignored. Exceptions are printed with
        ``traceback.print_exc`` and swallowed.
        """
        try:
            if self.r.search('^http://news.mtime.com/.*', params.originalurl):
                self._mtime_news_step(params)
            elif self.r.search('^http://people.mtime.com/.*', params.originalurl):
                self._mtime_people_step(params)
        except Exception:
            traceback.print_exc()

    def _mtime_jsontext(self, content):
        """Return the text from the first '{' up to the next ';' in content."""
        # Strip first, then search the stripped text: mixing indices of the
        # unstripped text with the stripped one shifts the slice whenever
        # the response starts with whitespace.
        text = content.strip()
        start = text.index('{')
        # NOTE(review): a ';' inside the JSON itself still cuts the payload
        # short, exactly as before.
        return text[start:text.index(';', start)]

    def _mtime_news_step(self, params):
        """Handle STEP_1..STEP_3 for a news.mtime.com article."""
        if params.step == MtimeComments.STEP_1:
            Logger.getlogging().info("MtimeComments.STEP_1")
            topic_id = self.r.parse(
                r'^http://news.mtime.com/\d+/\d+/\d+\/(\d+)\.html',
                params.originalurl)[0]
            # Build the first comment page URL from the original URL.
            commentinfo_url = MtimeComments.COMMENT_URL.format(
                topic_id=topic_id, page=0)
            self.storeurl(commentinfo_url, params.originalurl,
                          MtimeComments.STEP_2, {'topic_id': topic_id})
        elif params.step == MtimeComments.STEP_2:
            Logger.getlogging().info("MtimeComments.STEP_2")
            topic_id = params.customized['topic_id']
            params.content = self._mtime_jsontext(params.content)
            commentsinfo = json.loads(params.content)
            comments_count = commentsinfo['value']['totalCount']
            if comments_count:
                NewsStorage.setcmtnum(params.originalurl, comments_count)
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            # Incremental crawl: stop when nothing new has been posted.
            if cmtnum >= comments_count:
                return
            page_size = float(commentsinfo['value']['pageSize'])
            page_num = int(
                math.ceil(float(int(comments_count) - cmtnum) / page_size))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            for index in range(1, page_num + 1):
                commentinfo_url = MtimeComments.COMMENT_URL.format(
                    topic_id=topic_id, page=index)
                self.storeurl(commentinfo_url, params.originalurl,
                              MtimeComments.STEP_3)
        elif params.step == MtimeComments.STEP_3:
            Logger.getlogging().info("MtimeComments.STEP_3")
            # Extract the comments from a page queued by STEP_2.
            params.content = self._mtime_jsontext(params.content)
            commentsinfo = json.loads(params.content)
            for item in commentsinfo['value']['comments']:
                content = item['content']
                curtime = TimeUtility.getuniformtime(item['enterTime'])
                nick = item['nickName']
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
        else:
            Logger.getlogging().error(
                'proparam.step == {step}'.format(step=params.step))

    def _mtime_people_step(self, params):
        """Handle STEP_1..STEP_3 for a people.mtime.com page."""
        if params.step == MtimeComments.STEP_1:
            Logger.getlogging().info("MtimeComments.STEP_1")
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if cmtnum:
                NewsStorage.setcmtnum(params.originalurl, cmtnum)
            docId = self.r.parse(r'^http://people.mtime.com/(\d+)/',
                                 params.originalurl)[0]
            # Build the first comment page URL from the original URL.
            commentinfo_url = MtimeComments.COMMENT_URL_PEOPLE1.format(docId=docId)
            self.storeurl(commentinfo_url, params.originalurl,
                          MtimeComments.STEP_2, {'docId': docId})
        elif params.step == MtimeComments.STEP_2:
            Logger.getlogging().info("MtimeComments.STEP_2")
            docId = params.customized['docId']
            soup = BeautifulSoup(params.content, 'html5lib')
            page = soup.select('.num')
            # Re-queue the page just downloaded for extraction, then queue
            # one further page per '.num' element found on it.
            self.storeurl(params.url, params.originalurl, MtimeComments.STEP_3)
            for index in range(2, len(page) + 2):
                commentinfo_url = MtimeComments.COMMENT_URL_PEOPLE2.format(
                    docId=docId, page=index)
                self.storeurl(commentinfo_url, params.originalurl,
                              MtimeComments.STEP_3)
        elif params.step == MtimeComments.STEP_3:
            Logger.getlogging().info("MtimeComments.STEP_3")
            soup = BeautifulSoup(params.content, 'html5lib')
            comments = soup.select('div.mod_short')
            commentTimes = soup.select('span.fl')
            nicks = soup.select('p.px14')
            for index, comment in enumerate(comments):
                # NOTE(review): str.replace is not a regex call, so this only
                # removes a literal backslash followed by 's'. Whitespace
                # removal may have been intended; kept as is so stored
                # content stays comparable with earlier runs.
                content = comment.get_text().strip().replace('\\s', '')
                # The time of comment N is read from 'span.fl' element N + 1
                # (presumably the first such element is not a comment time;
                # TODO confirm).
                curtime = TimeUtility.getuniformtime(
                    self.r.parse(u'entertime="(.+?)"',
                                 str(commentTimes[index + 1]))[0])
                nick = nicks[index].get_text().strip()
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
        else:
            Logger.getlogging().error(
                'proparam.step == {step}'.format(step=params.step))
Beispiel #14
0
    def process(self, params):
        """Run one step of the forum-thread comment crawl.

        STEP_1 stores the text of the first post (the element with class
        ``post_message post_first``) as the body, reads the page count from
        the second-to-last ``.pager > a`` link, caps it at
        ``self.pagelimit`` when that is set, and queues one STEP_2 download
        per page. STEP_2 stores every ``.post_wrap`` post of a page that is
        not stored yet.

        Exceptions are logged through ``Logger.printexception`` and do not
        propagate to the caller.
        """
        try:
            # Step markers are compared by value so the check does not depend
            # on object identity.
            if params.step == self.STEP_1:
                soup = BeautifulSoup(params.content, 'html5lib')
                body = soup.find(attrs={'class': 'post_message post_first'})
                if body:
                    NewsStorage.setbody(params.originalurl,
                                        body.get_text().strip())
                else:
                    Logger.getlogging().debug(
                        '{url}:30000!'.format(url=params.originalurl))
                # Last path segment of the URL without its extension.
                keyvalue = params.url.split("/")[-1].split(".")[0]

                pager_links = soup.select('.pager > a')
                if len(pager_links) <= 2:
                    page = 1
                else:
                    # First run of digits in the second-to-last pager link.
                    page = int(
                        re.findall(r'\d+', pager_links[-2].get_text())[0])

                if self.pagelimit:
                    if int(page) > self.pagelimit:
                        Logger.getlogging().warning(
                            'the pageMaxNumber is shutdown to {0}'.format(
                                self.pagelimit))
                        page = self.pagelimit

                for pg in range(1, int(page + 1)):
                    comments_url = self.COMMENTS_URL % (keyvalue + '-' +
                                                        str(pg))
                    self.storeurl(comments_url, params.originalurl,
                                  self.STEP_2, {
                                      'page': pg,
                                      'pagetotal': page
                                  })

            elif params.step == self.STEP_2:
                soup = BeautifulSoup(params.content, 'html5lib')
                posts = soup.select('.post_wrap')
                if not posts:
                    Logger.getlogging().debug(
                        '{url}:30000!'.format(url=params.originalurl))
                    return
                for post in posts:
                    # The post text is stored with all whitespace removed.
                    post_msg = post.select_one('.post_message').get_text()
                    content = ''.join(post_msg.split()).strip()
                    # Example markup:
                    # class="user-42845238 post_time needonline "
                    post_time = post.find(
                        attrs={
                            'class': re.compile('user-.+post_time needonline')
                        }).get_text()
                    curtime = TimeUtility.getuniformtime(post_time)
                    try:
                        # Example markup: class="user-40693231 needonline"
                        nick = post.find(
                            attrs={
                                'class': re.compile('user-.+ needonline')
                            }).get_text()
                    except Exception:
                        # No matching element: use the placeholder name.
                        nick = 'nickname'
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)

        except Exception:
            Logger.printexception()
Beispiel #15
0
    def process(self, proparam):
        """Run one step of the Rayli gallery comment crawl.

        * STEP_1: take the article id from ``proparam.url`` and queue the
          first comment page as STEP_2.
        * STEP_2: read the reply count from the page, record it, and queue
          one STEP_3 download per page that may hold comments not stored
          yet (capped at ``self.maxpages``).
        * STEP_3: store the comments of one page and set the publish date
          to the earliest of the current time and the comment times seen.

        Any other step is logged as an error. Exceptions are printed with
        ``traceback.print_exc`` and swallowed.
        """
        Logger.getlogging().info(proparam.url)
        try:
            # Step markers are compared by value so the check does not depend
            # on object identity.
            if proparam.step == rayliComments.STEP_1:
                articleId = re.findall(
                    r'^http://bbs\.rayli\.com\.cn/gallery-(\d+)-\d+.html',
                    proparam.url)[0]

                comments_url = rayliComments.COMMENTS_URL % (articleId, 1)
                self.storeurl(comments_url, proparam.originalurl,
                              rayliComments.STEP_2, {
                                  'articleId': articleId,
                              })

            elif proparam.step == rayliComments.STEP_2:
                articleId = proparam.customized['articleId']
                # Reply count as shown on the page; kept as an int so the
                # stored comment number is not a float.
                comments_count = int(
                    re.findall(u'回复:</span> (\\d+)</div>',
                               proparam.content)[0])
                if comments_count == 0:
                    return

                # Incremental crawl: stop when nothing new has been posted.
                cmtnum = CMTStorage.getcount(proparam.originalurl, True)
                if cmtnum >= comments_count:
                    return
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)
                page_num = int(
                    math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
                if page_num >= self.maxpages:
                    page_num = self.maxpages

                for page in range(1, page_num + 1):
                    url = rayliComments.COMMENTS_URL % (articleId, page)
                    self.storeurl(url, proparam.originalurl,
                                  rayliComments.STEP_3)

            elif proparam.step == rayliComments.STEP_3:
                soup = BeautifulSoup(proparam.content, 'html.parser')
                comments = soup.select('.t_f')
                commentTime = self.r.parse(
                    u'<em id="authorposton\\d+">发表于 (.+?)</em>',
                    proparam.content)
                nicks = soup.select('.xw1')

                page = int(
                    self.r.parse(u'page=1-page-(\\d+)', proparam.url)[0])
                # On the first page the first entry is skipped, presumably
                # because it is the opening post rather than a reply
                # (TODO confirm).
                start = 1 if page == 1 else 0

                publishlist = [
                    TimeUtility.getcurrentdate(TimeUtility.DEFAULTFORMAT)
                ]
                # NOTE(review): times are stored as captured by the regex,
                # not passed through TimeUtility.getuniformtime; confirm
                # that this is intended.
                for index in range(start, len(comments)):
                    content = comments[index].text.strip()
                    curtime = commentTime[index]
                    nick = nicks[index].text
                    publishlist.append(curtime)
                    if not CMTStorage.exist(proparam.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(proparam.originalurl, content,
                                            curtime, nick)

                # publishlist always holds at least the current time.
                NewsStorage.setpublishdate(proparam.originalurl,
                                           min(publishlist))
            else:
                # %s, not %d: the step marker is not guaranteed to be an int.
                Logger.getlogging().error("proparam.step == %s", proparam.step)

        except Exception:
            traceback.print_exc()