def step3news(self, params):
    """Step 3: extract all comments from the url set in Step 2."""
    Logger.getlogging().info("ZolbbsComments.STEP_3")
    xparser = XPathUtility(params.content)
    commentsinfo = xparser.getcomments('//*[@class="comment-list-new"]//*[@class="commli"]/p')
    commentstime = xparser.getcomments('//*[@class="comment-list-new"]//*[@class="published-time"]')
    commentsnick = xparser.getcomments('//*[@class="comment-list-new"]//*[@class="user-name"]')
    # Walk the extracted nodes and store every comment not seen before.
    for index in range(0, len(commentstime)):
        # Normalize the timestamp; fall back to the loose parser on failure.
        tm = commentstime[index].strip()
        try:
            curtime = TimeUtility.getuniformtime(getuniformtime(tm), u'%Y-%m-%d %H:%M')
        except Exception:
            curtime = getuniformtime(tm)
        # Extract the comment body and nickname.
        content = commentsinfo[index]
        nick = commentsnick[index]
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
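# A minimal, self-contained sketch of the "parse-or-fall-back" time handling in
# step3news() above. TimeUtility/getuniformtime are this project's helpers; the
# sketch below assumes only the stdlib, and normalize_time() is a hypothetical name.
import datetime

def normalize_time(raw, fmt='%Y-%m-%d %H:%M'):
    # Try the strict format first; fall back to the raw string on failure,
    # mirroring the try/except pattern in step3news().
    try:
        return datetime.datetime.strptime(raw.strip(), fmt).strftime('%Y-%m-%d %H:%M:%S')
    except ValueError:
        return raw.strip()

# normalize_time('2016-05-01 12:30') -> '2016-05-01 12:30:00'
# normalize_time('3 hours ago')      -> '3 hours ago' (left to a later pass)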
def step3(self, params):
    """Step 3: extract all comments from the url set in Step 2."""
    Logger.getlogging().info("Flash8Comments.STEP_3")
    page = params.customized['page']
    xparser = XPathUtility(params.content)
    commentsinfo = xparser.getcomments('//td[@class="t_f"]')
    commentstime = xparser.getcomments('//div[@class="authi"]/em')
    comments = []
    # On the first page the first floor is the post body, so skip it.
    if page == 1:
        startIndex = 1
    else:
        startIndex = 0
    for index in range(startIndex, len(commentstime)):
        cmti = CommentInfo()
        if URLStorage.storeupdatetime(params.originalurl, commentstime[index]):
            # Keep only comments newer than the last stored update time.
            cmti.content = commentsinfo[index]
            comments.append(cmti)
    # Store the newly collected comments.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
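# Sketch of the first-floor skip used above: on page 1 a forum thread's first
# "comment" node is the post body itself, so extraction starts at index 1.
# comment_slice() is a hypothetical name, not project API.
def comment_slice(items, page):
    # Skip the opening post on the first page only.
    start = 1 if page == 1 else 0
    return items[start:]

# comment_slice(['post body', 'reply 1', 'reply 2'], page=1) -> ['reply 1', 'reply 2']
# comment_slice(['reply 3', 'reply 4'], page=2)              -> ['reply 3', 'reply 4']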
def step1(self, params):
    """Step 1: read the total page count and build the query-page url list."""
    info = params.customized['query']
    xparser = XPathUtility(params.content)
    if not xparser.xpath('//*[@class="mytopic topiclisttr"]'):
        Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
        return
    # The pager's second-to-last anchor holds the total page count.
    pageList = xparser.getcomments('//span[@class="right"]/a')
    if len(pageList) == 1:
        pageTotal = 1
    else:
        pageTotal = pageList[-2]
    if int(pageTotal) >= self.maxpages:
        pageTotal = self.maxpages
    # Build the query url list from the total page count.
    querylist = []
    for page in range(1, int(pageTotal) + 1):
        if page == 1:
            self.step2(params)
            continue
        url = hupuS2Query.HUPU_QUERY_TEMPLATE.format(q=info, pn=page)
        querylist.append(url)
    self.__storeqeuryurllist__(querylist, hupuS2Query.HUPU_S2QUERY_EACH_PAGE, {'query': info})
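# Sketch of the pager parsing above: the second-to-last <a> in the pager holds
# the total page count (the last anchor is the "next page" link), capped at
# maxpages. total_pages() is a hypothetical name.
def total_pages(page_anchors, maxpages):
    total = 1 if len(page_anchors) <= 1 else int(page_anchors[-2])
    return min(total, maxpages)

# total_pages(['1', '2', '17', u'下一页'], maxpages=50) -> 17
# total_pages(['1'], maxpages=50)                       -> 1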
def bbs_step3(self, params):
    try:
        xparser = XPathUtility(params.content)
        page = params.customized['page']
        pagecount = params.customized['pagecount']
        updatetimes = []
        nicks = []
        contents = xparser.getcomments('//*[@class="read"]')
        mid_times = xparser.getlist('//td[@class="authorname"]')
        # Each author cell reads "<nick>于<time>留言" ("<nick> commented at <time>");
        # split it into nick and timestamp.
        for times in mid_times:
            updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0])
            nicks.append(self.r.parse(ur'(.*)于', times)[0])
        # On the first page the opening post occupies index 0, so skip it.
        if page == 0:
            mid_index = 1
        else:
            mid_index = 0
        comments_number = xparser.getnumber('//*[@id="msgsubject"]/font')
        if comments_number != 0:
            for index in range(mid_index, len(contents)):
                curtime = TimeUtility.getuniformtime(updatetimes[index])
                content = contents[index]
                # Drop the trailing "于..." part and any decorative "☆" prefix.
                nick = nicks[index].split(u'于')[0].split(u'☆')[-1]
                if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                    CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    except Exception:
        traceback.print_exc()
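# Sketch of the author-cell parsing above, as one stdlib-only helper.
# split_author_cell() is a hypothetical name.
import re

def split_author_cell(cell):
    m = re.search(ur'(.*)于(\d+-\d+-\d+ \d+:\d+:\d+)留言', cell)
    if not m:
        return None, None
    # Strip a decorative "☆" prefix from the nick, as bbs_step3() does.
    nick = m.group(1).split(u'☆')[-1].strip()
    return nick, m.group(2)

# split_author_cell(u'☆某用户于2016-05-01 12:30:00留言')
#   -> (u'某用户', u'2016-05-01 12:30:00')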
def process(self, params):
    Logger.getlogging().info(params.url)
    try:
        if params.step == Dm123BbsComments.STEP_1:
            xparser = XPathUtility(params.content)
            # Use the incoming url to decide whether more pages follow.
            keyvalue = self.r.parse('tid-(.*?).html', params.url)[0]
            pagecount = xparser.getnumber('//*[@class="pages"]/div[@class="fl"]')
            commentinfo_url = params.url
            self.storeurl(commentinfo_url, params.originalurl, Dm123BbsComments.STEP_2,
                          {'keyvalue': keyvalue, 'totalpage': pagecount, 'curpage': 1})
        elif params.step == Dm123BbsComments.STEP_2:
            keyvalue = params.customized['keyvalue']
            curpage = params.customized['curpage']
            xparser = XPathUtility(params.content)
            commentsinfo = xparser.getcomments('//div[contains(@class,"tpc_content")]')
            commentstime = self.r.parse(ur'\"(\d+-\d+-\d+ \d+:\d+)\">发表于:', params.content)
            comments = []
            for index in range(0, len(commentstime)):
                cmti = CommentInfo()
                # Compare against each comment's own time, not always the first one.
                if URLStorage.storeupdatetime(
                        params.originalurl,
                        TimeUtility.getuniformtime(commentstime[index] + ':00')):
                    cmti.content = commentsinfo[index]
                    comments.append(cmti)
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
            # Queue the next page while still within the total page count.
            if curpage + 1 <= int(params.customized['totalpage']):
                nextpage = '{key}-page-{page}'.format(key=keyvalue, page=curpage + 1)
                comment_url = Dm123BbsComments.COMMENT_URL.format(page=nextpage)
                self.storeurl(comment_url, params.originalurl, Dm123BbsComments.STEP_2,
                              {'keyvalue': keyvalue,
                               'totalpage': params.customized['totalpage'],
                               'curpage': curpage + 1})
    except Exception:
        traceback.print_exc()
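# Sketch of the self-scheduling pagination above: each STEP_2 callback stores
# the url for curpage + 1 until totalpage is reached, so the crawl advances one
# page per fetch. Hypothetical names; `queue` stands in for self.storeurl().
def schedule_next_page(queue, url_template, keyvalue, curpage, totalpage):
    # Stop once the page just processed was the last one.
    if curpage + 1 > int(totalpage):
        return False
    nextpage = '{key}-page-{page}'.format(key=keyvalue, page=curpage + 1)
    queue.append((url_template.format(page=nextpage), curpage + 1))
    return True

# q = []
# schedule_next_page(q, 'http://example.com/read-{page}.html', 'tid-123', 1, 3)
# q -> [('http://example.com/read-tid-123-page-2.html', 2)]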
def geturlcomments(self, params):
    xparser = XPathUtility(params.content)
    page = params.customized['page']
    # Page 1 carries the topic itself in the first table, so skip it in the xpath.
    if page == 1:
        commentstimes = xparser.getcomments('//table[position()>1]/tbody/tr/td/span[1]')
        commentscontents = xparser.getcomments('//table[position()>1]/tbody/tr[2]/td[@class="post-main"]')
    else:
        commentstimes = xparser.getcomments('//table/tbody/tr/td/span[1]')
        commentscontents = xparser.getcomments('//table/tbody/tr[2]/td[@class="post-main"]')
    commentsnicks = xparser.getcomments('//*[@class="name"]/a')
    # Store every extracted comment that is not already in storage.
    for index in range(0, len(commentscontents)):
        # Skip the 4-character label prefix before the timestamp.
        curtime = TimeUtility.getuniformtime(commentstimes[index][4:])
        content = commentscontents[index].strip()
        nick = commentsnicks[index].strip()
        if not CMTStorage.exist(params.originalurl, content, curtime, nick):
            CMTStorage.storecmt(params.originalurl, content, curtime, nick)
def step3(self, params):
    """Step 3: extract all comments from the url set in Step 2."""
    Logger.getlogging().info("Dm123NewsComments.STEP_3")
    is_only_one_page = params.customized['is_only_one_page']
    if is_only_one_page:
        # Single-page case: Step 2 already extracted the nodes and passed them down.
        commentsinfos = params.customized['commentsinfos']
        commentstimes = params.customized['commentstimes']
    else:
        xparser = XPathUtility(params.content)
        commentsinfos = xparser.getcomments('//div[@class="rbvalueout"]')
        commentstimes = xparser.getcomments('//span[@class="rbtime"]')
    comments = []
    for index in range(0, len(commentstimes)):
        commentstime = commentstimes[index].strip()
        if URLStorage.storeupdatetime(params.originalurl, commentstime):
            cmti = CommentInfo()
            cmti.content = commentsinfos[index].strip()
            comments.append(cmti)
    # Store the newly collected comments.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
def step2(self, params):
    Logger.getlogging().info("Dm123NewsComments.STEP_2")
    classid = params.customized['classid']
    id = params.customized['id']
    xparser = XPathUtility(params.content)
    # Total comment count; the node yields 0 when the comments fit on one page.
    comments_count = xparser.getnumber('//div/a[1]/b')
    if 0 == comments_count:
        # A count of 0 means either no comments at all or a single page of them,
        # so re-count from the comment nodes themselves.
        commentsinfos = xparser.getcomments('//div[@class="rbvalueout"]')
        commentstimes = xparser.getcomments('//span[@class="rbtime"]')
        comments_count = len(commentsinfos)
        if 0 == comments_count:
            return
        # Incremental check: skip if nothing new since the last crawl.
        cmtnum = URLStorage.getcmtnum(params.originalurl)
        if cmtnum >= comments_count:
            return
        URLStorage.setcmtnum(params.originalurl, comments_count)
        self.storeurl(params.originalurl, params.originalurl, Dm123NewsComments.STEP_3,
                      {'is_only_one_page': True,
                       'commentsinfos': commentsinfos,
                       'commentstimes': commentstimes})
    else:
        # Incremental check: skip if nothing new since the last crawl.
        cmtnum = URLStorage.getcmtnum(params.originalurl)
        if cmtnum >= comments_count:
            return
        URLStorage.setcmtnum(params.originalurl, comments_count)
        # Number of comment pages.
        page_count = int(math.ceil(float(comments_count) / self.page_size))
        for page in range(0, int(page_count)):
            comment_url = Dm123NewsComments.COMMENT_URL.format(page=page, classid=classid, id=id)
            self.storeurl(comment_url, params.originalurl, Dm123NewsComments.STEP_3,
                          {'is_only_one_page': False})
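# Sketch of the page-count arithmetic above: the number of comment pages is
# ceil(total_comments / page_size). pages_for() is a hypothetical name.
import math

def pages_for(comments_count, page_size):
    return int(math.ceil(float(comments_count) / page_size))

# pages_for(0, 10)  -> 0 (nothing to fetch)
# pages_for(10, 10) -> 1
# pages_for(11, 10) -> 2 (the spillover comment needs its own page)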
def step1(self, params):
    """Step 1: parse the thread id from the doc url and build the comment-page urls."""
    docurl = self.r.parse('^http[s]{0,1}://bbs\.hupu\.com\/(\d+)', params.originalurl)
    if docurl:
        docurl = docurl[0]
    else:
        Logger.getlogging().debug('{url}:20000'.format(url=params.originalurl))
        return
    xparser = XPathUtility(params.content)
    # Page count: the second-to-last pager anchor holds the last page number.
    pageList = xparser.getcomments('//div[@class="page"]/a')
    if not pageList:
        pagenum = 1
    else:
        pagenum = pageList[-2]
    if int(pagenum) >= self.maxpages:
        pagenum = self.maxpages
    # Total comment count.
    curcmtnum = xparser.getnumber('//span[@class="browse"]')
    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
    if dbcmtnum >= curcmtnum:
        return
    # Resume from the page after the last stored comment, capped at maxpages.
    start = int(dbcmtnum / self.page_size) + 1
    end = int(pagenum)
    if end > start + self.maxpages:
        start = end - self.maxpages
    params.customized['page'] = 1
    if end == 1:
        self.step2(params)
        return
    if start == 1:
        self.step2(params)
    comment_url = self.COMMENT_URL.format(docurl=docurl, page=end)
    self.storeurl(comment_url, params.originalurl, hupuComments.STEP_1_2,
                  {'docurl': docurl, 'page': end, 'start': start, 'end': end})
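# Sketch of the incremental window computed in step1() above: resume from the
# page after the last stored comment, and clamp the window so a long backlog
# does not trigger a full recrawl. page_window() is a hypothetical name.
def page_window(stored_count, total_pages, page_size, maxpages):
    start = int(stored_count // page_size) + 1
    end = int(total_pages)
    # Clamp so the window ends at the newest page.
    if end > start + maxpages:
        start = end - maxpages
    return start, end

# page_window(stored_count=95, total_pages=50, page_size=10, maxpages=30)
#   -> (20, 50): the resume point 10 is pulled up to end - maxpages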
def step3bbs(self, params):
    """Step 3: extract all comments from the url set in Step 2."""
    Logger.getlogging().info("Ea3wcomments.STEP_3")
    xparser = XPathUtility(params.content)
    commentsinfo = xparser.getcomments('//p[@class="comment-content"]')
    commentstime = xparser.getcomments('//span[@class="time"]')
    comments = []
    for index in range(0, len(commentsinfo)):
        cmti = CommentInfo()
        cmti.content = commentsinfo[index]
        # "刚刚" ("just now") has no parseable date; substitute the current time.
        if str(commentstime[index]).strip().decode("utf8") == u'刚刚':
            tm = getuniformtime(str(datetime.datetime.now()))
        else:
            tm = getuniformtime(str(commentstime[index]))
        if URLStorage.storeupdatetime(params.originalurl, tm):
            comments.append(cmti)
    # Store the newly collected comments.
    if len(comments) > 0:
        self.commentstorage.store(params.originalurl, comments)
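# Sketch of the relative-timestamp handling above: the site renders very recent
# comments as "刚刚" ("just now"), which has no parseable date, so it is mapped
# to the current time before normalization. resolve_time() is a hypothetical name.
import datetime

def resolve_time(label):
    if label.strip() == u'刚刚':
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return label.strip()

# resolve_time(u'刚刚')             -> e.g. '2016-05-01 12:30:45'
# resolve_time(u'2016-04-30 08:00') -> u'2016-04-30 08:00'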
def process(self, proparam):
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step == bookComments.STEP_1:
            # Parse the article id out of the url.
            articleId = self.r.parse(
                r'^http://www\.2200book\.com/files/article/\w+/\d+/(\d+)\.htm$',
                proparam.originalurl)[0]
            # Fetch the first comment page.
            url = bookComments.COMMENTS_URL % (articleId, 1)
            self.storeurl(url, proparam.originalurl, bookComments.STEP_2,
                          {'articleId': articleId})
        elif proparam.step == bookComments.STEP_2:
            articleId = proparam.customized['articleId']
            # Read the comment page count from the pager.
            xparser = XPathUtility(proparam.content)
            page_count = int(self.r.parse(
                ur'>>(\d+)', xparser.getcomments("//*[@id='pagelink']")[0])[0])
            if page_count == 0:
                return
            # Queue every comment page.
            for page in range(1, page_count + 1):
                url = bookComments.COMMENTS_URL % (articleId, page)
                self.storeurl(url, proparam.originalurl, bookComments.STEP_3)
        elif proparam.step == bookComments.STEP_3:
            # Each page lists reply ids; queue one url per reply thread.
            rids = re.findall(r'rid=(\d+)">', proparam.content)
            for rid in rids:
                url = bookComments.COMMENTS_URL_RID % rid
                self.storeurl(url, proparam.originalurl, bookComments.STEP_4)
        elif proparam.step == bookComments.STEP_4:
            commentsInfo = []
            # Forum comments and their publication times.
            xparser = XPathUtility(proparam.content)
            comments = xparser.getcomments('//*[@id="sp_2"]/p[2]|//*[@id="b_v_5"]')
            commentTimes = self.r.parse(ur'发表于(:| )?(.+)(</p>|</div>)', proparam.content)
            for index in range(0, len(comments)):
                if URLStorage.storeupdatetime(proparam.originalurl, commentTimes[index][1]):
                    cmti = CommentInfo()
                    cmti.content = comments[index]
                    commentsInfo.append(cmti)
            # Store the newly collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(proparam.originalurl, commentsInfo)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception:
        traceback.print_exc()
def process(self, params):
    Logger.getlogging().info(params.url)
    try:
        if params.step == SeventeenKComments.STEP_1:
            # Step 1: parse the thread id from the doc url and build the first
            # comment-page url.
            docurl = self.r.parse(
                '^http://bbs\.17k\.com\/thread-(\d+)-\d+-1\.html',
                params.originalurl)[0]
            commentinfo_url = 'http://bbs.17k.com/thread-{docurl}-1-1.html'.format(docurl=docurl)
            self.storeurl(commentinfo_url, params.originalurl,
                          SeventeenKComments.STEP_2, {'docurl': docurl})
        elif params.step == SeventeenKComments.STEP_2:
            # Step 2: read the reply total, then queue one url per comment page.
            docurl = params.customized['docurl']
            xparser = XPathUtility(params.content)
            commentsinfo = xparser.getnumber('//*[@class="hm ptn"]/span[5]')
            # Incremental check: skip if nothing new since the last crawl.
            cmtnum = URLStorage.getcmtnum(params.originalurl)
            if cmtnum >= int(commentsinfo):
                return
            URLStorage.setcmtnum(params.originalurl, int(commentsinfo))
            pagecount = xparser.getnumber('//*[@class="pg"]/label/span')
            for page in range(1, pagecount + 1):
                comment_url = SeventeenKComments.COMMENT_URL.format(docurl=docurl, page=page)
                self.storeurl(comment_url, params.originalurl,
                              SeventeenKComments.STEP_3, {'page': page})
        elif params.step == SeventeenKComments.STEP_3:
            # Step 3: extract all comments from the urls set in Step 2.
            page = params.customized['page']
            xparser = XPathUtility(params.content)
            commentsinfo = xparser.getcomments('//*[contains(@id,"postmessage")]')
            commentstime = self.r.parse(ur'发表于 (\d+-\d+-\d+ \d+:\d+)</em>', params.content)
            comments = []
            # On the first page the first floor is the post body, so skip it.
            if page == 1:
                startIndex = 1
            else:
                startIndex = 0
            for index in range(startIndex, len(commentstime)):
                cmti = CommentInfo()
                if URLStorage.storeupdatetime(params.originalurl, commentstime[index] + ':00'):
                    # Keep only comments newer than the last stored update time.
                    cmti.content = commentsinfo[index]
                    comments.append(cmti)
            # Store the newly collected comments.
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
        else:
            Logger.getlogging().error('proparam.step == {step}'.format(step=params.step))
    except Exception:
        traceback.print_exc()
def process(self, proparam):
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step == ishangmanComments.STEP_1:
            # Parse the site section and article id out of the url.
            articleIds = re.findall(r'^http://(\w+)\.ishangman\.com/\w+/(\d+)', proparam.url)[0]
            articleId1 = articleIds[0]
            articleId2 = articleIds[1]
            # Comment type, embedded in the page script.
            commenttype = int(self.r.parse(ur'commenttype = (.*);', proparam.content)[0])
            # First comment page.
            url = ishangmanComments.COMMENTS_URL % (articleId1, articleId2, commenttype, 1)
            self.storeurl(url, proparam.originalurl, ishangmanComments.STEP_2,
                          {'articleId1': articleId1,
                           'articleId2': articleId2,
                           'commenttype': commenttype})
        elif proparam.step == ishangmanComments.STEP_2:
            articleId1 = proparam.customized['articleId1']
            articleId2 = proparam.customized['articleId2']
            commenttype = proparam.customized['commenttype']
            # Read the comment count; comics use a different node than other sections.
            xhtml = XPathUtility(html=proparam.content)
            if articleId1 == 'comic':
                comments_count = int(
                    xhtml.getlist('//*[contains(@class,"ismcartondiv1")]/p/strong')[0])
            else:
                comments_count = int(self.r.parse(
                    ur'(\d+).*', xhtml.getlist('//*[@class="comment_lctwidl"]/p')[0])[0])
            if comments_count:
                NewsStorage.setcmtnum(proparam.originalurl, comments_count)
            if int(comments_count) == 0:
                return
            # Number of comment pages still to fetch, capped at maxpages.
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            page_num = int(math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
            if page_num >= self.maxpages:
                page_num = self.maxpages
            for page in range(1, page_num + 1):
                url = ishangmanComments.COMMENTS_URL % (articleId1, articleId2, commenttype, page)
                self.storeurl(url, proparam.originalurl, ishangmanComments.STEP_3,
                              {'articleId1': articleId1})
        elif proparam.step == ishangmanComments.STEP_3:
            try:
                Logger.getlogging().debug(proparam.originalurl)
                commentsInfo = []
                articleId1 = proparam.customized['articleId1']
                xparser = XPathUtility(proparam.content)
                if articleId1 == 'comic':
                    # Comic comments sit in .ismcartondiv2 blocks.
                    soup = BeautifulSoup(proparam.content, 'html5lib')
                    comments = soup.select('.ismcartondiv2')
                else:
                    comments = xparser.getcomments('/html/body/div/span[2]/p[1]')
                    # Comment timestamps.
                    updateTime = xparser.getcomments('/html/body/div/span[2]/div[1]')
                for index in range(0, len(comments)):
                    cmti = CommentInfo()
                    if articleId1 == 'comic':
                        # Comic timestamps carry no year ("MM-DD HH:MM"); infer it
                        # from the current month, falling back to 2016.
                        publictime = self.r.parse(ur'(\d{2}-\d+ \d+:\d+)',
                                                  comments[index].get_text())[0]
                        cmt_month = publictime.split("-")[0]
                        curmonth = time.localtime().tm_mon
                        if int(cmt_month) < curmonth:
                            publictime = TimeUtility.getcurrentdate()[0:4] + '-' + publictime
                        else:
                            publictime = '2016' + '-' + publictime
                        tm = TimeUtility.getuniformtime(publictime)
                        content = comments[index].get_text().split('\n')[0]
                    else:
                        publictime = updateTime[index][:-8]
                        tm = getuniformtime(publictime)
                        content = comments[index]
                    if NewsStorage.storeupdatetime(proparam.originalurl, tm):
                        cmti.content = content
                        commentsInfo.append(cmti)
                # Store the newly collected comments.
                if len(commentsInfo) > 0:
                    self.commentstorage.store(proparam.originalurl, commentsInfo)
            except:
                Logger.printexception()
                Logger.getlogging().error(
                    'extract comment error from {site}'.format(site=proparam.url))
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception:
        traceback.print_exc()
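# Sketch of the year inference in STEP_3 above: comic comments carry only
# "MM-DD HH:MM", so the year is guessed by comparing the comment month with the
# current month; a month later than the current one is assumed to belong to the
# previous year. The original hard-codes 2016 as its fallback; this generalizes
# that idea. infer_year() is a hypothetical name.
import time

def infer_year(month_day_time):
    cmt_month = int(month_day_time.split('-')[0])
    now = time.localtime()
    year = now.tm_year if cmt_month <= now.tm_mon else now.tm_year - 1
    return '{0}-{1}'.format(year, month_day_time)

# With the current date being May 2016:
# infer_year('04-30 08:00') -> '2016-04-30 08:00'
# infer_year('12-25 10:00') -> '2015-12-25 10:00'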
def process(self, proparam):
    Logger.getlogging().info(proparam.url)
    try:
        if proparam.step == xinhuaBbsComments.STEP_1:
            # Parse the article id out of the url.
            articleId = re.findall(
                r'^http://forum\.home\.news\.cn/\w+/(\d+)/\d+\.html',
                proparam.originalurl)[0]
            # Fetch the first comment page.
            comments_url = xinhuaBbsComments.COMMENTS_URL % (articleId, 1)
            self.storeurl(comments_url, proparam.originalurl,
                          xinhuaBbsComments.STEP_2, {'articleId': articleId})
        elif proparam.step == xinhuaBbsComments.STEP_2:
            articleId = proparam.customized['articleId']
            xparser = XPathUtility(proparam.content)
            pages = xparser.getcomments('//*[@id="postreply"]/div[2]/ul[1]/li/a')
            comments = xparser.getcomments('//*[@id="postreply"]/dl/dd/div/p[2]')
            comments_count = len(comments)
            # Single-page thread: no pager, but comments exist.
            if len(pages) == 0 and comments_count != 0:
                url = xinhuaBbsComments.COMMENTS_URL % (articleId, 1)
                self.storeurl(url, proparam.originalurl, xinhuaBbsComments.STEP_3)
            # Incremental check: skip if nothing new since the last crawl.
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            if cmtnum >= comments_count:
                return
            # Cap the page count, then crawl the window from newest to oldest.
            page_num = len(pages)
            if page_num >= self.maxpages:
                page_num = self.maxpages
            start = int(cmtnum / self.PAGE_SIZE) + 1
            end = int(page_num)
            if end > start + self.maxpages:
                start = end - self.maxpages
            for page_num in range(end, start - 1, -1):
                url = xinhuaBbsComments.COMMENTS_URL % (articleId, page_num)
                self.storeurl(url, proparam.originalurl, xinhuaBbsComments.STEP_3)
        elif proparam.step == xinhuaBbsComments.STEP_3:
            # Extract the comments of one page.
            xparser = XPathUtility(proparam.content)
            comments = xparser.getcomments('//*[@id="postreply"]/dl/dd/div/p[2]')
            cmtnum = CMTStorage.getcount(proparam.originalurl, True)
            comments_count = len(comments)
            NewsStorage.setcmtnum(proparam.originalurl, comments_count + cmtnum)
            if len(comments) != 0:
                # Publication times, matched to the comments by position.
                publicTimes = re.findall(
                    ur'<li><span id="time_\d+">(\d+-\d+-\d+ \d+:\d+:\d+)发表</span></li>',
                    proparam.content)
                nicks = xparser.getcomments('//*[@id="postreply"]/dl/dd/ul[1]/li[1][a]')
                for publicIndex, content in enumerate(comments):
                    curtime = TimeUtility.getuniformtime(publicTimes[publicIndex])
                    nick = nicks[publicIndex]
                    if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
        else:
            Logger.getlogging().error("proparam.step == %d", proparam.step)
    except Exception:
        traceback.print_exc()
def process(self, params):
    Logger.getlogging().info(params.url)
    try:
        if params.step == Xie17NewsComments.STEP_1:
            # Step 1: parse the book id from the doc url.
            articleId = self.r.parse('^http://xiaoshuo\.17xie\.com/book/(\d+)/',
                                     params.originalurl)[0]
            # Fetch the first comment page.
            comments_url = Xie17NewsComments.COMMENT_URL % (articleId, 1)
            self.storeurl(comments_url, params.originalurl,
                          Xie17NewsComments.STEP_2, {'articleId': articleId})
        elif params.step == Xie17NewsComments.STEP_2:
            articleId = params.customized['articleId']
            # Total comment count ("共N人说过" reads "N people commented").
            comment_count = float(self.r.parse(ur'共(\d+)人说过', params.content)[0])
            if comment_count == 0:
                return
            # Incremental check: skip if nothing new since the last crawl.
            cmtnum = URLStorage.getcmtnum(params.originalurl)
            if cmtnum >= comment_count:
                return
            URLStorage.setcmtnum(params.originalurl, comment_count)
            # Queue one url per comment page.
            page_count = int(math.ceil(comment_count / Xie17NewsComments.PAGE_SIZE))
            for page in range(1, page_count + 1):
                url = Xie17NewsComments.COMMENT_URL % (articleId, page)
                self.storeurl(url, params.originalurl, Xie17NewsComments.STEP_3)
        elif params.step == Xie17NewsComments.STEP_3:
            # Step 3: extract all comments from the urls set in Step 2.
            Logger.getlogging().info("params.step == 3")
            xparser = XPathUtility(params.content)
            # Each comment body spans three consecutive <dd> nodes.
            comments = xparser.getcomments('/html/body/ul/li[2]/dl/dd')
            commenttimes = xparser.xpath('/html/body/ul/li[2]/dl/dt/text()')
            commentsInfo = []
            for index in range(0, len(commenttimes)):
                # Normalize the timestamp; "YYYY年MM月" needs an explicit format.
                if self.r.search(ur'\d+年\d+月', commenttimes[index].strip()):
                    tm = TimeUtility.getuniformtime(str(commenttimes[index]).strip(), '%Y年%m月')
                else:
                    tm = getuniformtime(commenttimes[index].strip())
                if URLStorage.storeupdatetime(params.originalurl, tm):
                    cmti = CommentInfo()
                    cmti.content = (comments[index * 3] + comments[index * 3 + 1]
                                    + comments[index * 3 + 2])
                    commentsInfo.append(cmti)
            # Store the newly collected comments.
            if len(commentsInfo) > 0:
                self.commentstorage.store(params.originalurl, commentsInfo)
        else:
            Logger.getlogging().error('proparam.step == {step}'.format(step=params.step))
    except Exception:
        traceback.print_exc()
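# Sketch of the triple grouping in STEP_3 above: the page splits each comment
# body across three consecutive <dd> nodes, so nodes 3*i .. 3*i+2 are joined
# back into comment i. group_comment_parts() is a hypothetical name.
def group_comment_parts(parts, group_size=3):
    return [''.join(parts[i:i + group_size])
            for i in range(0, len(parts), group_size)]

# group_comment_parts(['a1', 'a2', 'a3', 'b1', 'b2', 'b3'])
#   -> ['a1a2a3', 'b1b2b3']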