Example #1
    def getcomments_step3(self, params):
        xhtml = XPathUtility(html=params.content)

        contents = xhtml.getlist('//*[contains(@id,"partThreadContent")]')
        curtimes = xhtml.getlist('//*[@class="comment_rw"]/span/em')
        nicks = xhtml.getlist('//*[@class="wzbox"]/h5')
        for index in range(len(contents)):
            # page times lack seconds; append ':00' before normalizing
            curtime = TimeUtility.getuniformtime(curtimes[index] + ':00')
            content = str(contents[index])
            nick = str(nicks[index])
            if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    def getcomments_step3(self, params):
        xhtml = XPathUtility(html=params.content)
        contents = xhtml.getlist('//*[@class="wz"]/p')
        curtimes = xhtml.getlist('//*[@class="fr"]')
        nicks = xhtml.getlist('//*[@class="wzbox"]/h5')

        for index in range(len(contents)):
            # drop the 4-character label, then append the missing seconds
            curtime = curtimes[index][4:] + ':00'
            Logger.getlogging().debug(contents[index])
            content = str(contents[index])
            nick = str(nicks[index])
            if not CMTStorage.exist(params.originalurl, content, curtime,
                                    nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    def getComments(self, params):
        xhtml = XPathUtility(html=params.content)
        commentinfo = xhtml.getlist('//*[@class="recTxt"]')
        updatetimes = xhtml.getlist('//*[@class="comment-time"]')
        for index in range(len(commentinfo)):
            # strip the single enclosing characters around the raw time text
            curtime = TimeUtility.getuniformtime(updatetimes[index][1:-1])
            content = commentinfo[index]
            nick = 'nick'
            if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
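
All of these snippets share one flow: parse the nodes, normalize each field, then check before storing so a re-crawl does not duplicate comments. Below is a minimal, self-contained sketch of that flow; InMemoryStore is an assumed stand-in for CMTStorage, and its exist/storecmt only mirror the call shapes used above.

# Sketch of the parse -> normalize -> dedupe -> store flow.
# InMemoryStore stands in for CMTStorage (an assumption, not the real API).
class InMemoryStore(object):
    def __init__(self):
        self.seen = set()

    def exist(self, url, content, curtime, nick):
        return (url, content, curtime, nick) in self.seen

    def storecmt(self, url, content, curtime, nick):
        self.seen.add((url, content, curtime, nick))

store = InMemoryStore()
rows = [('nice post', '2016-12-01 10:00:00', 'alice'),
        ('nice post', '2016-12-01 10:00:00', 'alice')]  # duplicate row
for content, curtime, nick in rows:
    if not store.exist('http://example.com/a', content, curtime, nick):
        store.storecmt('http://example.com/a', content, curtime, nick)
assert len(store.seen) == 1  # the duplicate was skipped
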
Example #4
    def process(self, params):
        try:
            if params.step is None:
                # get the total comment count from the HTML content
                xhtml = XPathUtility(html=params.content)
                countsStr = str(
                    xhtml.getstring('//*[@id="chartForm"]/div[1]/a[3]'))
                startpos = countsStr.find('(')
                if startpos < 0:
                    Logger.getlogging().error(params.originalurl)
                    return
                comment_counts = int(countsStr[startpos +
                                               1:countsStr.find(')')])
                Logger.getlogging().debug(comment_counts)
                if comment_counts == 0:
                    return
                # compare the comment count from the last crawl of this url with the current one
                #
                # loop to build the comment-page urls and submit them to the download platform
                for page in range(
                        1,
                        int(
                            math.ceil(float(comment_counts) /
                                      Cine107Comments.PAGE_SIZE)) + 1):
                    commentUrl = Cine107Comments.COMMENTS_URL.format(
                        url=params.originalurl, pageno=page)
                    Logger.getlogging().debug(commentUrl)
                    self.storeurl(commentUrl, params.originalurl,
                                  Cine107Comments.STEP_2)
                URLStorage.setcmtnum(params.originalurl, comment_counts)

            # parse the comment data
            elif params.step == Cine107Comments.STEP_2:
                xhtml = XPathUtility(html=params.content)
                comments = []
                contents = xhtml.getlist(
                    '//*[@class="flow_commont_list clearfix"]/p')
                updatetimes = xhtml.getlist('//*/time')
                for index in range(len(contents)):
                    updatetime = TimeUtility.getuniformtime(updatetimes[index])
                    if URLStorage.storeupdatetime(params.originalurl,
                                                  updatetime):
                        cmti = CommentInfo()
                        Logger.getlogging().debug(contents[index])
                        cmti.content = str(contents[index])
                        comments.append(cmti)
                # store once, after the loop, rather than on every iteration
                if len(comments) > 0:
                    self.commentstorage.store(params.originalurl, comments)
        except:
            Logger.printexception()
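
One detail in the pagination above deserves a note: the division must go through float, because under Python 2 `comment_counts / PAGE_SIZE` on two ints truncates before math.ceil runs and silently drops the last partial page. A small sketch of the corrected ceil-pagination (the PAGE_SIZE value here is illustrative):

import math

PAGE_SIZE = 20  # illustrative value

def page_count(total):
    # cast to float so the division keeps its fraction before ceil
    # (int / int truncates under Python 2)
    return int(math.ceil(float(total) / PAGE_SIZE))

assert page_count(0) == 0
assert page_count(20) == 1
assert page_count(21) == 2  # the partial page is kept
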
Example #5
    def getpagecomments(self, params):
        info = params.customized['query']

        xpath = XPathUtility(html=params.content)
        hrefs = xpath.xpath('//*[@class="sosResult"]/strong/a/@href')
        titles = xpath.getlist('//*[@class="sosResult"]/strong/a')
        pubtimes = xpath.xpath('//*[@class="sosResult"]/span/cite[3]')

        today = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        urllist = []
        for index in range(len(titles)):
            # the title must contain the query keyword
            # if titles[index].find(info) > -1:
            if Common.checktitle(info, titles[index]):
                pubtimestr = TimeUtility.getuniformtime(
                    pubtimes[index].text).split(' ')[0]
                pubtime = datetime.datetime.strptime(
                    pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
                # pubtime = datetime.datetime.strptime(pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT)
                interval = today - pubtime
                # publish time within the configured window
                if interval.days <= int(self.querylastdays):
                    newurl = self.preprocess(hrefs[index])
                    if newurl is not None:
                        urllist.append(newurl)

        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
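
The freshness filter in getpagecomments reduces to comparing a timedelta against a day budget. The same check isolated with the standard datetime module (TimeUtility is not defined in this listing, so plain strptime stands in):

import datetime

def within_days(pubdate_str, last_days, today=None):
    # pubdate_str like '2016-12-01'; keep only results at most last_days old
    today = today or datetime.date.today()
    pubtime = datetime.datetime.strptime(pubdate_str, '%Y-%m-%d').date()
    return (today - pubtime).days <= int(last_days)

today = datetime.date(2016, 12, 8)
assert within_days('2016-12-01', 7, today)
assert not within_days('2016-11-01', 7, today)
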
    def bbs_step3(self, params):
        try:
            xparser = XPathUtility(params.content)
            page = params.customized['page']
            pagecount = params.customized['pagecount']
            comments = []
            updatetimes = []
            nicks = []
            contents = xparser.getcomments('//*[@class="read"]')
            mid_times = xparser.getlist('//td[@class="authorname"]')
            for times in mid_times:
                # each cell reads like "<nick>于<time>留言"; split out both parts
                updatetimes.append(self.r.parse(ur'于(\d+-\d+-\d+ \d+:\d+:\d+)留言', times)[0])
                nicks.append(self.r.parse(ur'(.*)于', times)[0])
            # on the first page, skip the first block (index 0)
            if page == 0:
                mid_index = 1
            else:
                mid_index = 0
            comments_number = xparser.getnumber('//*[@id="msgsubject"]/font')
            if comments_number != 0:
                for index in range(mid_index, len(contents)):
                    curtime = TimeUtility.getuniformtime(updatetimes[index])
                    content = contents[index]
                    # the nick may carry a '☆' prefix; strip it
                    nick = nicks[index].split('于')[0].split('☆')[-1]
                    if not CMTStorage.exist(params.originalurl, content, curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content, curtime, nick)
        except Exception as e:
            traceback.print_exc()
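
In bbs_step3 the nick and the timestamp both come out of one table cell shaped like "某人于2016-12-01 12:30:00留言". A standalone sketch of that extraction with the standard re module (self.r above is assumed to be a findall-like wrapper that this listing does not show):

# -*- coding: utf-8 -*-
import re

cell = u'某人于2016-12-01 12:30:00留言'
m = re.search(u'(.*)于(\\d+-\\d+-\\d+ \\d+:\\d+:\\d+)留言', cell)
assert m is not None
nick, updatetime = m.group(1), m.group(2)
assert nick == u'某人'
assert updatetime == u'2016-12-01 12:30:00'
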
    def step2(self, params):
        Logger.getlogging().info("LaohuComments.STEP_2")
        token = params.customized['token']
        sourceId = params.customized['sourceId']
        xhtml = XPathUtility(html=params.content)
        # header text looks like "网友评论(32)"; the count sits inside the parentheses
        countstr = xhtml.getlist('//*[@class="filter-by-type"]')[0]
        comment_counts = int(countstr[5:len(countstr) - 1])
        if comment_counts:
            NewsStorage.setcmtnum(params.originalurl, comment_counts)
        if comment_counts == 0:
            Logger.getlogging().warning('{url}:30000 No comments'.format(url=params.originalurl))
            return
        cmtnum = CMTStorage.getcount(params.originalurl, True)
        # incremental check: nothing to do if no new comments since the last crawl
        if cmtnum >= comment_counts:
            return
        page_num = int(math.ceil(float(comment_counts - cmtnum) / self.PAGE_SIZE))
        if page_num >= self.maxpages:
            page_num = self.maxpages
        # fetch the first page of comments
        self.getComments(params)
        if comment_counts > 15:
            # build the remaining comment-page urls and submit them to the download platform
            COMMENTS_URL = 'http://member.laohu.com/comment/ajax?page=%d&token=%s&order=new'
            for page in range(2, page_num + 1):
                commentUrl = COMMENTS_URL % (page, sourceId)
                self.storeurl(commentUrl, params.originalurl, LaohuComments.STEP_3, {'token': token, 'sourceId': sourceId})
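
The page budget computed above, and again in Example #14's step2news and the ishangman STEP_2, covers only the delta since the last crawl, capped at maxpages. That arithmetic isolated (the PAGE_SIZE and MAX_PAGES values are illustrative):

import math

PAGE_SIZE = 20   # illustrative
MAX_PAGES = 10   # illustrative cap

def pages_to_fetch(total, already_have):
    # only the comments added since the last crawl need fetching
    delta = max(total - already_have, 0)
    pages = int(math.ceil(float(delta) / PAGE_SIZE))
    return min(pages, MAX_PAGES)

assert pages_to_fetch(95, 40) == 3    # 55 new comments -> 3 pages
assert pages_to_fetch(1000, 0) == 10  # capped at MAX_PAGES
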
    def geturlcomments(self, params):
        # extract the comment nodes
        xparser = XPathUtility(params.content)
        comments_xpath = xparser.xpath('//*[@id="short_comment_content"]')
        if not comments_xpath:
            return

        # get the publish times
        ip_pubtimes_xpath = xparser.getlist('//*[@id="short_comment_left"]')

        if len(comments_xpath) == len(ip_pubtimes_xpath):
            comments = []
            # collect the comments
            for index in range(len(comments_xpath)):
                cmti = CommentInfo()
                publicTime = ip_pubtimes_xpath[index]
                # two-digit year like '16-12-01 12:30': prefix the century
                if self.r.search(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime):
                    publicTime = '20' + self.r.parse(ur'\d{2}-\d+-\d+ \d+:\d+',
                                                     publicTime)[0]

                # slash-separated full stamp like '2016/12/01 12:30:45'
                if self.r.search(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime):
                    publicTime = self.r.parse(ur'\d+/\d+/\d+ \d+:\d+:\d+',
                                              publicTime)[0]

                if URLStorage.storeupdatetime(params.originalurl,
                                              getuniformtime(publicTime)):
                    # keep only comments newer than the last crawl (compared by time)
                    cmti.content = comments_xpath[index].text
                    comments.append(cmti)

            # store the collected comments; the identical function in Example #14
            # ends this way, and this closing step appears truncated in the listing
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
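
The two regex branches above normalize two timestamp shapes: a two-digit year like '16-12-01 12:30' (which gets a '20' century prefix) and a slash-separated full stamp like '2016/12/01 12:30:45'. A compact sketch of the same normalization using re directly:

import re

def normalize_pubtime(text):
    # '16-12-01 12:30' -> '2016-12-01 12:30' (two-digit year)
    m = re.search(r'\d{2}-\d+-\d+ \d+:\d+', text)
    if m and not re.search(r'\d{4}-', text):
        return '20' + m.group(0)
    # '2016/12/01 12:30:45' is kept as matched
    m = re.search(r'\d+/\d+/\d+ \d+:\d+:\d+', text)
    if m:
        return m.group(0)
    return text

assert normalize_pubtime('posted 16-12-01 12:30') == '2016-12-01 12:30'
assert normalize_pubtime('2016/12/01 12:30:45 by bob') == '2016/12/01 12:30:45'
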
Example #9
    def process(self, params):
        # from the result count on the search landing page, build the search-page URLs
        if params.step == LaohuS2Query.LAOHU_S2QUERY_FIRST_PAGE:
            # read the first-page url parameters
            KEY = params.customized['KEY']
            time = params.customized['time']
            # get the total page count
            xparser = XPathUtility(params.content)
            pageCounts = xparser.getlist('//*[@id="main"]/div[2]/span')
            if len(pageCounts) > 0:
                page = str(pageCounts[0]).split('/')[1]

                # process the first page of search results
                self.pageprocess(params)

                if int(page) > 1:
                    if int(page) >= self.maxpages:
                        page = self.maxpages
                    querylist = []
                    # build the query list from the total page count
                    # (page 1 is already fetched; start from page 2)
                    for pages in range(2, int(page) + 1):
                        url = LaohuS2Query.LAOHU_QUERY_TEMPLATE.format(
                            KEY=KEY, pn=pages, time=time)
                        querylist.append(url)
                    self.__storeqeuryurllist__(
                        querylist, LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE,
                        {'KEY': KEY})

            else:
                Logger.getlogging().debug('Sorry, no posts related to ' +
                                          KEY + ' were found')

        # extract video URLs from the query result pages
        elif params.step == LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE:
            self.pageprocess(params)
    def getsearchresult(self, params):
        info = params.customized['query']

        xpath = XPathUtility(html=params.content)
        hrefs = xpath.xpath('//li/h3/a/@href')
        titles = xpath.getlist('//li/h3/a')
        pubtimes = xpath.xpath('//li/p')

        today = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        urllist = []
        for index in range(len(titles)):
            # the title must contain the query keyword
            # if titles[index].find(info) > -1:
            if Common.checktitle(info, titles[index]):
                pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
                pubtime = datetime.datetime.strptime(
                    pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
                interval = today - pubtime
                # publish time within the configured window
                if interval.days <= self.querylastdays:
                    urllist.append(hrefs[index])
                else:
                    # results are sorted by time, so once one entry falls
                    # outside the window, all later ones do too
                    break

        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
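
The early break in getsearchresult leans on the results arriving sorted newest-first: once one entry falls outside the window, every later one does too, so the loop can stop instead of scanning the whole page. The shape of that short-circuit, isolated:

def take_fresh(ages_in_days, budget):
    # ages are ascending (newest first); stop at the first stale entry
    fresh = []
    for age in ages_in_days:
        if age > budget:
            break
        fresh.append(age)
    return fresh

assert take_fresh([0, 2, 5, 9, 12], 7) == [0, 2, 5]
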
Example #11
    def pageprocess(self, params):
        # parse the page text
        xparser = XPathUtility(params.content)
        # collect the hyperlinks on this page
        hreflist = xparser.xpath('//h3/a/@href')
        hrefs = []
        for mid_url in hreflist:
            mid = self.preprocess(mid_url)
            if mid is not None:
                hrefs.append(mid)

        # collect every publish time on this page
        publictime = xparser.xpath('//*[@class="scontent"]/text()[1]')
        publicTimes = []
        for timeindex in publictime:
            middle = str(timeindex).replace('\n', '').replace('\t', '').strip()
            # keep only the leading 'date time' pair of the cleaned string
            parts = middle.split(' ')
            publicTimes.append(parts[0] + ' ' + parts[1])
        # collect every title on this page
        titles = []
        titles_list = xparser.getlist('//h3')
        for title in titles_list:
            mid_title = str(title).replace('\n', '').replace('\t', '').strip()
            titles.append(mid_title)
        # get the query keyword
        KEY_mid = params.customized['KEY']
        KEY = Common.urldec(KEY_mid)
        # pattern used to match titles
        titlePatten = KEY
        # compute the cutoff timestamp, inputtime days before now
        today = datetime.datetime.now()
        before_days = today + datetime.timedelta(-self.inputtime)
        before_arr = str(before_days).split('.')
        before_time = before_arr[0]

        urllist = []
        len_hrefs = len(hrefs)
        number = 0
        for index in publicTimes[:len_hrefs]:
            # does the title match the keyword?
            # mid_value = re.compile(titlePatten)
            # flg = mid_value.search(str(titles[number]))
            flg = Common.checktitle(titlePatten, str(titles[number]))
            # keep entries published within the window whose title matched;
            # the string comparison is safe because both sides are
            # zero-padded 'YYYY-MM-DD HH:MM:SS' strings
            if index > before_time and flg:
                url = hrefs[number]
                urllist.append(url)
            number = number + 1

        # store the final url list
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
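
The final filter in pageprocess compares timestamps as plain strings (index > before_time). That is sound only because both sides are zero-padded 'YYYY-MM-DD HH:MM:SS' strings, which sort lexicographically in the same order as the times they denote. A sketch of the property being relied on:

import datetime

fmt = '%Y-%m-%d %H:%M:%S'
a = datetime.datetime(2016, 12, 1, 9, 5, 0)
b = datetime.datetime(2016, 12, 1, 10, 0, 0)
# zero-padded, fixed-width timestamps order exactly like the datetimes
assert (a.strftime(fmt) < b.strftime(fmt)) == (a < b)
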
    def step1(self, params):
        Logger.getlogging().info("LaohuComments.STEP_1")
        # 1. derive the site's subdomain from the original url
        field = self.r.parse('^http://(\w+)\.laohu\.com/.*', params.originalurl)[0]
        # forum pages
        if field == 'bbs':
            # build the url that yields the uniqid
            self.storeurl(params.originalurl, params.originalurl, LaohuComments.STEP_2_BBS, {'field': field})
        else:
            # non-forum pages, e.g. http://ff.laohu.com/201612/215072.html
            xhtml = XPathUtility(html=params.content)
            token = xhtml.getlist('//*[@id="t_token"]')[0]
            sourceId = self.r.getid('source_id', params.content, '\s*=\s*')
            # build the first comment-page url
            COMMENTS_URL = 'http://member.laohu.com/comment/show/?token=%s&oder=new'
            comments_url = COMMENTS_URL % token
            # ask the download platform to fetch the first page of comments
            self.storeurl(comments_url, params.originalurl, LaohuComments.STEP_2, {'token': token, 'sourceId': sourceId})
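
step1 is a small router keyed on the subdomain. The same dispatch written against the standard re module (the 'forum'/'news' labels are illustrative; the real code queues different STEP constants):

import re

def route(url):
    m = re.match(r'^http://(\w+)\.laohu\.com/', url)
    if not m:
        return None
    # 'bbs' pages take the forum path; everything else is a news page
    return 'forum' if m.group(1) == 'bbs' else 'news'

assert route('http://bbs.laohu.com/thread/1.html') == 'forum'
assert route('http://ff.laohu.com/201612/215072.html') == 'news'
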
Example #13
    def download(self):
        # poll the task's done-file listing and fetch any finished files
        # that have not been downloaded yet
        doneurl = TencentDownloader.DONE_FILE_URL.format(
            taskid=self.taskinfo.taskid)
        html = TencentDownloader.httpget(doneurl)
        if html:
            xparse = XPathUtility(html)
            for donefile in xparse.getlist(r'//tr/td[2]/a'):
                if donefile.endswith('done') and donefile not in self.downloadedfiles:
                    # only pull files that correspond to something we uploaded
                    for upfile in self.upload_file_list:
                        if donefile.startswith(upfile):
                            FileUtility.mkdirs(self.download_path)
                            self.execute(
                                TencentDownloader.DOWNLOAD_COMMAND.format(
                                    taskid=self.taskinfo.taskid,
                                    filename=donefile))
                            FileUtility.move('./' + donefile, self.download_path)
                            break
                    self.downloadedfiles.append(donefile)
        return tencentdownloader.TencentDownloader.download(self)
Example #14
    def step2news(self, params):
        Logger.getlogging().info("ZolbbsComments.STEP_2")
        kindid = params.customized['kindid']
        docurl = params.customized['docurl']
        xparser = XPathUtility(params.content)
        comments_count = int(xparser.getlist('//*[@class="comment-num"]')[0])
        # incremental check against the count from the last crawl
        cmtnum = CMTStorage.getcount(params.originalurl, True)
        if cmtnum >= comments_count:
            return
        NewsStorage.setcmtnum(params.originalurl, comments_count)

        page_num = int(
            math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
        if page_num >= self.maxpages:
            page_num = self.maxpages

        for page in range(1, page_num + 1):
            comment_url = self.COMMENT_URL_NEWS.format(kindid=kindid,
                                                       docurl=docurl,
                                                       page=page)
            self.storeurl(comment_url, params.originalurl,
                          ZolnewsComments.STEP_3)
    def geturlcomments(self, params):
        # extract the comment nodes
        xparser = XPathUtility(params.content)
        comments_xpath = xparser.xpath('//*[contains(@id, "cm_")]')
        if not comments_xpath:
            return

        # get the publish times
        ip_pubtimes_xpath = xparser.getlist('//*[contains(@id,"CList___CommentList_UserLink_")]/..')

        if len(comments_xpath) == len(ip_pubtimes_xpath):
            comments = []
            # collect the comments
            for index in range(len(comments_xpath)):
                cmti = CommentInfo()
                if URLStorage.storeupdatetime(params.originalurl, getuniformtime(ip_pubtimes_xpath[index])):
                    # keep only comments newer than the last crawl (compared by time)
                    cmti.content = comments_xpath[index].text
                    comments.append(cmti)

            # store the collected comments
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
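
The len(comments_xpath) == len(ip_pubtimes_xpath) guard protects the index-based pairing of comment nodes with time nodes; a mismatch usually means the page layout broke one of the two XPaths. When the lists do line up, zip expresses the pairing directly. A sketch under that assumption:

contents = ['first comment', 'second comment']
pubtimes = ['2016-12-01 10:00', '2016-12-01 11:30']

# pair element-wise only when both node lists have the same length;
# a mismatch means one of the XPaths no longer matches the page
if len(contents) == len(pubtimes):
    pairs = list(zip(contents, pubtimes))
    assert pairs[1] == ('second comment', '2016-12-01 11:30')
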
    def process(self, params):
        Logger.getlogging().info(params.url)
        try:
            if params.step == Rain8Comments.STEP_1:
                # Step 1: from the doc url, build the parameters for the first comment-page url
                articleId = self.r.parse('http://\w+\.tadu\.com/\w+/(\d+).*', params.originalurl)[0]

                # build and queue the first comment-page url
                comments_url = Rain8Comments.COMMENT_URL.format(articleId=articleId, page=1)
                self.storeurl(comments_url, params.originalurl, Rain8Comments.STEP_2, {'articleId': articleId})

            elif params.step == Rain8Comments.STEP_2:
                # read the comment parameters
                articleId = params.customized['articleId']

                # get the total comment count
                # comment_count = float(self.r.getid('total', params.content))
                xparser = XPathUtility(params.content)
                countstr = xparser.getstring('//h4')
                comment_count = 0
                if self.r.search(u'\d+', countstr):
                    # the count is taken from the second number in the heading
                    comment_count = int(self.r.parse(u'(\d+)', countstr)[1])
                if comment_count == 0:
                    return

                # incremental check against the count from the last crawl
                cmtnum = URLStorage.getcmtnum(params.originalurl)
                if cmtnum >= comment_count:
                    return
                URLStorage.setcmtnum(params.originalurl, comment_count)

                # number of comment pages
                totalPage = int(math.ceil(float(comment_count) / TaDuComments.PAGE_SIZE))

                # build and queue the comment-page url list
                for page in range(1, totalPage + 1):
                    url = TaDuComments.COMMENT_URL.format(articleId=articleId, page=page)
                    self.storeurl(url, params.originalurl, TaDuComments.STEP_3)

            elif params.step == TaDuComments.STEP_3:
                # Step 3: fetch every comment page queued in step 2 and extract the comments
                Logger.getlogging().info("params.step == 3")
                # all comment bodies
                xparser = XPathUtility(params.content)
                comments = xparser.getlist('//ul[@class="cmetlist bookreview-cmetlist"]/li/div/div[2]/p')

                # all comment times
                commenttimes = xparser.getlist('//ul[@class="cmetlist bookreview-cmetlist"]/li/div/div[2]/span')

                commentsInfo = []
                # walk the comments
                for index in range(len(comments)):
                    # extract the time (the first 3 characters are a label)
                    publicTime = commenttimes[index][3:]
                    cmti = CommentInfo()
                    tm = TimeUtility.getuniformtime(publicTime, '%Y-%m-%d %H:%M')
                    if URLStorage.storeupdatetime(params.originalurl, tm):
                        cmti.content = comments[index].strip()
                        commentsInfo.append(cmti)
                # store the collected comments
                if len(commentsInfo) > 0:
                    self.commentstorage.store(params.originalurl, commentsInfo)
            else:
                Logger.getlogging().error('proparam.step == {step}'.format(step = params.step))
        except Exception as e:
            traceback.print_exc()
    def process(self, proparam):
        Logger.getlogging().info(proparam.url)
        try:
            if proparam.step == ishangmanComments.STEP_1:
                # pull the parameters out of the url
                articleIds = re.findall(
                    r'^http://(\w+)\.ishangman\.com/\w+/(\d+)',
                    proparam.url)[0]
                articleId1 = articleIds[0]
                articleId2 = articleIds[1]
                # comment type
                commenttype = int(
                    self.r.parse(ur'commenttype = (.*);', proparam.content)[0])
                # first page of comments
                url = ishangmanComments.COMMENTS_URL % (articleId1, articleId2,
                                                        commenttype, 1)
                self.storeurl(
                    url, proparam.originalurl, ishangmanComments.STEP_2, {
                        'articleId1': articleId1,
                        'articleId2': articleId2,
                        'commenttype': commenttype
                    })

            elif proparam.step == ishangmanComments.STEP_2:
                articleId1 = proparam.customized['articleId1']
                articleId2 = proparam.customized['articleId2']
                commenttype = proparam.customized['commenttype']
                # get the comment count
                xhtml = XPathUtility(html=proparam.content)
                if articleId1 == 'comic':
                    comments_count = int(
                        xhtml.getlist(
                            '//*[contains(@class,"ismcartondiv1")]/p/strong')
                        [0])
                    if comments_count:
                        NewsStorage.setcmtnum(proparam.originalurl,
                                              comments_count)
                else:
                    comments_count = int(
                        self.r.parse(
                            ur'(\d+).*',
                            xhtml.getlist('//*[@class="comment_lctwidl"]/p')
                            [0])[0])
                    if comments_count:
                        NewsStorage.setcmtnum(proparam.originalurl,
                                              comments_count)
                # number of comment pages
                cmtnum = CMTStorage.getcount(proparam.originalurl, True)
                if comments_count == 0:
                    return
                page_num = int(
                    math.ceil(float(comments_count - cmtnum) / self.PAGE_SIZE))
                if page_num >= self.maxpages:
                    page_num = self.maxpages

                # queue the comment-page urls
                for page in range(1, page_num + 1):
                    url = ishangmanComments.COMMENTS_URL % (
                        articleId1, articleId2, commenttype, page)
                    self.storeurl(url, proparam.originalurl,
                                  ishangmanComments.STEP_3,
                                  {'articleId1': articleId1})

            elif proparam.step == ishangmanComments.STEP_3:
                try:
                    Logger.getlogging().debug(proparam.originalurl)
                    commentsInfo = []
                    articleId1 = proparam.customized['articleId1']
                    xparser = XPathUtility(proparam.content)
                    # extract the comments
                    if articleId1 == 'comic':
                        # comic comments: parse with BeautifulSoup
                        soup = BeautifulSoup(proparam.content, 'html5lib')
                        comments = soup.select('.ismcartondiv2')
                    else:
                        # forum comments
                        comments = xparser.getcomments(
                            '/html/body/div/span[2]/p[1]')
                        # forum comment times
                        updateTime = xparser.getcomments(
                            '/html/body/div/span[2]/div[1]')

                    # walk the comments
                    for index in range(len(comments)):
                        cmti = CommentInfo()
                        if articleId1 == 'comic':
                            publictime = self.r.parse(
                                ur'(\d{2}-\d+ \d+:\d+)',
                                comments[index].get_text())[0]
                            # publictime  = TimeUtility.getuniformtime(publictime)
                            if publictime:
                                # the stamp carries no year: months earlier than
                                # the current month are taken as this year;
                                # anything else falls back to 2016
                                cmt_month = publictime.split("-")[0]
                                curmonth = time.localtime().tm_mon
                                if int(cmt_month) < curmonth:
                                    publictime = TimeUtility.getcurrentdate()[0:4] + '-' + publictime
                                else:
                                    publictime = '2016' + '-' + publictime
                            curtime = TimeUtility.getuniformtime(publictime)
                            content = comments[index].get_text().split('\n')[0]

                            # # print comments;
                            # return
                            # content = self.r.parse(ur'class=\".*\"',comments[index].get_text())[0]
                            # nick = comments[1].get('nickname', 'anonymous')
                            #
                            # if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
                            #     CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
                            # if NewsStorage.storeupdatetime(proparam.originalurl, tm):
                            #     cmti.content = comments[index].get_text()
                            #     commentsInfo.append(cmti)
                        else:
                            publictime = updateTime[index][:-8]
                            #publictime = TimeUtility.getcurrentdate()[0:4] + '-'+ publictime
                            #tm = TimeUtility.getuniformtime(publictime, u'%Y-%m-%d %H:%M')
                            tm = getuniformtime(publictime)
                            if NewsStorage.storeupdatetime(
                                    proparam.originalurl, tm):
                                cmti.content = comments[index]
                                commentsInfo.append(cmti)

                    # store the collected comments
                    if len(commentsInfo) > 0:
                        self.commentstorage.store(proparam.originalurl,
                                                  commentsInfo)

                except:
                    Logger.printexception()
                    Logger.getlogging().error(
                        'extract comment error from {site}'.format(
                            site=proparam.url))
            else:
                Logger.getlogging().error("proparam.step == %d", proparam.step)

        except Exception as e:
            traceback.print_exc()
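
The comic branch in STEP_3 has to invent a year for stamps shaped like 'MM-DD HH:MM', and the snippet falls back to a hardcoded '2016'. A cleaner version of the same inference computes the previous year instead, on the reasoning that a month later than the current one cannot belong to the current year (a sketch, not the snippet's exact rule):

import datetime

def infer_year(stamp, now=None):
    # stamp like '12-01 10:30' (month-day hour:minute, no year)
    now = now or datetime.datetime.now()
    month = int(stamp.split('-')[0])
    # a month after the current one cannot be this year yet,
    # so the stamp must come from the previous year
    year = now.year if month <= now.month else now.year - 1
    return '%d-%s' % (year, stamp)

now = datetime.datetime(2017, 3, 15)
assert infer_year('02-01 09:00', now) == '2017-02-01 09:00'
assert infer_year('12-01 09:00', now) == '2016-12-01 09:00'
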