Esempio n. 1
0
    def getpagecomments(self, params):
        """Scan one search-result page: keep the links whose title contains
        the query keyword and whose publish date falls inside the
        configured look-back window, then store them."""
        query = params.customized['query']

        parser = XPathUtility(html=params.content)
        hrefs = parser.xpath('//*[@class="sosResult"]/strong/a/@href')
        titles = parser.getlist('//*[@class="sosResult"]/strong/a')
        pubtimes = parser.xpath('//*[@class="sosResult"]/span/cite[3]')

        today = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        urllist = []
        for index, title in enumerate(titles):
            # Skip results whose title does not contain the query keyword.
            if not Common.checktitle(query, title):
                continue
            # Normalize the scraped time and keep only the date part.
            datestr = TimeUtility.getuniformtime(
                pubtimes[index].text).split(' ')[0]
            pubdate = datetime.datetime.strptime(
                datestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
            # Keep only results published within the query period.
            if (today - pubdate).days <= int(self.querylastdays):
                newurl = self.preprocess(hrefs[index])
                if newurl is not None:
                    urllist.append(newurl)

        if urllist:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
Esempio n. 2
0
    def step2bbs(self, params):
        Logger.getlogging().info("Dm5Commnets.STEP_2")
        # 将STEP_1中的docurl传下来
        docurl = params.customized['docurl']

        comments_count = self.r.parse(ur'(\d+)个回复', params.content)[0]
        # 判断增量
        cmtnum = URLStorage.getcmtnum(params.originalurl)
        if cmtnum >= comments_count:
            return
        URLStorage.setcmtnum(params.originalurl, comments_count)

        # 总数除以page_size,然后加1,可得到评论总页数comments_count
        pagenum = 0
        xparser = XPathUtility(params.content)
        if not xparser.xpath('//*[@class="inkk ma5"]'):
            Logger.getlogging().warning('{0}:30001'.format(params.originalurl))
            return
        pageList = xparser.xpath('//*[@id="search_fy"]/a/text()')
        if not pageList:
            pagenum = 1
        else:
            pagenum = int(pageList[-2])

        for page in range(1, pagenum + 1, 1):
            comment_url = Dm5Commnets.COMMENT_URL.format(docurl=docurl,
                                                         page=page)
            self.storeurl(comment_url, params.originalurl,
                          Dm5Commnets.STEP_3_BBS)
    def getcomments_step2(self, params):
        """Work out how many comment pages still need fetching for this
        book and queue a POST url for each of them."""
        bookId = params.customized['bookId']

        parser = XPathUtility(html=params.content)
        page_counts = int(parser.xpath('//div[@class="page"]/@pagenum')[0])
        comments_count = int(parser.xpath('//div[@class="page"]/@total')[0])
        Logger.getlogging().debug(comments_count)
        if page_counts == 0:
            return

        # Incremental check: stop when no new comments have arrived.
        stored = CMTStorage.getcount(params.originalurl, True)
        if stored >= comments_count:
            return

        # Pages needed for the delta, capped at the configured maximum.
        pages = int(math.ceil(float(comments_count - stored) / self.PAGE_SIZE))
        if pages >= self.maxpages:
            pages = self.maxpages
        NewsStorage.setcmtnum(params.originalurl, comments_count)

        for page in range(1, pages + 1):
            self.storeposturl(PubComments.COMMENTS_URL, params.originalurl,
                              PubComments.STEP_3, {
                                  'bookId': bookId,
                                  'pageNum': page
                              })
Esempio n. 4
0
 def baidutiebasearch_step2(self, params):
     """STEP 2: process the first result page, derive the tail page
     number from the pager, and enqueue the remaining result-page urls."""
     # Step2: from the returned content, get the maximum total count
     # via xpath //*[@class="nums"].
     # Handle the first page of search results immediately.
     self.baidutiebasearch_step3(params)
     # Find the tail page number from the pager.
     xparser = XPathUtility(html=params.content)
     pager_search = xparser.xpath('//*[@class="pager pager-search"]')
     queryurl = ''
     if pager_search:
         tailpageurl = xparser.xpath('//*[@class="pager pager-search"]/a[last()]/@href')
         try:
             if tailpageurl:
                 # The last pager link looks like "...pn=<page>": the part
                 # before 'pn=' is the query-url prefix, the part after is
                 # the tail page number.
                 lists = tailpageurl[0].split('pn=')
                 queryurl = 'http://tieba.baidu.com'+lists[0]
                 tailpage = int(lists[1])
                 # Clamp to the site default and to the configured cap.
                 if tailpage > BaiduTiebaS2Query2.DEFAULT_MAX_PAGESIZE:
                     tailpage = BaiduTiebaS2Query2.DEFAULT_MAX_PAGESIZE
                 if tailpage > self.maxpages:
                     tailpage = self.maxpages
             else:
                 tailpage = 1
         except:
             # Any parse failure falls back to a single page.
             tailpage = 1
     else:
         # No search results at all: log and stop.
         Logger.log(params.url, constant.ERRORCODE_EXCEPTTION_JSON)
         return
     if not queryurl:
         return
     # Using tailpage, build the urls of every result page but the first.
     querylist = []
     for page in range(2, tailpage + 1, 1):
         url = queryurl + 'pn={page}'.format(page=page)
         querylist.append(url)
     self.__storeqeuryurllist__(querylist, BaiduTiebaS2Query2.BAIDU_TIEBA_SEARCH_EACH_PAGE)
    def getsearchresult(self, params):
        """Filter search results by title keyword and publish date and
        store the urls that fall inside the query window."""
        keyword = params.customized['query']

        parser = XPathUtility(html=params.content)
        hrefs = parser.xpath('//li/h3/a/@href')
        titles = parser.getlist('//li/h3/a')
        pubtimes = parser.xpath('//li/p')

        today = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        urllist = []
        for index, title in enumerate(titles):
            # Only titles containing the query keyword are considered.
            if not Common.checktitle(keyword, title):
                continue
            datestr = TimeUtility.getuniformdate(pubtimes[index].text)
            pubdate = datetime.datetime.strptime(
                datestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
            if (today - pubdate).days <= self.querylastdays:
                urllist.append(hrefs[index])
            else:
                # Results are sorted by time: once one matching entry is
                # outside the window, every later one is too.
                break

        if urllist:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Esempio n. 6
0
    def pageprocess(self, params):
        """Parse one search-result page: gather urls, publish times and
        titles, then keep the urls whose title matches the query keyword
        and whose publish time lies within the look-back window."""
        # Parse the page content.
        xparser = XPathUtility(params.content)
        # All result links on this page (normalized via preprocess).
        hreflist = xparser.xpath('//h3/a/@href')
        hrefs = []
        for mid_url in hreflist:
            mid = self.preprocess(mid_url)
            if mid is not None:
                hrefs.append(mid)

        # All publish times on this page.
        publictime = xparser.xpath('//*[@class="scontent"]/text()[1]')
        publicTimes = []
        for timeindex in publictime:
            middle = str(timeindex).replace('\n', '').replace('\t', '').strip()
            # Keep only the first two space-separated fields,
            # i.e. "<date> <time>".
            publicTimes.append(
                str(str(middle).split(' ')[0]) + ' ' +
                str(str(middle).split(' ')[1]))
        # All titles on this page.
        titles = []
        titles_list = xparser.getlist('//h3')
        for title in titles_list:
            mid_title = str(title).replace('\n', '').replace('\t', '').strip()
            titles.append(mid_title)
        # Query keyword (url-decoded).
        KEY_mid = params.customized['KEY']
        KEY = Common.urldec(KEY_mid)
        # Pattern used to match against the titles.
        titlePatten = KEY
        # Cut-off timestamp: `self.inputtime` days before now, rendered as
        # "YYYY-MM-DD HH:MM:SS" (fraction stripped) so it compares
        # lexicographically against the scraped time strings below.
        today = datetime.datetime.now()
        before_days = today + datetime.timedelta(-self.inputtime)
        before_arr = str(before_days).split('.')
        before_time = before_arr[0]

        urllist = []
        len_hrefs = len(hrefs)
        number = 0
        for index in publicTimes[:len_hrefs]:
            # Does the title contain the keyword?
            flg = Common.checktitle(titlePatten, str(titles[number]))
            # Published within the window AND title matched.
            if index > before_time and flg:
                url = hrefs[number]
                urllist.append(url)
            number = number + 1

        # Store the final url list.
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
    def step1(self, params):
        """Determine the total number of result pages for the query and
        queue every page after the first; page one is processed inline."""
        query = params.customized['query']
        parser = XPathUtility(params.content)
        # Bail out when the page carries no result table at all.
        if not parser.xpath('//*[@class="mytopic topiclisttr"]'):
            Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
            return

        pager = parser.getcomments('//span[@class="right"]/a')
        # A single pager entry means one result page; otherwise the
        # second-to-last entry holds the last page number.
        pageTotal = 1 if len(pager) == 1 else pager[-2]
        if int(pageTotal) >= self.maxpages:
            pageTotal = self.maxpages

        querylist = []
        for page in range(1, int(pageTotal) + 1):
            if page == 1:
                # The first page's content is already in hand.
                self.step2(params)
            else:
                querylist.append(
                    hupuS2Query.HUPU_QUERY_TEMPLATE.format(q=query, pn=page))
        self.__storeqeuryurllist__(querylist,
                                   hupuS2Query.HUPU_S2QUERY_EACH_PAGE,
                                   {'query': query})
    def geturlcomments(self, params):
        # 获取具体评论
        xparser = XPathUtility(params.content)
        comments_xpath = xparser.xpath('//*[@id="short_comment_content"]')
        if not comments_xpath:
            return

        # 获取发布时间
        ip_pubtimes_xpath = xparser.getlist('//*[@id="short_comment_left"]')

        if len(comments_xpath) == len(ip_pubtimes_xpath):
            comments = []
            # 获取评论
            for index in range(0, len(comments_xpath), 1):
                cmti = CommentInfo()
                publicTime = ip_pubtimes_xpath[index]
                if self.r.search(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime):
                    publicTime = '20' + self.r.parse(ur'\d{2}-\d+-\d+ \d+:\d+',
                                                     publicTime)[0]

                if self.r.search(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime):
                    publicTime = self.r.parse(ur'\d+/\d+/\d+ \d+:\d+:\d+',
                                              publicTime)[0]

                if URLStorage.storeupdatetime(params.originalurl,
                                              getuniformtime(publicTime)):
                    # 获取增加的评论(根据时间比较)
                    cmti.content = comments_xpath[index].text
                    comments.append(cmti)
    def step1(self, params):
        """STEP_1: derive the article id and class id, build the first
        comment-page url, and hand it to the download platform."""
        Logger.getlogging().info("DmOneTwoThreeNewsComments.STEP_1")
        # The numeric article id is the trailing path component of the url.
        news_id = self.r.parse('^http://www.dm123.cn/.*/(\d+).html', params.originalurl)[0]
        # The class id is embedded in a hidden form input on the page.
        class_id = XPathUtility(params.content).xpath("//input[@id='classid']/@value")[0]

        # 1. Compose the comment feed's first page (page=0) from the inputs.
        commentinfo_url = Dm123NewsComments.COMMENT_URL.format(page=0, classid=class_id, id=news_id)
        # Queue the comment home page (forum) for STEP_2.
        self.storeurl(commentinfo_url, params.originalurl, Dm123NewsComments.STEP_2, {'classid': class_id, 'id': news_id})
    def getkurlcomments(self, params):
        """Extract comments and their publish times, keep those newer than
        the last crawl, and store them.

        Bug fix: when there are more comment nodes than time nodes (the
        branch explicitly allows ``>=``), the original indexed the time
        list with the comment index and raised IndexError; the time list
        is now indexed relative to ``start`` so the trailing comments
        align with their times. When the lengths are equal (start == 0)
        behavior is unchanged.
        """
        xparser = XPathUtility(params.content)
        # Comment bodies.
        comments_xpath = xparser.xpath('//*[@class="page-pl-list-text"]')
        # Comment publish times.
        pubtime_xpath = xparser.xpath('//*[@class="page-pl-user-timer"]')

        if len(comments_xpath) >= len(pubtime_xpath):
            # Align the tail of the comment list with the time list.
            start = len(comments_xpath) - len(pubtime_xpath)
            comments = []
            for index in range(start, len(comments_xpath), 1):
                if URLStorage.storeupdatetime(
                        params.originalurl,
                        getuniformtime(pubtime_xpath[index - start].text)):
                    cmti = CommentInfo()
                    cmti.content = comments_xpath[index].text
                    comments.append(cmti)

            # Persist the newly found comments.
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
    def geturlcomments(self, params):
        """Pull the comment bodies and their matching publish times from
        the page and store any comments newer than the last crawl."""
        parser = XPathUtility(params.content)
        comment_nodes = parser.xpath('//*[contains(@id, "cm_")]')
        if not comment_nodes:
            return

        # Publish times live in the parent of each commenter link.
        time_texts = parser.getlist('//*[contains(@id,"CList___CommentList_UserLink_")]/..')

        # Only proceed when the two node lists line up one-to-one.
        if len(comment_nodes) == len(time_texts):
            fresh = []
            for node, when in zip(comment_nodes, time_texts):
                cmti = CommentInfo()
                if URLStorage.storeupdatetime(params.originalurl, getuniformtime(when)):
                    # Keep only comments newer than the stored timestamp.
                    cmti.content = node.text
                    fresh.append(cmti)

            # Persist the newly found comments.
            if len(fresh) > 0:
                self.commentstorage.store(params.originalurl, fresh)
    def process(self, params):
        """Dispatch on crawl step: build the first comment-page url (and
        record the per-episode click count), then page through all comment
        pages, then parse each comment page via step3."""
        try:
            if params.step is None:
                # Pull the id needed for the comment url out of the
                # original video url.
                if not self.r.search(
                        '^http[s]{0,1}://www\.fun\.tv/vplay/\w-(\d+)(\.\w-\d+)?/$',
                        params.originalurl):
                    return
                galleryid = self.r.parse(
                    '^http[s]{0,1}://www\.fun\.tv/vplay/\w-(\d+)(\.\w-\d+)?/$',
                    params.originalurl)[0][0]
                # First page of the comment feed.
                comments_url = FunComments.COMMENTS_URL % (galleryid, 1)
                # Ask the download platform for page one of the comments.
                self.storeurl(comments_url, params.originalurl,
                              FunComments.STEP_2, {'galleryid': galleryid})

                # Click count straight off the page; only TV series have
                # the multi-episode panel, everything else is direct.
                xhtml = XPathUtility(params.content)
                torrent_panel = xhtml.xpath('//*[@class="torrent-panel"]')
                if torrent_panel:
                    lis = xhtml.xpath('//*[@class="torrent-panel"]/ul/li')
                    if len(lis) == 0:
                        return
                    numobj = xhtml.xpath(
                        '//*[@class="playInfo crumbs"]/div/a[@class="exp-num"]'
                    )
                    if numobj:
                        clicknum = self.str2num(numobj[0].text)
                        # Average the total clicks over the episodes.
                        new_clicknum = int(clicknum) / len(lis)
                        NewsStorage.setclicknum(params.originalurl,
                                                new_clicknum)

            # Page one of the comments: loop to queue all comment urls.
            elif params.step == FunComments.STEP_2:
                galleryid = params.customized['galleryid']
                # JSON payload carrying the comment data.
                comments = json.loads(params.content)
                # Compare the stored comment count with the current one.
                curcmtnum = int(comments['data']['total_num'])
                NewsStorage.setcmtnum(params.originalurl, curcmtnum)
                dbcmtnum = CMTStorage.getcount(params.originalurl, True)
                if dbcmtnum >= curcmtnum:
                    return
                # Queue a url for every comment page still missing,
                # capped at the configured maximum.
                pages = int(
                    math.ceil(float(curcmtnum - dbcmtnum) / self.PAGE_SIZE))
                if pages >= self.maxpages:
                    pages = self.maxpages
                for page in range(1, pages + 1, 1):
                    if page == 1:
                        self.step3(params)
                        continue
                    commentUrl = FunComments.COMMENTS_URL % (galleryid, page)
                    self.storeurl(commentUrl, params.originalurl,
                                  FunComments.STEP_3, {'galleryid': galleryid})
            # Parse the comment data.
            elif params.step == FunComments.STEP_3:
                self.step3(params)
        except:
            Logger.printexception()
    def process(self, params):
        Logger.getlogging().info(params.url)
        try:
            if params.step is Xie17NewsComments.STEP_1:
                #Step1: 通过得到docurl,得到获取评论的首页url参数。
                articleId = self.r.parse('^http://xiaoshuo\.17xie\.com/book/(\d+)/', params.originalurl)[0]

                # 取得评论的url列表
                comments_url = Xie17NewsComments.COMMENT_URL % (articleId, 1)
                self.storeurl(comments_url, params.originalurl, Xie17NewsComments.STEP_2, {'articleId': articleId})

            elif params.step == Xie17NewsComments.STEP_2:
                # 获得评论参数
                articleId = params.customized['articleId']

                # 取得总件数
                comment_count = float(self.r.parse(ur'共(\d+)人说过', params.content)[0])
                if comment_count == 0:
                    return

                # 判断增量
                cmtnum = URLStorage.getcmtnum(params.originalurl)
                if cmtnum >= comment_count:
                    return
                URLStorage.setcmtnum(params.originalurl, comment_count)

                # 获取页数
                page = int(math.ceil(comment_count / Xie17NewsComments.PAGE_SIZE))

                # 获得url列表
                for page in range(1, page + 1, 1):
                    url = Xie17NewsComments.COMMENT_URL % (articleId, page)
                    self.storeurl(url, params.originalurl, Xie17NewsComments.STEP_3)

            elif params.step == Xie17NewsComments.STEP_3:
                # Step3: 通过Step2设置的url,得到所有评论,抽取评论
                Logger.getlogging().info("params.step == 3")
                xparser = XPathUtility(params.content)
                # 取得所有评论
                comments = xparser.getcomments('/html/body/ul/li[2]/dl/dd')
                # 取得所有评论时间
                commenttimes = xparser.xpath('/html/body/ul/li[2]/dl/dt/text()')

                commentsInfo = []
                # 取得所有评论
                for index in range(0, int(len(commenttimes)), 1):
                    # 提取时间
                    if self.r.search(ur'\d+年\d+月',commenttimes[index].strip()):
                        tm = TimeUtility.getuniformtime(str(commenttimes[index]).strip(), '%Y年%m月')
                    else:
                        tm = getuniformtime(commenttimes[index].strip())

                    if URLStorage.storeupdatetime(params.originalurl, tm):
                        cmti = CommentInfo()
                        comment = comments[index * 3] + comments[index * 3 + 1] + comments[index * 3 + 2]
                        cmti.content = comment
                        commentsInfo.append(cmti)

                    # 保存获取的评论
                if len(commentsInfo) > 0:
                    self.commentstorage.store(params.originalurl, commentsInfo)
            else: