Example #1
    def step2(self, params):
        """"""
        try:
            key = params.customized['key']
            soup = BeautifulSoup(params.content, 'html5lib')
            searchListOne = soup.select('.searchListOne > ul > li > div')
            if not searchListOne:
                Logger.getlogging().warning('{}:40000 No urllist'.format(
                    params.originalurl))
                return
            lis = soup.select(
                '.searchListOne > ul > li'
            )[:-1]  # drop the trailing <li id=search_msg style="display:none"></li>
            urllist = []
            for li in lis:
                url = li.select_one('h3 > a').get('href')
                tm = li.select('.source > span')[0].get_text()
                tm = getuniformtime(tm)
                now = getuniformtime(str(time.time()))
                cmt_num = li.select('.source > span')[-1].get_text()

                title = li.select_one('h3').get_text()
                if Common.checktitle(Common.urldec(key), title):
                    if compareNow(tm, self.querylastdays):
                        urllist.append(url)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
        except:
            Logger.printexception()
            Logger.getlogging().error(
                'extract comment error from {site}'.format(site=params.url))
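
Most of the search-step examples on this page share one shape: parse params.content with BeautifulSoup, normalize each timestamp with getuniformtime, keep the results that pass Common.checktitle and compareNow, and hand the survivors to self.__storeurllist__. A minimal sketch of that shape, assuming the project helpers behave as they are used above; the selectors '.result-item' and '.date' are hypothetical stand-ins for each site's real markup:

    def step2_sketch(self, params):
        """Minimal sketch of the shared search-step pattern; selectors are hypothetical."""
        key = Common.urldec(params.customized['key'])
        soup = BeautifulSoup(params.content, 'html5lib')
        urllist = []
        for item in soup.select('.result-item'):  # hypothetical result selector
            link = item.select_one('h3 > a')
            if not link:
                continue
            title = link.get_text()
            # normalize the timestamp; '.date' is a hypothetical selector
            tm = getuniformtime(item.select_one('.date').get_text())
            # keep results whose title matches the query and whose time is recent enough
            if Common.checktitle(key, title) and compareNow(tm, self.querylastdays):
                urllist.append(link.get('href'))
        if urllist:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)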
Example #2
    def step3news(self, params):
        Logger.getlogging().info("ZolbbsComments.STEP_3")
        # Step3: using the urls set in Step2, fetch all comments and extract them
        xparser = XPathUtility(params.content)
        commentsinfo = xparser.getcomments(
            '//*[@class="comment-list-new"]//*[@class="commli"]/p')
        commentstime = xparser.getcomments(
            '//*[@class="comment-list-new"]//*[@class="published-time"]')
        commentsnick = xparser.getcomments(
            '//*[@class="comment-list-new"]//*[@class="user-name"]')
        # extract the comments and record the actual comment count
        for index in range(0, len(commentstime), 1):
            # extract the time
            tm = commentstime[index].strip()
            try:
                curtime = TimeUtility.getuniformtime(getuniformtime(tm),
                                                     u'%Y-%m-%d %H:%M')
            except Exception:
                curtime = getuniformtime(tm)

            # extract the comment content
            content = commentsinfo[index]
            nick = commentsnick[index]
            if not CMTStorage.exist(params.originalurl, content, curtime,
                                    nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
    def process(self, params):
        try:
            if params.step == ThepaperComments.STEP_1:
                # pull the parameter for building the comment url off the tail of the url
                contid = params.originalurl.split('_')[-1]
                # build the initial comments url
                comments_url = ThepaperComments.SOURCE_COMMENTS_URL.format(
                    contid=contid)
                # ask the download platform to fetch the first page of comments
                self.storeurl(comments_url, params.originalurl,
                              ThepaperComments.STEP_2, {'contid': contid})

            elif params.step == ThepaperComments.STEP_2:
                contid = params.customized['contid']
                soup = BeautifulSoup(params.content, 'html5lib')
                divs = soup.find_all(attrs={
                    'id': re.compile('comment'),
                    'class': 'comment_que'
                })
                if not divs:
                    return

                # urls that carry a startId parameter skip the first div
                start = 1 if self.r.search(ur'startId=(.*)', params.url) else 0
                for index in range(start, len(divs)):
                    tm = divs[index].select_one(
                        '.aqwright > h3 > span').get_text()
                    curtime = getuniformtime(tm)
                    content = divs[index].select_one(
                        '.aqwright > .ansright_cont > a').get_text()
                    nick = divs[index].select_one(
                        '.aqwright > h3 > a').get_text()
                    if not CMTStorage.exist(params.originalurl, content,
                                            curtime, nick):
                        CMTStorage.storecmt(params.originalurl, content,
                                            curtime, nick)

                if self.r.search(ur'startId=(.*)', params.url):
                    hotIds = params.customized['hotIds']
                else:
 def getComments(self, params, url):
     # current comment page number
     pg = self.r.parse(url, params.url)[0]
     soup = BeautifulSoup(params.content, 'html5lib')
     # post content blocks
     infos = soup.select('tr > td.postcontent')
     # publish time, shown on the page as [发表于 2016-10-7 18:04:25] ("posted at ...")
     comments = []
     # on page 1 the first entry is the post body itself, not a comment
     if pg == '1':
         start = 1
     else:
         start = 0
     for info in infos[start:]:
         # main comments only
         if info.select_one('div[class="postmessage defaultpost"]'):
             content = info.select_one('div[class="postmessage defaultpost"]').get_text()\
                 .replace('\t','').replace('\n','').replace(' ','').strip()
             updatetime = info.select_one(
                 'div.postinfo > font').get_text().strip()[4:] + ':00'
             curtime = getuniformtime(updatetime)
             nick = 'none'
             if not CMTStorage.exist(params.originalurl, content, curtime,
                                     nick):
                 CMTStorage.storecmt(params.originalurl, content, curtime,
                                     nick)
     comments_count = CMTStorage.getcount(params.originalurl)
     NewsStorage.setcmtnum(params.originalurl, comments_count)
    def step3bbs(self, params):
        Logger.getlogging().info("Tmtpostcommnets.STEP_3")
        # Step3: using the urls set in Step2, fetch all comments and extract them
        commentsinfo = json.loads(params.content)
        comments = []


        jsondata = commentsinfo['data']
        if not jsondata:
            return
        for data in jsondata:
            cmti = CommentInfo()
            cmti.content = data['comment']
            tm = gettimeutil.getuniformtime(data['time_created'])
            if URLStorage.storeupdatetime(params.originalurl, tm):
                comments.append(cmti)

        # store the collected comments
        if len(comments) > 0:
            self.commentstorage.store(params.originalurl, comments)
Example #6
 def step2(self, params):
     soup = BeautifulSoup(params.content, 'html5lib')
     if soup.find(attrs={"id":re.compile('noresult_part._container')}):
         Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
         return 
     results = soup.select('.results > .vrwrap')
     if not results:
         Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
         return 
     urllist = []
     newurllist = []
     for item in results:
         try:
             if not item.select_one('h3.vrTitle > a'):
                 continue
             title = item.select_one('h3.vrTitle > a').get_text()
             href = item.select_one('h3.vrTitle > a').get('href')
             timestr = item.select_one('.news-detail > .news-info > .news-from').get_text()
             times = getuniformtime(timestr)
             Logger.getlogging().debug('title:'+ title)
             Logger.getlogging().debug('time:'+ times)
             if compareNow(times, self.querylastdays):
                 Logger.getlogging().debug('href:'+ href)
                 urllist.append(href)
             newitem = item.select_one('#news_similar')
             if newitem:
                 newhref = 'http://news.sogou.com/news'+newitem.get('href')
                 Logger.getlogging().debug('newhref:'+ newhref)
                 newurllist.append(newhref)
         except:
             Logger.printexception()
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)      
     if len(newurllist) > 0:
         self.__storeqeuryurllist__(newurllist, self.NEWS_EACH_2)
    def geturlcomments(self, params):
        # extract the comments themselves
        xparser = XPathUtility(params.content)
        comments_xpath = xparser.xpath('//*[@id="short_comment_content"]')
        if not comments_xpath:
            return

        # extract the publish times
        ip_pubtimes_xpath = xparser.getlist('//*[@id="short_comment_left"]')

        if len(comments_xpath) == len(ip_pubtimes_xpath):
            comments = []
            # walk the comments
            for index in range(0, len(comments_xpath), 1):
                cmti = CommentInfo()
                publicTime = ip_pubtimes_xpath[index]
                if self.r.search(ur'\d{2}-\d+-\d+ \d+:\d+', publicTime):
                    publicTime = '20' + self.r.parse(ur'\d{2}-\d+-\d+ \d+:\d+',
                                                     publicTime)[0]

                if self.r.search(ur'\d+/\d+/\d+ \d+:\d+:\d+', publicTime):
                    publicTime = self.r.parse(ur'\d+/\d+/\d+ \d+:\d+:\d+',
                                              publicTime)[0]

                if URLStorage.storeupdatetime(params.originalurl,
                                              getuniformtime(publicTime)):
                    # keep only newly added comments (compared by time)
                    cmti.content = comments_xpath[index].text
                    comments.append(cmti)
Example #8
 def step2_2(self, params):
     """"""
     try:
         jsondata = json.loads(params.content)
         data = jsondata['data']
         soup = BeautifulSoup(data, 'html5lib')
         divs = soup.select('.comment')
     except:
         Logger.getlogging().warning(
             '{url}:30000 No comments'.format(url=params.originalurl))
         return
     comments = []
     for div in divs:
         cmti = CommentInfo()
         cmti.content = div.find(attrs={
             'style': re.compile('padding-top')
         }).get_text().strip()
         tm = div.select_one('.show-time').get_text()
         tm = getuniformtime(tm)
         if not tm:
             continue
         if URLStorage.storeupdatetime(params.originalurl, tm):
             comments.append(cmti)
     if len(comments) > 0:
          # store the collected comments
         self.commentstorage.store(params.originalurl, comments)
 def step2(self, params):
     """"""
     q = params.customized['query']
     soup = BeautifulSoup(params.content, 'html5lib')
     divs = soup.select('.videobox')
     if not divs:
         Logger.log(params.originalurl,
                    constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     urllist = []
     for div in divs:
         title = div.select_one('.title').get_text()
         tm = getuniformtime(div.select_one('.date').get_text())
         url = div.select_one('.title > a').get('href')
         Logger.getlogging().debug(title)
         if not compareNow(tm, self.querylastdays):
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
             continue
         if not Common.checktitle(Common.urldec(q), title):
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             continue
         urllist.append(url)
          # collect the final url list
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example #10
 def baidutiebasearch_step3(self, params):
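      """Parse Baidu Tieba search results and store post urls whose publish time is recent enough."""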
     soup = BeautifulSoup(params.content, 'html5lib')
     post_list = soup.select('.s_post_list > .s_post')
     urllist = []
     for item in post_list:
         try:
             title = item.select_one('.p_title > a').get_text().strip()
             href = item.select_one('.p_title > a').get('href') 
             pubtimeobj = item.find(attrs={'class':'p_green p_date'})
             if not pubtimeobj:
                 Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
                 continue
             pubtime = pubtimeobj.get_text()
             pubtime = getuniformtime(pubtime)
             Logger.getlogging().debug(title)
             Logger.getlogging().debug(pubtime)
             if self.isyestoday(pubtime):
                 Logger.getlogging().debug('https://tieba.baidu.com'+href)
                 urllist.append('https://tieba.baidu.com'+href) 
             else:
                 Logger.log(params.url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
         except:
             Logger.printexception()
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)        
Example #11
    def process(self, params):
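        """Dispatch the ChinabyteComments steps: find the thread id, page through the comment api, then extract comments."""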
        try:
            if params.step == ChinabyteComments.STEP_1:
                threadid = self.r.parse('data-thread-key=\"(.*?)\"',
                                        params.content)
                if not threadid:
                    return
                comments_url = ChinabyteComments.COMMENTS_URL % (threadid[0],
                                                                 1)
                self.storeurl(comments_url, params.originalurl,
                              ChinabyteComments.STEP_2, {
                                  'threadid': threadid[0],
                                  'pageno': 1
                              })

            elif params.step == ChinabyteComments.STEP_2:
                try:
                    threadid = params.customized['threadid']
                    comments = json.loads(params.content)
                    pagetotal = int(comments['cursor']['pages'])
                except:
                    Logger.getlogging().warning('{0}:30000'.format(
                        params.originalurl))
                    return
                # pages == 0 means there are no comments
                if pagetotal == 0:
                    return
                for page in range(1, pagetotal + 1, 1):
                    comments_url = ChinabyteComments.COMMENTS_URL % (threadid,
                                                                     page)
                    self.storeurl(comments_url, params.originalurl,
                                  ChinabyteComments.STEP_3)
            elif params.step == ChinabyteComments.STEP_3:
                comments = []
                commentinfo = json.loads(params.content)
                for key in commentinfo['parentPosts'].keys():
                    updatetime = getuniformtime(
                        commentinfo['parentPosts'][key]['created_at'])
                    if URLStorage.storeupdatetime(params.originalurl,
                                                  updatetime):
                        cmti = CommentInfo()
                        cmti.content = commentinfo['parentPosts'][key][
                            'message']
                        comments.append(cmti)
                if len(comments) > 0:
                    self.commentstorage.store(params.originalurl, comments)
        except:
            Logger.printexception()
    def geturlcomments(self, params, startpos=0):
        # collect all comments
        soup = BeautifulSoup(params.content, 'html5lib')
        comments = soup.select('.info')
        commentTimes = soup.select('.date')
        commentsInfo = []

        # //*[contains(@id,"postmessage_")]
        if len(comments) <= 0:
            tds = soup.select(
                'td.plc')  # soup.find_all("td", attrs={"class": "plc"})
            if tds is None:
                return
            for td in tds:
                timestr = td.find(attrs={'id': re.compile('authorposton')})
                if not timestr:
                    continue
                commentTimes = getuniformtime(timestr.get_text())
                if URLStorage.storeupdatetime(params.originalurl,
                                              commentTimes):
                    contents = td.find(
                        attrs={'id': re.compile('postmessage_')})
                    if contents:
                        cmti = CommentInfo()
                        cmti.content = contents.get_text()
                        commentsInfo.append(cmti)

        else:
            # walk every comment
            for index in range(startpos, int(len(comments)), 1):
                # extract the time
                cmti = CommentInfo()
                publicTime = getuniformtime(
                    commentTimes[index].get_text()).strip()
                #publicTime = self.r.parse(ur'发表于(.*)', publicTime)[0].strip()
                tm = TimeUtility.getuniformtime(
                    TimeUtility.getuniformtime(publicTime, u'%Y-%m-%d %H:%M'))
                if URLStorage.storeupdatetime(params.originalurl, tm):
                    cmti.content = comments[index].get_text()
                    commentsInfo.append(cmti)

        if len(commentsInfo) > 0:
            self.commentstorage.store(params.originalurl, commentsInfo)
    def step3(self, params):
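        """Store every comment from the JSON response that is not saved yet."""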
        jsondata = json.loads(params.content)
        for comment in jsondata['data']:

            content = comment['content']
            commentid = comment['id']
            curtime = getuniformtime(comment['comment_time'])
            nick = "none"
            if not CMTStorage.exist(params.originalurl, content, curtime,
                                    nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)
 def process(self, params):
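     """Step 1: page through the search results; Step 2: keep hits whose title and time match the query."""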
     if params.step == S2Query.STEP_1:
         html = etree.HTML(params.content)
         results = html.xpath('//*[@id="results"]')
         if not results:
             return
         totalpage = html.xpath('//*[@id="div_3"]/*[@class="page"]/span/text()')
         if totalpage:
             totalpage = self.r.parse('(\d+)', totalpage[0].split('/')[-1])[0]
         else:
             Logger.getlogging().info("there are no results you want!")
             return
             
         urllist = []
         if int(totalpage) >= self.maxpages:
             totalpage = self.maxpages
         if int(totalpage) > 0:
             for pages in range(0, int(totalpage)):
                 searchurl = S2Query.S2_URL % (pages + 1, params.customized['key'])
                 urllist.append(searchurl)
             # store the list once, after the loop, so earlier pages are not stored repeatedly
             self.__storeqeuryurllist__(urllist, S2Query.STEP_2, {'key': params.customized['key']})
         else:
             return
     elif params.step == S2Query.STEP_2:
         comquerkey = Common.urldec(params.customized['key']).decode('gbk').encode('utf-8')
         soup = BeautifulSoup(params.content, 'html5lib')
         urllist = []
         divs = soup.find_all(attrs={'class':'result f s0'})
         if not divs:
             return
         for div in divs:
             title = div.select_one('h3.c-title').get_text()
             title = ''.join(title.strip().split())
             url_tm = div.select_one('.c-showurl').get_text()
             
             tm = getuniformtime(url_tm.split('/')[-1])
             url = 'http://'+'/'.join(url_tm.split('/')[0:-1])
             Logger.getlogging().debug(title)
             if not Common.checktitle(comquerkey, title):
                 Logger.getlogging().warning('{url}:40000 out of range, the title!'.format(url=params.originalurl))
                 continue
             if not compareNow(tm, self.querylastdays):
                 Logger.getlogging().warning('{url}:40000 out of range, the time!'.format(url=params.originalurl))
                 continue
             urllist.append(url)
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
    def step2(self, params):
        """获取评论,和评论的评论url"""
        item = params.customized['item']
        artId = params.customized['artId']
        page = params.customized['page']

        # step1: locate .atl-main > .atl-item and take each main comment's body, time, replyid and child-comment count
        # step2: derive the child-comment page count pageNum from the child-comment count / 10
        # step3: build the child-comment urls from item, replyId and pageNum

        soup = BeautifulSoup(params.content, 'html5lib')
        alt_items = soup.select('.atl-main > .atl-item')

        if page == 1:
            alt_items = alt_items[1:]

        for alt_item in alt_items:
            curtime = alt_item.select('.atl-head > div.atl-info > span')
            curtime = getuniformtime(curtime[-1].get_text())

            main_comment = alt_item.select_one('.bbs-content').get_text()
            replyid = alt_item.select_one('a[class="reportme a-link"]').get(
                'replyid')

            content = main_comment.strip()
            commentid = replyid
            nick = alt_item.select_one('a[class="js-vip-check"]').get_text()
            if not CMTStorage.exist(params.originalurl, content, curtime,
                                    nick):
                CMTStorage.storecmt(params.originalurl, content, curtime, nick)

            child_comment_num = alt_item.select_one(
                'a[class="a-link-2 ir-remark"]').get_text()
            if self.r.search('\d+', child_comment_num):
                child_comment_num = self.r.parse('\d+', child_comment_num)[0]
            else:
                # no child comments for this entry
                continue
            pageNum = int(math.ceil(float(child_comment_num) / self.page_size))
            for child_page in range(1, int(pageNum) + 1):
                child_url = self.COMMENTS_CHILD_URL.format(item=item,
                                                           artId=artId,
                                                           replyId=replyid,
                                                           page=child_page)
                self.storeurl(child_url, params.originalurl,
                              self.STEP_COMMENT_CHILD_PAGE, {
                                  'item': item,
                                  'artId': artId
                              })
 def getcontents(self, proparam):
     # regexes that pull the comments, times and nicks out of the escaped payload
     comments = re.findall(r'<p>(.+?)<\\/p>', proparam.content)
     ctime = re.findall(r'<span class=\\"date\\">(.+?)<\\/span>', proparam.content)
     nicks = re.findall(r'class=\\"author-name\\">(.+?)<\\/a>', proparam.content)
     # walk the comments
     for index in range(0, len(comments)):
         curtime = getuniformtime(eval('u"' + ctime[index] + '"'))
         content = eval('u"' + comments[index] + '"').encode('utf-8')
         nick = eval('u"' + nicks[index] + '"').encode('utf-8')
         if not CMTStorage.exist(proparam.originalurl, content, curtime, nick):
             CMTStorage.storecmt(proparam.originalurl, content, curtime, nick)
    def step3bbs(self, params):
        Logger.getlogging().info("Ea3wcomments.STEP_3")
        # Step3: using the urls set in Step2, fetch all comments and extract them
        xparser = XPathUtility(params.content)
        commentsinfo = xparser.getcomments('//p[@class="comment-content"]')
        commentstime = xparser.getcomments('//span[@class="time"]')
        comments = []

        # walk the comments
        for index in range(0, int(len(commentsinfo)), 1):
            # extract the time
            cmti = CommentInfo()
            cmti.content = commentsinfo[index]

            # '刚刚' means "just now"
            if str(commentstime[index]).strip().decode("utf8") == '刚刚'.decode(
                    "utf8"):
                tm = getuniformtime(str(datetime.datetime.now()))
            else:
                tm = getuniformtime(str(commentstime[index]))
            if URLStorage.storeupdatetime(params.originalurl, tm):
                comments.append(cmti)
        # store the collected comments
        if len(comments) > 0:
            self.commentstorage.store(params.originalurl, comments)
Example #18
 def step3(self, params):
      """Extract comment text and time from each .commertItem and store the new ones."""
      soup = BeautifulSoup(params.content, 'html5lib')
     items = soup.select('.commertItem')
     comments = []
     for item in items:
         tm = item.select_one('.comment-time').get_text()
         updatetime = getuniformtime(tm)
         comment = item.select_one('.recTxt').get_text()
         if URLStorage.storeupdatetime(params.originalurl, updatetime):
             cmti = CommentInfo()
             cmti.content = comment
             comments.append(cmti)      
     if len(comments) > 0:
         self.commentstorage.store(params.originalurl, comments)            
 def getanswers(self, params):
     """"""
     soup = BeautifulSoup(params.content, 'html5lib')
     answers = soup.select('.answer-wrapper > .answer-item')
     # comments = []
     for answer in answers:
         tm = answer.select_one('.user').get_text()
         curtime = getuniformtime(tm)
          # lasttime = CMTStorage.getlastpublish(params.originalurl, True)
          # an incremental fetch would compare against the newest stored comment time:
          # if curtime > lasttime:
         content = answer.select_one('.content').get_text()
         nick = "none"
         if not CMTStorage.exist(params.originalurl, content, curtime,
                                 nick):
             CMTStorage.storecmt(params.originalurl, content, curtime, nick)
 def step3(self, params):
     """获取新闻类的url列表"""
     key = Common.urldec(params.customized['query'])
     soup = BeautifulSoup(params.content, 'html5lib')
     lis = soup.select('.wzlist > ul > li.wztitle')
     if lis:
         urllist = []
         for li in lis:
             title = li.select_one('a').get_text()
             # if key not in title:
             if not Common.checktitle(key, title):
                 continue
             pubtime = li.select_one('span').get_text()
             url = 'http://www.52tian.net' + li.select_one('a').get('href')
             if compareNow(getuniformtime(pubtime), self.querylastdays):
                 urllist.append(url)
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example #21
    def step5bbs(self, params):
        # Step3: using the urls set in Step2, fetch all comments and extract them
        soup = BeautifulSoup(params.content, 'html.parser')
        commentsinfo = soup.select('.cpl_nrr2')
        commentstime = soup.select('.cpl_nrr1')
        comments = []

        # walk the comments
        for index in range(0, int(len(commentsinfo) - 1), 1):
            # extract the time
            cmti = CommentInfo()
            cmti.content = commentsinfo[index].get_text()
            publicTime = self.r.parse(
                ur'发表于 (.*)', commentstime[index].get_text().strip())[0]
            publicTime = getuniformtime(publicTime)
            if URLStorage.storeupdatetime(params.originalurl, publicTime):
                comments.append(cmti)
        # store the collected comments
        if len(comments) > 0:
            self.commentstorage.store(params.originalurl, comments)
 def setpubtime(self, params):
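      """Extract the publish date from chanye.18183.com pages and save it to NewsStorage."""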
     newtime = None
     if re.search('http://chanye\.18183\.com/.*', params.url):
         Xhtml = XPathUtility(params.content)
         timestr = Xhtml.getstring(
             '//*[@class="arc-other"]/span[3]|//*[@class="other"]/span[3]')
         if not timestr:
             return
         p = '(\d{2}-\d+-\d+)'
         if re.search(p, timestr):
             new = str(time.localtime()[0])[0:2] + re.findall(p, timestr)[0]
             newtime = getuniformtime(new)
     #if re.search('http://bbs\.18183\.com/.*',params.url):
     #Xhtml = XPathUtility(params.content)
     #timestr = Xhtml.getstring('//*[@class="authi"]/em')
     #if not timestr:
     #return
     #times = timestr.split(u'发表于')[1]
     #newtime = TimeUtility.getuniformtime(times)
     if newtime:
         NewsStorage.setpublishdate(params.originalurl, newtime)
    def getkurlcomments(self, params):
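        """Pair each comment text with its publish time and store the new ones."""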
        xparser = XPathUtility(params.content)
        # comment list
        comments_xpath = xparser.xpath('//*[@class="page-pl-list-text"]')
        # comment publish times
        pubtime_xpath = xparser.xpath('//*[@class="page-pl-user-timer"]')

        if len(comments_xpath) >= len(pubtime_xpath):
            start = len(comments_xpath) - len(pubtime_xpath)
            comments = []
            for index in range(start, len(comments_xpath), 1):
                if URLStorage.storeupdatetime(
                        params.originalurl,
                        getuniformtime(pubtime_xpath[index].text)):
                    cmti = CommentInfo()
                    cmti.content = comments_xpath[index].text
                    comments.append(cmti)

            # store the collected comments
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
 def step2(self, params):
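      """Extract answer content, time and author nick from the list items and store new comments."""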
     try:
         soup = BeautifulSoup(params.content, 'html5lib')
         items = soup.select('.List > div > .List-item')
         if not items:
             return
         for item in items:
             times = item.select_one('.ContentItem-time').get_text()
             content = item.find(
                 attrs={
                     "class": "RichText CopyrightRichText-richText"
                 }).get_text()
             curtime = getuniformtime(times)
             nick = item.select_one(
                 '.ContentItem-meta > .AnswerItem-meta > .AuthorInfo'
             ).get_text()
             if not CMTStorage.exist(params.originalurl, content, curtime,
                                     nick):
                 CMTStorage.storecmt(params.originalurl, content, curtime,
                                     nick)
     except:
         Logger.printexception()
 def step2(self, params):
     """"""
     info = params.customized['query']
     info = Common.urldec(info)
     soup = BeautifulSoup(params.content, 'html5lib')
     videos = soup.select('.uiVideo > .uiVideo__item')
     if videos:
         urllist = []
         for video in videos:
             title = video.select_one('h3 > a').get('title')
             pubtime = video.select('.result__data > span')[-1].get_text()
             url = video.select_one('h3 > a').get('href')
             # if not info in title:
             if compareNow(getuniformtime(pubtime), self.querylastdays):
                 if Common.checktitle(info, title):
                     urllist.append(url)
                 else:
                     Logger.log(url,
                                constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             else:
                 Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example #26
    def getsearchresult(self, params):
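        """Collect result urls matching the query and time window; return how many were stored, or -1 if there were no results."""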
        info = params.customized['query']
        soup = BeautifulSoup(params.content, 'html5lib')
        lis = soup.select('ul.ckl_cktpp > li.cfix')
        urllist = []
        if lis:
            for li in lis:
                title = li.select_one('h3').get_text()
                # if info not in title:
                if not Common.checktitle(info, title):
                    continue
                times = li.select('p')[-2].get_text()
                times = getuniformtime(times)
                url = li.select_one('h3 > a').get('href')
                if compareNow(times, self.querylastdays):
                    urllist.append(url)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)

            return len(urllist)
        else:
            return -1
 def step2(self, params):
     """"""
     try:
         key = params.customized['key']
         key = Common.urldec(key)
         soup = BeautifulSoup(params.content, 'html5lib')
         books = soup.select('#searchResult > .book')
         if books:
             urllist = []
             for book in books:
                 title = book.select_one('h3 > a').get_text()
                 if key not in title:
                     continue
                 pubtime = book.select('.w_auth')[-2].get_text()
                 url = book.select_one('h3 > a').get('href')
                 if compareNow(getuniformtime(pubtime), self.querylastdays):
                     urllist.append(url)
             self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)
     except:
         Logger.printexception()
         Logger.getlogging().error(
             'extract comment error from {site}'.format(site=params.url))
    def geturlcomments(self, params):
        # extract the comments themselves
        xparser = XPathUtility(params.content)
        comments_xpath = xparser.xpath('//*[contains(@id, "cm_")]')
        if not comments_xpath:
            return

        # extract the publish times
        ip_pubtimes_xpath = xparser.getlist('//*[contains(@id,"CList___CommentList_UserLink_")]/..')

        if len(comments_xpath) == len(ip_pubtimes_xpath):
            comments = []
            # walk the comments
            for index in range(0, len(comments_xpath), 1):
                cmti = CommentInfo()
                if URLStorage.storeupdatetime(params.originalurl, getuniformtime(ip_pubtimes_xpath[index])):
                    # keep only newly added comments (compared by time)
                    cmti.content = comments_xpath[index].text
                    comments.append(cmti)

            # store the collected comments
            if len(comments) > 0:
                self.commentstorage.store(params.originalurl, comments)
 def get_comment_reply_step3(self, params):
     """"""
     try:
         jsondata = json.loads(params.content)
         data = jsondata['data']
         if data:
             comment_list = data['comment_list']
             for comment_id in comment_list:
                 try:
                     comments = comment_list[comment_id]
                     comment_list_num = comments['comment_list_num']
                     if int(comment_list_num) <= 0:
                         continue
                     for info in comments['comment_info']:
                         curtime = getuniformtime(str(info['now_time']))
                         content = info['content']
                         if not CMTStorage.exist(params.originalurl,
                                                 content, curtime, 'nick'):
                             CMTStorage.storecmt(params.originalurl,
                                                 content, curtime, 'nick')
                 except:
                     Logger.printexception()
     except:
         Logger.printexception()
Example #30
    def process(self, params):
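        """Fetch the comment sid, page through the comment api for new comments, and store them."""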
        try:
            if params.step == AllComments.STEP_1:
                comments_url = AllComments.SID_URL % (params.originalurl)
                self.storeurl(comments_url, params.originalurl,
                              AllComments.STEP_2)

            elif params.step == AllComments.STEP_2:
                try:
                    threadid = self.r.parse('"sid":(.*?),"', params.content)[0]
                    curcmtnum = int(
                        self.r.parse('"postcount":(.*?),"', params.content)[0])
                    NewsStorage.setcmtnum(params.originalurl, curcmtnum)
                    dbcmtnum = CMTStorage.getcount(params.originalurl, True)
                    if dbcmtnum >= curcmtnum:
                        return
                    pagenum = int(math.ceil(float(curcmtnum - dbcmtnum) / 3))
                    for page in range(0, pagenum, 1):
                        comments_url = AllComments.COMMENTS_URL % (threadid,
                                                                   page)
                        self.storeurl(comments_url, params.originalurl,
                                      AllComments.STEP_3)
                except:
                    Logger.printexception()

            elif params.step == AllComments.STEP_3:
                # parse the JSON response that carries the comments
                commentsinfo = json.loads(params.content)
                for comment in commentsinfo['data']:
                    pubdate = getuniformtime(comment['time'])
                    content = comment['cnt']
                    CMTStorage.storecmt(params.originalurl, content, pubdate,
                                        '')
        except:
            Logger.printexception()