def step2(self, params):
     """"""
     q = params.customized['query']
     soup = BeautifulSoup(params.content, 'html5lib')
     divs = soup.select('.videobox')
     if not divs:
         Logger.log(params.originalurl,
                    constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     urllist = []
     for div in divs:
         title = div.select_one('.title').get_text()
         #print title
         tm = getuniformtime(div.select_one('.date').get_text())
         url = div.select_one('.title > a').get('href')
         Logger.getlogging().debug(title)
         if not compareNow(tm, self.querylastdays):
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
             continue
         if not Common.checktitle(Common.urldec(q), title):
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             continue
         urllist.append(url)
         # collect the final URL list
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
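
# The time/title helpers (getuniformtime, compareNow, Common.checktitle, Common.urldec)
# used throughout these examples are project-internal and not shown here. Below is a
# minimal, stand-alone sketch of what they are assumed to do; the _sketch suffix marks
# them as illustrative assumptions, not the project's actual implementations:
import datetime

def getuniformtime_sketch(timestr):
    # assumed: normalize a raw date string such as '2018-05-01' into 'YYYY-MM-DD HH:MM:SS'
    return datetime.datetime.strptime(timestr.strip(), '%Y-%m-%d').strftime('%Y-%m-%d %H:%M:%S')

def compareNow_sketch(uniformtime, querylastdays):
    # assumed: True when the publish time lies within the last `querylastdays` days
    pub = datetime.datetime.strptime(uniformtime, '%Y-%m-%d %H:%M:%S')
    return (datetime.datetime.now() - pub).days <= int(querylastdays)

def checktitle_sketch(key, title):
    # assumed: True when the decoded query keyword appears in the result title
    return key in title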
 def step2(self, params):
     soup = BeautifulSoup(params.content, 'html5lib')
     if soup.find(attrs={"id":re.compile('noresult_part._container')}):
         Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
         return 
     results = soup.select('.results > .vrwrap')
     if not results:
         Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
         return 
     urllist = []
     newurllist = []
     for item in results:
         try:
             if not item.select_one('h3.vrTitle > a'):
                 continue
             title = item.select_one('h3.vrTitle > a').get_text()
             href = item.select_one('h3.vrTitle > a').get('href')
             timestr = item.select_one('.news-detail > .news-info > .news-from').get_text()
             times = getuniformtime(timestr)
             Logger.getlogging().debug('title:'+ title)
             Logger.getlogging().debug('time:'+ times)
             if compareNow(times, self.querylastdays):
                 Logger.getlogging().debug('href:'+ href)
                 urllist.append(href)
             newitem = item.select_one('#news_similar')
             if newitem:
                 newhref = 'http://news.sogou.com/news'+newitem.get('href')
                 Logger.getlogging().debug('newhref:'+ newhref)
                 newurllist.append(newhref)
         except:
             Logger.printexception()
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)      
     if len(newurllist) > 0:
         self.__storeqeuryurllist__(newurllist, self.NEWS_EACH_2)
    def step2(self, params):
        """"""
        try:
            key = params.customized['key']
            soup = BeautifulSoup(params.content, 'html5lib')
            #print soup
            #searchListOne = soup.select('.searchListOne > ul')
            searchListOne = soup.select('.searchListOne > ul > li > div')
            if not searchListOne:
                Logger.getlogging().warning('{}:40000 No urllist'.format(
                    params.originalurl))
                return
            lis = soup.select(
                '.searchListOne > ul > li'
            )[:-1]  # the last item is <li id="search_msg" style="display:none"></li>, so it is filtered out
            urllist = []
            for li in lis:
                url = li.select_one('h3 > a').get('href')
                #print '*********',url
                tm = li.select('.source > span')[0].get_text()
                tm = getuniformtime(tm)
                now = getuniformtime(str(time.time()))
                cmt_num = li.select('.source > span')[-1].get_text()

                title = li.select_one('h3').get_text()
                if Common.checktitle(Common.urldec(key), title):
                    if compareNow(tm, self.querylastdays):
                        urllist.append(url)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
        except:
            #traceback.print_exc()
            Logger.printexception()
            Logger.getlogging().error(
                'extract comment error from {site}'.format(site=params.url))
 def process(self, params):
     if params.step == S2Query.STEP_1:
         html=etree.HTML(params.content)
         #try:
             #quit=html.xpath['//div[@id="results"]/text()']
             #totalpage='0'
         #except:
             #totalpage=html.xpath('//div[@class="page"]/span/text()')[0]
             #totalpage= totalpage.split("/")[-1]
             #totalpage=re.sub("\D", "",totalpage)
         results = html.xpath('//*[@id="results"]')
         if not results:
             return
         totalpage = html.xpath('//*[@id="div_3"]/*[@class="page"]/span/text()')
         if totalpage:
             totalpage = self.r.parse(r'(\d+)', totalpage[0].split('/')[-1])[0]
         else:
             Logger.getlogging().info("there are no results you want!")
             return
             
         urllist = []
         if int(totalpage) >= self.maxpages:
             totalpage = self.maxpages
         if int(totalpage) != 0:
             for pages in range(0, int(totalpage)):
                 searchurl = S2Query.S2_URL % (pages + 1, params.customized['key'])
                 urllist.append(searchurl)
             # store the page URLs once, after the loop, instead of re-storing the growing list on every iteration
             self.__storeqeuryurllist__(urllist, S2Query.STEP_2, {'key': params.customized['key']})
         else:
             return
     elif params.step == S2Query.STEP_2:
         comquerkey=Common.urldec(params.customized['key']).decode('gbk').encode('utf-8')
         soup = BeautifulSoup(params.content,'html5lib')
         urllist = []
         divs = soup.find_all(attrs={'class':'result f s0'})
         if not divs:
             return
         for div in divs:
             title = div.select_one('h3.c-title').get_text()
             title = ''.join(title.strip().split())
             url_tm = div.select_one('.c-showurl').get_text()
             
             tm = getuniformtime(url_tm.split('/')[-1])
             url = 'http://'+'/'.join(url_tm.split('/')[0:-1])
             Logger.getlogging().debug(title)
             #Logger.getlogging().debug(url_tm)
             if not Common.checktitle(comquerkey, title):
                 Logger.getlogging().warning('{url}:40000 out of range, the title!'.format(url=params.originalurl))
                 continue
             if not compareNow(tm, self.querylastdays):
                 Logger.getlogging().warning('{url}:40000 out of range, the time!'.format(url=params.originalurl))
                 continue
             urllist.append(url)
         self.__storeurllist__(urllist,SPIDER_S2_WEBSITE_VIDEO)
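
# The STEP_2 branch above splits the Baidu-style '.c-showurl' text on '/' and treats the
# last component as the date and the rest as the source URL. A minimal illustration with
# an invented showurl string (the real page text may differ):
url_tm_sample = 'v.example.com/play/2018-05-01'
tm_sample = url_tm_sample.split('/')[-1]                             # '2018-05-01'
url_sample = 'http://' + '/'.join(url_tm_sample.split('/')[0:-1])    # 'http://v.example.com/play'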
 def step3(self, params):
     """获取新闻类的url列表"""
     key = Common.urldec(params.customized['query'])
     soup = BeautifulSoup(params.content, 'html5lib')
     lis = soup.select('.wzlist > ul > li.wztitle')
     if lis:
         urllist = []
         for li in lis:
             title = li.select_one('a').get_text()
             # if key not in title:
             if not Common.checktitle(key, title):
                 continue
             pubtime = li.select_one('span').get_text()
             url = 'http://www.52tian.net' + li.select_one('a').get('href')
             if compareNow(getuniformtime(pubtime), self.querylastdays):
                 urllist.append(url)
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
 def baidutiebasearch_step3(self, params):
     content = ''
     p = r'<!--[\s\S]{0,}(<ul id="thread_list".*[\s\S]{0,})--></code><script>'
     if re.search(p, params.content):
         content = re.findall(p, params.content)[0]
     if not content:
         Logger.log(params.url, constant.ERRORCODE_WARNNING_NORESULTS)
         return
     soup = BeautifulSoup(content, 'html5lib')
     #print soup
     top_list = soup.select('#thread_top_list > li.j_thread_list')
     thread_list = soup.select('#thread_list > li.j_thread_list')
     urllist = []
     for item in top_list + thread_list:
         #print item
         try:
             pubtimeobj = item.find(attrs={
                 'class':
                 'threadlist_reply_date pull_right j_reply_data'
             })
             if not pubtimeobj:
                 pubtimeobj = item.find(
                     attrs={'class': 'pull-right is_show_create_time'})
             pubtime = pubtimeobj.get_text().strip().replace(' ', '')
             href = item.select_one('.threadlist_title > a').get('href')
             title = item.select_one('.threadlist_title > a').get('title')
             Logger.getlogging().debug(title)
             Logger.getlogging().debug(pubtime)
             pubtime = self.getuniformtime(pubtime)
             Logger.getlogging().debug(pubtime)
             if self.isyestoday(pubtime):
                 pubtime2obj = item.find(
                     attrs={'class': 'pull-right is_show_create_time'})
                 if pubtime2obj:
                     pubtime2 = self.getuniformtime(pubtime2obj.get_text())
                     if not gettimeutil.compareNow(pubtime2,
                                                   self.querylastdays):
                         continue
                 Logger.getlogging().debug('https://tieba.baidu.com' + href)
                 urllist.append('https://tieba.baidu.com' + href)
         except:
             Logger.printexception()
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
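
# Baidu Tieba appears to ship the thread list inside an HTML comment, which is why
# baidutiebasearch_step3 regex-extracts the commented block before handing it to
# BeautifulSoup. A self-contained illustration; the HTML snippet is invented:
import re

sample_html = ('<code><!-- <ul id="thread_list"><li class="j_thread_list">post</li>'
               '</ul> --></code><script>var x = 1;</script>')
pattern = r'<!--[\s\S]{0,}(<ul id="thread_list".*[\s\S]{0,})--></code><script>'
blocks = re.findall(pattern, sample_html)
# blocks[0] now holds the raw <ul id="thread_list"> markup, ready for BeautifulSoup.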
 def step2(self, params):
     """"""
     info = params.customized['query']
     info = Common.urldec(info)
     soup = BeautifulSoup(params.content, 'html5lib')
     videos = soup.select('.uiVideo > .uiVideo__item')
     if videos:
         urllist = []
         for video in videos:
             title = video.select_one('h3 > a').get('title')
             pubtime = video.select('.result__data > span')[-1].get_text()
             url = video.select_one('h3 > a').get('href')
             # if not info in title:
             if compareNow(getuniformtime(pubtime), self.querylastdays):
                 if Common.checktitle(info, title):
                     urllist.append(url)
                 else:
                     Logger.log(url,
                                constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             else:
                 Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
 def step2(self, params):
     """"""
     try:
         key = params.customized['key']
         key = Common.urldec(key)
         soup = BeautifulSoup(params.content, 'html5lib')
         books = soup.select('#searchResult > .book')
         if books:
             urllist = []
             for book in books:
                 title = book.select_one('h3 > a').get_text()
                 if key not in title:
                     continue
                 pubtime = book.select('.w_auth')[-2].get_text()
                 url = book.select_one('h3 > a').get('href')
                 if compareNow(getuniformtime(pubtime), self.querylastdays):
                     urllist.append(url)
             self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)
     except:
         Logger.printexception()
         Logger.getlogging().error(
             'extract comment error from {site}'.format(site=params.url))
    def getsearchresult(self, params):
        info = params.customized['query']
        soup = BeautifulSoup(params.content, 'html5lib')
        lis = soup.select('ul.ckl_cktpp > li.cfix')
        urllist = []
        if lis:
            for li in lis:
                title = li.select_one('h3').get_text()
                # if info not in title:
                if not Common.checktitle(info, title):
                    continue
                times = li.select('p')[-2].get_text()
                times = getuniformtime(times)
                url = li.select_one('h3 > a').get('href')
                if compareNow(times, self.querylastdays):
                    urllist.append(url)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)

            return len(urllist)
        else:
            return -1
    def pageprocess(self, params):
        # Step3: from the JSON returned by Step2, extract:
        #   title:        jsondata['data'][0..19]['title']
        #   link:         jsondata['data'][0..19]['url']
        #   publish time: jsondata['data'][0..19]['modifydate'] (truncate to the first 10 characters; only the date is compared)

        info = params.customized['query']
        jsondata = json.loads(params.content)
        searchresult = jsondata['data']

        urllist = []
        for result in searchresult:
            title = result['title']
            url = result['url']
            pubtime = result['modifydate']
            # if not info in title:
            if compareNow(getuniformtime(pubtime), self.querylastdays):
                if Common.checktitle(info, title):
                    urllist.append(self.MAIN_DOMAIN + url)
                else:
                    Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
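
# Based on the comments in pageprocess above, the search API is assumed to return a JSON
# payload shaped roughly as below; this toy snippet uses invented data and shows just the
# fields the method reads (title, url, modifydate):
import json

sample_content = json.dumps({
    'data': [
        {'title': 'matching title', 'url': '/video/1.html', 'modifydate': '2018-06-01 12:00:00'},
        {'title': 'unrelated item', 'url': '/video/2.html', 'modifydate': '2018-01-01 08:00:00'},
    ]
})
for result in json.loads(sample_content)['data']:
    print(result['title'], result['url'], result['modifydate'][:10])  # only the date part is compared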
 def step2(self, params):
     """"""
     query = params.customized['query']
     soup = BeautifulSoup(params.content, 'html.parser')
     trs = soup.select('#schend')
     if not trs:
         return
     urllist = []
     for tr in trs:
         title = tr.select_one('.sb14b').get_text()
         content = etree.HTML(str(tr))
         publicTimes = content.xpath(
             '//*[@id="schend"]/table[1]/tr/td[3]/text()')[-1].strip()
         href = tr.select_one('.sb14b').get('href')
         id = re.findall('id=(\d+)&', href)[0]
         url = 'http://forum.home.news.cn/detail/' + id + '/1.html'
         if not compareNow(getuniformtime(publicTimes), self.querylastdays):
             continue
         if not Common.checktitle(Common.trydecode(query),
                                  Common.trydecode(title)):
             continue
         urllist.append(url)
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
    def process(self, params):
        if params.step == NarutomS2Query.S2QUERY_FIRST_PAGE:
            # Step2: from the returned page, use xpath //*[@id="results"]/span to get the total number of results, build the result-page URLs from that total, and save them to file
            html = etree.HTML(params.content)
            nodes = html.xpath('//*[@id="results"]/span')
            # nothing found, return
            if len(nodes) == 0:
                return

            # get the total result count (the page shows e.g. "为您找到相关结果1,307个", i.e. about 1,307 results found)
            count = 0
            totalstr = nodes[0].text.replace(',', '')
            if self.r.search(u'\d+', totalstr):
                countstr = self.r.parse(u'(\d+)', totalstr)[0]
                count = int(countstr)
                # the site returns at most 750 search results
                if count > self.MAX_COUNT:
                    count = self.MAX_COUNT
            else:
                return

            # build all result-page URLs from the count above
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            page_count = float(count) / self.DEFAULT_PAGE_SIZE  # divide in float before math.ceil below; Python 2 integer division would otherwise truncate
            firstpage = NarutomS2Query.FIRST_PAGE.format(key=keyvalue)
            querylist = []
            querylist.append(firstpage)
            if count > 10:
                # the page parameter is 1 for the second page, 2 for the third, ...; it ranges from 1 to 74 (pages 2 through 75)
                for page in range(1, int(math.ceil(page_count)), 1):
                    url = NarutomS2Query.QUERY_TEMPLATE.format(key=keyvalue,
                                                               pageno=page)
                    querylist.append(url)

            self.__storeqeuryurllist__(querylist,
                                       NarutomS2Query.S2QUERY_EACH_PAGE,
                                       {'query': info})

        elif params.step == NarutomS2Query.S2QUERY_EACH_PAGE:
            # Step3: from the pages returned by Step2, use xpath //*[@id="results"]/div/h3/a/@href to collect the result URLs and write them to file
            info = params.customized['query']
            html = etree.HTML(params.content)
            nodes = html.xpath('//*[@id="results"]/div/h3/a/@href')
            #titles = html.xpath('//*[@id="results"]/div/h3/a')
            pubtimestr = html.xpath('//*[@class="c-showurl"]')

            datecheck = False
            if len(pubtimestr) == len(nodes):
                datecheck = True

            urllist = []
            for index in range(0, len(nodes), 1):
                # if titles[index] is not None and titles[index].find(info) > -1:
                # if titles[index] is not None and Common.checktitle(info, titles[index]):
                # keep the URL when the title contains the query keyword
                if datecheck:
                    # if the xpath returned a string containing a date, check the time window
                    if self.r.search('(\d+-\d+-\d+)', pubtimestr[index].text):
                        pubtime = getuniformtime(
                            self.r.parse('(\d+-\d+-\d+)',
                                         pubtimestr[index].text)[0])
                        if compareNow(pubtime, int(self.querylastdays)):
                            urllist.append(nodes[index])
                else:
                    urllist.append(nodes[index])
            '''
            urllist = []
            for node in nodes:
                urllist.append(node)
            '''
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
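
# The first-page branch of NarutomS2Query.process derives the page count from the total
# result count; the division has to happen in floating point before math.ceil, otherwise
# Python 2 integer division truncates first. Minimal illustration (a page size of 10 is
# an assumption, matching the `count > 10` check above):
import math

count = 1307                       # e.g. "为您找到相关结果1,307个"
DEFAULT_PAGE_SIZE = 10
page_count = int(math.ceil(float(count) / DEFAULT_PAGE_SIZE))   # 131 pages, not 130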
    def processVideo(self, params):
        if params.step == MofangS2Query.S2QUERY_FIRST_PAGE:
            # Step2: from the returned JSON, comments['totalnums'] gives the total number of videos
            # each JSON response holds 20 items, so divide the total by 20 to get the page count, then fill it into the page parameter
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            try:
                jsondate = json.loads(params.content)
                comments_count = jsondate['totalnums']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return
            # nothing found, return
            if int(comments_count) == 0:
                return

            page_count = int(
                math.ceil(float(comments_count) / self.DEFAULT_PAGE_SIZE))
            # build all result-page URLs from page_count above (most recent week)
            querylist = []
            if page_count > 0:
                for page in range(1, page_count + 1, 1):
                    url = MofangS2Query.QUERY_TEMPLATE.format(
                        key=keyvalue,
                        pageno=page,
                        pagesize=self.DEFAULT_PAGE_SIZE)
                    Logger.getlogging().debug(url)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist,
                                           MofangS2Query.S2QUERY_EACH_PAGE,
                                           {'query': info})

        elif params.step == MofangS2Query.S2QUERY_EACH_PAGE:
            # Step3: from the JSON returned by Step2, extract:
            #   title:        comments['data'][0..19]['title']
            #   link:         comments['data'][0..19]['url']
            #   publish time: comments['data'][0..19]['inputtime'] (truncate to the first 10 characters; only the date is compared)

            info = params.customized['query']
            try:
                jsondate = json.loads(params.content)
                searchresult = jsondate['data']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return

            # get today's date (as a datetime object)
            today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                               TimeUtility.DATE_FORMAT_DEFAULT)

            urllist = []
            for index in range(0, len(searchresult), 1):
                #print searchresult[index]['title']
                #print searchresult[index]['inputtime']
                if searchresult[index]['title'] is not None:
                    # keep the URL when the title contains the query keyword
                    # if searchresult[index]['title'].find(info) > -1:
                    if Common.checktitle(info, searchresult[index]['title']):
                        if searchresult[index]['inputtime'] is not None:
                            #inputtime = datetime.datetime.strptime(TimeUtility.getuniformtime2(int(searchresult[index]['inputtime'])), TimeUtility.TIME_FORMAT_DEFAULT)
                            #intervaldays = today - inputtime
                            #if intervaldays.days <= int(self.querylastdays):
                            pubtime = getuniformtime(
                                str(searchresult[index]['inputtime']))

                            if compareNow(pubtime, int(self.querylastdays)):
                                urllist.append(searchresult[index]['url'])
                        else:
                            # no publish time available; assume it falls within the window
                            urllist.append(searchresult[index]['url'])

            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
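
# In the each-page branch above, 'inputtime' is passed to getuniformtime as a string;
# judging by the commented-out getuniformtime2(int(...)) call, it is assumed to be a
# Unix timestamp. A minimal stand-alone conversion (the timestamp value is invented):
import datetime

inputtime = 1527854400
pubdate = datetime.datetime.fromtimestamp(inputtime).strftime('%Y-%m-%d')
# pubdate is the calendar date of the publish time, which is what compareNow checks.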
 def getpagecomments_step2(self, params):
     try:
         page = params.customized['page']
         soup = BeautifulSoup(params.content, "html5lib")
         d_post_content_main = soup.select('#j_p_postlist > div.j_l_post')
         if page == 1:
             main_item = d_post_content_main[0]
             #print main_item
             pubtimes = ''
             pubtimesobj = main_item.select('.tail-info')
             if pubtimesobj:
                 pubtimes = getuniformtime(
                     pubtimesobj[-1].get_text().strip())
             else:
                 pubtimeslist = re.findall('\d+-\d+-\d+ \d+:\d+',
                                           str(main_item))
                 if pubtimeslist:
                     pubtimes = getuniformtime(pubtimeslist[0])
             if pubtimes:
                 NewsStorage.setpublishdate(params.originalurl, pubtimes)
                 if not compareNow(pubtimes, self.COMMENT_LIMIT_DAYS):
                     Logger.log(params.originalurl,
                                constant.ERRORCODE_WARNNING_NOMATCHTIME)
                     # posts older than 7 days: stop fetching their replies/comments
                     return False
             d_post_content_main = d_post_content_main[1:]
         comments = []
         for item in d_post_content_main:
             try:
                 comment = item.find(
                     attrs={'id': re.compile("post_content")})
                 if not comment:
                     continue
                 content = comment.get_text().strip()
                 pubtimes = ''
                 pubtimesobj = item.select('.tail-info')
                 if pubtimesobj:
                     pubtimes = getuniformtime(
                         pubtimesobj[-1].get_text().strip())
                 else:
                     pubtimeslist = re.findall('\d+-\d+-\d+ \d+:\d+',
                                               str(item))
                     if pubtimeslist:
                         pubtimes = getuniformtime(pubtimeslist[0])
                 if not pubtimes:
                     if not CMTStorage.exist(params.originalurl, content,
                                             TimeUtility.getdatebefore(0),
                                             'nick'):
                         CMTStorage.storecmt(params.originalurl, content,
                                             TimeUtility.getdatebefore(0),
                                             'nick')
                     continue
                 # check whether the comment is from the previous day
                 Logger.getlogging().debug(pubtimes)
                 if self.isyestoday(pubtimes):
                     if not CMTStorage.exist(params.originalurl, content,
                                             pubtimes, 'nick'):
                         CMTStorage.storecmt(params.originalurl, content,
                                             pubtimes, 'nick')
             except:
                 Logger.printexception()
         return True
     except:
         Logger.printexception()
         return False
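
# getpagecomments_step2 falls back to a bare regex when the '.tail-info' node is missing;
# a stand-alone check of that fallback on an invented HTML fragment:
import re

fragment = '<div class="j_l_post">reply text 2018-05-01 12:34</div>'
pubtimeslist = re.findall(r'\d+-\d+-\d+ \d+:\d+', fragment)
# pubtimeslist == ['2018-05-01 12:34']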
    def process(self, params):
        if params.step == HuyaS2Query.S2QUERY_FIRST_PAGE:
            Logger.getlogging().debug("HuyaS2Query.S2QUERY_FIRST_PAGE")
            # Step2: from the returned HTML, xpath //*[@id="tab1"]/div[1]/div/span/em gives the total number of search results
            # from that total, compute the page count (total / 20 items, rounded up), build the result-page URLs, and save them to file
            soup = BeautifulSoup(params.content, 'html5lib')
            if soup.select('.search-no-data-wrap'):
                return
            # nothing found, return
            totalstr = soup.select_one('.search-list > .mod-tab-hd > .act')
            if not totalstr:
                return
            # get the total number of search results shown on the active tab (e.g. 160)
            totalstr = totalstr.get_text().replace(',', '')
            count = int(re.findall('\d+', totalstr)[0])

            # build all result-page URLs from the count above
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            querylist = []
            pagecount = float(count) / self.DEFAULT_PAGE_SIZE
            pages = int(math.ceil(pagecount))
            if pages >= self.maxpages:
                pages = self.maxpages
            for page in range(1, pages + 1, 1):
                url = HuyaS2Query.QUERY_TEMPLATE.format(pageno=page,
                                                        key=keyvalue)
                Logger.getlogging().debug(url)
                querylist.append(url)
            self.__storeqeuryurllist__(querylist,
                                       HuyaS2Query.S2QUERY_EACH_PAGE,
                                       {'query': info})

        elif params.step == HuyaS2Query.S2QUERY_EACH_PAGE:
            info = params.customized['query']
            soup = BeautifulSoup(params.content, 'html5lib')
            if soup.select('.search-no-data-wrap'):
                return
            divs = soup.select('ul.video-list')
            if divs:
                divs = divs[-1]
                divs = divs.select('li')
            if not divs:
                return

            urllist = []
            for div in divs:
                video = div.select_one('.video-title > .video-wrap')
                timestr = div.select_one('.result-data')
                times = getuniformtime(timestr.get_text())
                titles = video.get('title')
                url = video.get('href')
                if compareNow(times, self.querylastdays) and Common.checktitle(
                        info, titles):
                    Logger.getlogging().debug(titles)
                    Logger.getlogging().debug(url)
                    urllist.append(url)
                else:
                    Logger.getlogging().debug(
                        titles + ' not match title or out of time')
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
    def process(self, params):
        if params.step == Cine107S2Query.S2QUERY_FIRST_PAGE:
            # Step2: get the total number of search results from the returned content
            info = params.customized['query']
            soup = BeautifulSoup(params.content, 'html5lib')
            # search results
            #print soup
            results = soup.select('#results')[0]
            # no results found, return
            if results.get_text().find('抱歉') > -1:  # the page text contains "抱歉" ("sorry"), meaning no results
                return
            else:
                resultStr = results.select('span.support-text-top')[0].get_text().strip()
                resultStr = resultStr[8:resultStr.index('个')]  # slice the number out between the fixed prefix and the counter character '个'
                if resultStr.find(',') > -1:
                    result_counts = int(resultStr.replace(',', ''))
                else:
                    result_counts = int(resultStr)
                Logger.getlogging().debug(result_counts)
                # only 75 result pages [0:74] can be viewed; anything beyond that is capped at 75 pages
                if result_counts > 750:
                    result_counts = 750

            # compute the number of pages to loop over (page_count)
            if result_counts < 10:
                page_count = 0
            else:
                page_count = int(math.ceil(float(result_counts) / Cine107S2Query.DEFAULT_PAGE_SIZE))  # divide in float so ceil sees the fraction
            # build all result-page URLs from page_count above
            querylist = []
            for page in range(0, page_count):
                url = Cine107S2Query.QUERY_TEMPLATE.format(key=info, pageno=page)
                querylist.append(url)
            self.__storeqeuryurllist__(querylist, Cine107S2Query.S2QUERY_EACH_PAGE, {'query': info})

        elif params.step == Cine107S2Query.S2QUERY_EACH_PAGE:
            # Step3: collect the search-result URLs from the Step2 pages and write them to file
            info = params.customized['query']
            soup = BeautifulSoup(params.content, 'html.parser')
            titles = soup.select('h3.c-title')
            times = soup.select('span.c-showurl')
            urllist = []
            index = 0
            for result in titles[0:]:
                title = result.get_text().strip()
                nodeUrl = result.select('a')[0].get('href')
                timeStr = times[index].get_text().strip()
               
                if timeStr.find('html') > -1:
                    timeStr = timeStr[timeStr.index('html') + 5:]
                elif timeStr.find('...') > -1:
                    timeStr = timeStr[timeStr.index('...')+4:]
                # keep the URL when the title contains the query keyword and the post is from within the last week
                if self.r.search(ur'(\d+-\d+-\d+)', timeStr):
                    timeStr = self.r.parse(ur'(\d+-\d+-\d+)', timeStr)[0]
                #if title.find(Common.urldec(info)) > -1 and TimeUtility.getuniformtime(timeStr, '%Y-%m-%d') > TimeUtility.getuniformdatebefore(7):
                    #urllist.append(nodeUrl)
                #Logger.getlogging().debug(title)
                #Logger.getlogging().debug(nodeUrl)  
                #Logger.getlogging().debug(Common.urldec(info))
                #Logger.getlogging().debug(title)
                if compareNow(getuniformtime(timeStr), self.querylastdays):
                    if Common.checktitle(Common.urldec(info), title):
                        urllist.append(nodeUrl) 
                index += 1
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)