def step2(self, params):
     """"""
     q = params.customized['query']
     soup = BeautifulSoup(params.content, 'html5lib')
     divs = soup.select('.videobox')
     if not divs:
         Logger.log(params.originalurl,
                    constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     urllist = []
     for div in divs:
         title = div.select_one('.title').get_text()
         #print title
         tm = getuniformtime(div.select_one('.date').get_text())
         url = div.select_one('.title > a').get('href')
         Logger.getlogging().debug(title)
         if not compareNow(tm, self.querylastdays):
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
             continue
         if not Common.checktitle(Common.urldec(q), title):
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             continue
         urllist.append(url)
         # get the final url list
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
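All of these step2 handlers apply the same two-stage filter: keep a result only if its publish time falls within querylastdays and its title matches the query. compareNow, getuniformtime and Common.checktitle are project helpers whose implementations are not shown here, so the stdlib-only sketch below is an assumed approximation of that filter, not the real code.

import datetime

def within_last_days(pubtime_str, lastdays, fmt='%Y-%m-%d'):
    # assumes pubtime_str is already normalised to fmt, e.g. '2017-06-01'
    pubdate = datetime.datetime.strptime(pubtime_str, fmt).date()
    return (datetime.date.today() - pubdate).days <= int(lastdays)

def title_matches(query, title):
    # crude stand-in for Common.checktitle: case-insensitive substring test
    return query.lower() in title.lower()

# usage, mirroring the loop above: append url only when both checks pass
# if within_last_days(tm, querylastdays) and title_matches(q, title): urllist.append(url)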
Example 2
    def step2(self, params):
        """"""
        try:
            key = params.customized['key']
            soup = BeautifulSoup(params.content, 'html5lib')
            #print soup
            #searchListOne = soup.select('.searchListOne > ul')
            searchListOne = soup.select('.searchListOne > ul > li > div')
            if not searchListOne:
                Logger.getlogging().warning('{}:40000 No urllist'.format(
                    params.originalurl))
                return
            lis = soup.select(
                '.searchListOne > ul > li'
            )[:-1]  # drop the trailing <li id=search_msg style="display:none"></li>
            urllist = []
            for li in lis:
                url = li.select_one('h3 > a').get('href')
                #print '*********',url
                tm = li.select('.source > span')[0].get_text()
                tm = getuniformtime(tm)
                now = getuniformtime(str(time.time()))
                cmt_num = li.select('.source > span')[-1].get_text()

                title = li.select_one('h3').get_text()
                if Common.checktitle(Common.urldec(key), title):
                    if compareNow(tm, self.querylastdays):
                        urllist.append(url)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
        except:
            #traceback.print_exc()
            Logger.printexception()
            Logger.getlogging().error(
                'extract comment error from {site}'.format(site=params.url))
 def pageprocess(self, params):
     # Step3: from the returned html, xpath //*[@class="scout_anim_titletext"] gives the result titles
     # and //*[@class="scout_anim_title"]/div/a/@href gives the result urls
     #Logger.getlogging().debug(params.content)
     indexstart = params.content.find('(')
     indexstop = params.content.rfind(')')
     if indexstart > -1 and indexstop > -1:
         jsonvalue = params.content[indexstart + 1:indexstop]
         jsondata = json.loads(jsonvalue)
         info = params.customized['query']
         soup = BeautifulSoup(jsondata['content'], 'html5lib')
         uls = soup.select('.scout_anim_odd > .scout_anim_odd_ul')
         if uls:
             for ul in uls:
                 #titles = ul.select_one('.scout_anim_titletext')
                 titles = ul.select_one('.scout_anim_titletext').get_text()
                 Logger.getlogging().debug(titles)
                 # if info not in titles:
                 if not Common.checktitle(info, titles):
                     return
                 content = ul.select('.scout_anim_content > div > ul > li')
                 if content:
                     if len(content) > 3:
                         content = content[-3:]
                     urllist = [
                         'https://donghua.dmzj.com' +
                         item.find('a').get('href') for item in content
                     ]
                     self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
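pageprocess receives a JSONP response, so it slices the payload between the first '(' and the last ')' before calling json.loads and only then parses the embedded HTML. A standalone sketch of that unwrapping step; the callback name 'cb' below is purely illustrative.

import json

def unwrap_jsonp(text):
    # strip the callback(...) wrapper and parse the JSON inside
    start = text.find('(')
    stop = text.rfind(')')
    if start == -1 or stop == -1:
        raise ValueError('not a JSONP payload')
    return json.loads(text[start + 1:stop])

# unwrap_jsonp('cb({"content": "<ul></ul>"})') -> {'content': '<ul></ul>'}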
    def getsearchresult(self, params):
        info = params.customized['query']

        xpath = XPathUtility(html=params.content)
        hrefs = xpath.xpath('//li/h3/a/@href')
        titles = xpath.getlist('//li/h3/a')
        pubtimes = xpath.xpath('//li/p')

        today = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        urllist = []
        for index in range(0, len(titles), 1):
            # the title contains the query keyword
            # if titles[index].find(info) > -1:
            if Common.checktitle(info, titles[index]):
                pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
                pubtime = datetime.datetime.strptime(
                    pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
                interval = today - pubtime
                # the publish time is within the query window
                if interval.days <= self.querylastdays:
                    urllist.append(hrefs[index])
                else:
                    # results are sorted by time, so once one entry falls outside the window all later ones will too
                    break

        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example 5
    def step2(self, params):
        info = Common.urldec(params.customized['info'])
        soup = BeautifulSoup(params.content, 'html5lib')
        text_divs = soup.select('.s_r_txt')
        urllist = []

        if text_divs:
            for item in text_divs:
                title = item.select_one('h3 > a').get_text()
                url = item.select_one('h3 > a').get('href')
                curtime = item.select('p')[-1].get_text().strip()
                try:
                    if TimeUtility.compareNow(
                            TimeUtility.getuniformtime(curtime),
                            self.querylastdays):
                        if Common.checktitle(info, title):
                            urllist.append(url)
                        else:
                            Logger.log(
                                url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                    else:
                        Logger.log(url,
                                   constant.ERRORCODE_WARNNING_NOMATCHTIME)
                except:
                    urllist.append(url)
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example 6
    def getpagecomments(self, params):
        info = params.customized['query']

        xpath = XPathUtility(html=params.content)
        hrefs = xpath.xpath('//*[@class="sosResult"]/strong/a/@href')
        titles = xpath.getlist('//*[@class="sosResult"]/strong/a')
        pubtimes = xpath.xpath('//*[@class="sosResult"]/span/cite[3]')

        today = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        urllist = []
        for index in range(0, len(titles), 1):
            # the title contains the query keyword
            # if titles[index].find(info) > -1:
            if Common.checktitle(info, titles[index]):
                pubtimestr = TimeUtility.getuniformtime(
                    pubtimes[index].text).split(' ')[0]
                pubtime = datetime.datetime.strptime(
                    pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
                # pubtime = datetime.datetime.strptime(pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT)
                interval = today - pubtime
                # the publish time is within the query window
                if interval.days <= int(self.querylastdays):
                    newurl = self.preprocess(hrefs[index])
                    if newurl is not None:
                        urllist.append(newurl)

        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
 def process(self, params):
     if params.step == S2Query.STEP_1:
         html=etree.HTML(params.content)
         #try:
             #quit=html.xpath['//div[@id="results"]/text()']
             #totalpage='0'
         #except:
             #totalpage=html.xpath('//div[@class="page"]/span/text()')[0]
             #totalpage= totalpage.split("/")[-1]
             #totalpage=re.sub("\D", "",totalpage)
         results = html.xpath('//*[@id="results"]')
         if not results:
             return
         totalpage=html.xpath('//*[@id="div_3"]/*[@class="page"]/span/text()')
         if totalpage:
             totalpage = self.r.parse('(\d+)',totalpage[0].split('/')[-1])[0]
         else:
             Logger.getlogging().info("there are no results you want!")
             return
             
         urllist=[]
         if int(totalpage) >= self.maxpages:
             totalpage = self.maxpages
         if totalpage != '0':
             for pages in range(0,int(totalpage)):
                 searchurl = S2Query.S2_URL % (pages+1,params.customized['key'])
                 urllist.append(searchurl)
             self.__storeqeuryurllist__(urllist, S2Query.STEP_2, {'key': params.customized['key']})
         else:
             return
     elif params.step == S2Query.STEP_2:
         comquerkey=Common.urldec(params.customized['key']).decode('gbk').encode('utf-8')
         soup = BeautifulSoup(params.content,'html5lib')
         urllist = []
         divs = soup.find_all(attrs={'class':'result f s0'})
         if not divs:
             return
         for div in divs:
             title = div.select_one('h3.c-title').get_text()
             title = ''.join(title.strip().split())
             url_tm = div.select_one('.c-showurl').get_text()
             
             tm = getuniformtime(url_tm.split('/')[-1])
             url = 'http://'+'/'.join(url_tm.split('/')[0:-1])
             Logger.getlogging().debug(title)
             #Logger.getlogging().debug(url_tm)
             if not Common.checktitle(comquerkey, title):
                 Logger.getlogging().warning('{url}:40000 out of range, the title!'.format(url=params.originalurl))
                 continue
             if not compareNow(tm, self.querylastdays):
                 Logger.getlogging().warning('{url}:40000 out of range, the time!'.format(url=params.originalurl))
                 continue
             urllist.append(url)
         self.__storeurllist__(urllist,SPIDER_S2_WEBSITE_VIDEO)
Example 8
    def pageprocess(self, params):
        # get the page text
        xparser = XPathUtility(params.content)
        # get the hyperlinks on this page
        hreflist = xparser.xpath('//h3/a/@href')
        hrefs = []
        for mid_url in hreflist:
            mid = self.preprocess(mid_url)
            if mid is not None:
                hrefs.append(mid)

        # get all publish times on this page
        publictime = xparser.xpath('//*[@class="scontent"]/text()[1]')
        publicTimes = []
        for timeindex in publictime:
            middle = str(timeindex).replace('\n', '').replace('\t', '').strip()
            publicTimes.append(middle.split(' ')[0] + ' ' + middle.split(' ')[1])
        # get all titles on this page
        titles = []
        titles_list = xparser.getlist('//h3')
        for title in titles_list:
            mid_title = str(title).replace('\n', '').replace('\t', '').strip()
            titles.append(mid_title)
        # get the query keyword
        KEY_mid = params.customized['KEY']
        KEY = Common.urldec(KEY_mid)
        # title match pattern
        titlePattern = KEY
        # cutoff date, self.inputtime days ago
        today = datetime.datetime.now()
        before_days = today + datetime.timedelta(-self.inputtime)
        before_arr = str(before_days).split('.')
        before_time = before_arr[0]

        urllist = []
        len_hrefs = len(hrefs)
        number = 0
        for index in publicTimes[:len_hrefs]:
            # does the title match the keyword
            # mid_value = re.compile(titlePattern)
            # flg = mid_value.search(str(titles[number]))
            flg = Common.checktitle(titlePattern, str(titles[number]))
            # the video was published within the window and the title matches
            if index > before_time and flg:
                url = hrefs[number]
                urllist.append(url)
            number = number + 1

        # store the final url list
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
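This pageprocess compares publish times against before_time as plain strings, which only stays correct while both sides use the same zero-padded 'YYYY-MM-DD HH:MM:SS' layout. A small sketch of the same cutoff done on datetime objects instead; the format string is an assumption about what the page renders.

import datetime

def is_recent(pub_str, lastdays, fmt='%Y-%m-%d %H:%M:%S'):
    # parse the publish time and compare it against a datetime cutoff
    cutoff = datetime.datetime.now() - datetime.timedelta(days=int(lastdays))
    return datetime.datetime.strptime(pub_str, fmt) >= cutoff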
Example 9
 def process(self, params):
     # use the result count from the first search page to build the search-page URLs
     if params.step == kumiS2Query.KUMI_S2QUERY_FIRST_PAGE:
         # get the query parameter from the first page url
         titlePattern = params.customized['query']
         # title match pattern
         soup = BeautifulSoup(params.content, 'html5lib')
         boxs = soup.find_all(attrs={'class': re.compile('seaResultBox')})
         if boxs:
             urllist = []
             for box in boxs:
                 title = box.select_one('.seaResultA > a').get_text()
                 if not Common.checktitle(titlePattern, title):
                     continue
                 url = box.select_one('.seaResultA > a').get('href')
                 urllist.append(url)
             self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
 def step3(self, params):
     """获取新闻类的url列表"""
     key = Common.urldec(params.customized['query'])
     soup = BeautifulSoup(params.content, 'html5lib')
     lis = soup.select('.wzlist > ul > li.wztitle')
     if lis:
         urllist = []
         for li in lis:
             title = li.select_one('a').get_text()
             # if key not in title:
             if not Common.checktitle(key, title):
                 continue
             pubtime = li.select_one('span').get_text()
             url = 'http://www.52tian.net' + li.select_one('a').get('href')
             if compareNow(getuniformtime(pubtime), self.querylastdays):
                 urllist.append(url)
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
 def step2(self, params):
     """"""
     try:
         key = params.customized['key']
         xhtml = etree.HTML(params.content)
         url = xhtml.xpath('//*[@class="mod_book_cover db"]/@href')
         url = ['http://ac.qq.com' + u for u in url]
         title = xhtml.xpath('//*[@class="mod_book_cover db"]/@title')
         title_url = zip(title, url)
         urllist = []
         for t, u in title_url:
             # if Common.urldec(key) in t:
             if Common.checktitle(Common.urldec(key), t):
                 urllist.append(u)
         if len(urllist) > 0:
             self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)
     except:
         Logger.printexception()
Example 12
 def getpagecontents(self, params):
     info = params.customized['query']
     # create the BeautifulSoup object
     soup = BeautifulSoup(params.content, 'html.parser')
     # find h3 tags with class c-title
     results = soup.find_all('h3', 'c-title')
     urllist = []
     if len(results) > 0:
         for result in results:
             # get the text of the a tag; the search url already carries the sti time parameter, so time is not checked here
             title = (result.find('a').get_text()).strip()
             # if info.decode('utf8') in title.decode('utf8'):
             # if info in title:
             if Common.checktitle(info, title):
                 # get the href attribute of the a tag
                 href = result.find('a').get('href')
                 urllist.append(href)
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
 def step2(self, params):
     key = params.customized['query']
     query = Common.urldec(key)
     soup = BeautifulSoup(params.content, 'html5lib')
     tbody = soup.select('.search_topic_list > form > table > tbody')
     lis = tbody[-1].select('tr')
     urllist = []
     for li in lis:
         url = li.select_one('.p_title > a').get('href')
         title = li.select_one('.p_title > a').get_text()
         curtime = li.select('td')[3].get_text()
         if TimeUtility.compareNow(TimeUtility.getuniformtime(curtime),
                                   self.querylastdays):
             if Common.checktitle(query, title):
                 urllist.append(url)
             else:
                 Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
         else:
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
Example 14
 def step_last(self, params):
     urllist = []
     info = params.customized['query']
     soup = BeautifulSoup(params.content, 'html5lib')
     divs = soup.select('#results > .result')
     for div in divs:
         publish = div.select_one('.c-summary-1').get_text()
         title = div.select_one('h3 > a').get_text().strip()
         url = div.select_one('h3 > a').get('href').strip()
         #url = self.preprocess(href)
         if TimeUtility.compareNow(TimeUtility.getuniformtime(publish),
                                   self.querylastdays):
             if Common.checktitle(Common.trydecode(info),
                                  Common.trydecode(title)):
                 urllist.append(url)
             else:
                 Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
         else:
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
Example 15
    def getsearchresult(self, params):
        info = params.customized['query']
        soup = BeautifulSoup(params.content, 'html5lib')
        lis = soup.select('ul.ckl_cktpp > li.cfix')
        urllist = []
        if lis:
            for li in lis:
                title = li.select_one('h3').get_text()
                # if info not in title:
                if not Common.checktitle(info, title):
                    continue
                times = li.select('p')[-2].get_text()
                times = getuniformtime(times)
                url = li.select_one('h3 > a').get('href')
                if compareNow(times, self.querylastdays):
                    urllist.append(url)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)

            return len(urllist)
        else:
            return -1
Example 16
 def step2(self, params):
     key = params.customized['key']
     query = Common.urldec(key)
     soup = BeautifulSoup(params.content, 'html5lib')
     lis = soup.select('.sresult > ul > li')
     urllist = []
     for li in lis:
         url = li.select_one('.stitle > a').get('href')
         title = li.select_one('.stitle').get_text()
         curtime = li.select_one('.scontent').get_text()
         if TimeUtility.compareNow(TimeUtility.getuniformtime(curtime),
                                   self.querylastdays):
             if Common.checktitle(query, title):
                 urllist.append('http://bbs.tgbus.com/' + url)
             else:
                 Logger.log('http://bbs.tgbus.com/' + url,
                            constant.ERRORCODE_WARNNING_NOMATCHTITLE)
         else:
             Logger.log('http://bbs.tgbus.com/' + url,
                        constant.ERRORCODE_WARNNING_NOMATCHTIME)
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
    def process(self, params):
        if params.step == KanKanS2Query.S2QUERY_FIRST_PAGE:
            # Step2: from the returned content, xpath //*[@data-search-page="item"] gives the max page number (second-to-last item of the returned array)
            info = params.customized['query']
            limit = params.customized['limit']
            html = etree.HTML(params.content)
            if not html.xpath('//*[@id="video_box"]'):
                return
            nodes = html.xpath('//p[@class="list-pager-v2"]/a[last()-1]')
            # if the pager is missing, fall back to a single page
            if len(nodes) == 0:
                page_count = 1
            else:
                page_count = int(nodes[0].text)
            # build all search-result urls from page_count (latest week only)
            querylist = []
            if page_count > 0:
                for page in range(1, page_count + 1, 1):
                    url = KanKanS2Query.KANKAN_QUERY_TEMPLATE.format(
                        keyword=info, page=page, limit=limit)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist,
                                           KanKanS2Query.S2QUERY_EACH_PAGE,
                                           {'query': info})

        elif params.step == KanKanS2Query.S2QUERY_EACH_PAGE:
            info = params.customized['query']
            soup = BeautifulSoup(params.content, 'lxml')
            results = soup.find_all('p', 'title')
            urllist = []
            for result in results:
                title = result.find('a').get('title')
                # if info in title:
                if Common.checktitle(info, title):
                    # the search url already carries limit=7 (one week), so time is not checked here
                    href = result.find('a').get('href')
                    urllist.append(href)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
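The first-page branch above turns one pager value into a list of per-page search urls before handing them to __storeqeuryurllist__. A generic sketch of that expansion; the template url is an assumption, not the real KANKAN_QUERY_TEMPLATE.

# hypothetical template, standing in for KANKAN_QUERY_TEMPLATE
QUERY_TEMPLATE = 'http://example.com/search?keyword={keyword}&page={page}&limit={limit}'

def build_query_urls(keyword, page_count, limit=7):
    # one url per page, 1-based like the loop in the first-page branch
    return [QUERY_TEMPLATE.format(keyword=keyword, page=page, limit=limit)
            for page in range(1, page_count + 1)]

# build_query_urls('foo', 3) -> urls for page=1, 2, 3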
 def step2(self, params):
     """"""
     info = params.customized['query']
     info = Common.urldec(info)
     soup = BeautifulSoup(params.content, 'html5lib')
     videos = soup.select('.uiVideo > .uiVideo__item')
     if videos:
         urllist = []
         for video in videos:
             title = video.select_one('h3 > a').get('title')
             pubtime = video.select('.result__data > span')[-1].get_text()
             url = video.select_one('h3 > a').get('href')
             # if not info in title:
             if compareNow(getuniformtime(pubtime), self.querylastdays):
                 if Common.checktitle(info, title):
                     urllist.append(url)
                 else:
                     Logger.log(url,
                                constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             else:
                 Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
    def step2(self, params):
        """解析每一搜索页面"""
        info = params.customized['info']
        soup = BeautifulSoup(params.content, 'html5lib')
        divs = soup.select('.pbw')

        #divs = soup.select('h3.xs3 > a')
        if not divs:
            return
        urllist = []

        for div in divs:
            tm = div.select('p > span')[0].get_text()
            tm = TimeUtility.getuniformtime(tm)
            geturl = div.select_one('h3.xs3 > a').get('href')
            title = div.select_one('h3.xs3 > a').get_text()
            if not re.search('http://.*com.*', geturl):
                if re.search('(http://.*com).*', params.url):
                    urltemp = re.findall('(http://.*com).*', params.url)[0]
                elif re.search('(http://.*cn).*', params.url):
                    urltemp = re.findall('(http://.*cn).*', params.url)[0]
                elif re.search('(http://.*net).*', params.url):
                    urltemp = re.findall('(http://.*net).*', params.url)[0]
                geturl = urltemp + '/' + geturl
            if re.search('(http.*)&highlight', geturl):
                geturl = re.findall('(http.*)&highlight', geturl)[0]

            Logger.getlogging().info(Common.trydecode(title))
            #to compare time and match title
            if not TimeUtility.compareNow(tm, self.querylastdays):
                Logger.log(geturl, constant.ERRORCODE_WARNNING_NOMATCHTIME)
                continue
            if not Common.checktitle(Common.trydecode(info),
                                     Common.trydecode(title)):
                Logger.log(geturl, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                continue
            #print geturl
            urllist.append(geturl)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
Example 20
    def process(self, params):
        # use the result count from the first search page to build the search-page URLs
        if params.step == SinaS2Query.SINA_S2QUERY_FIRST_PAGE:
            jsdata = json.loads(params.content)
            count = int(jsdata['total'])
            querylist = []
            if count > 0:
                for page in range(1, int(math.ceil(float(count) / SinaS2Query.DEFAULT_PAGE_SIZE)) + 1, 1):
                    url = SinaS2Query.SINA_QUERY_TEMPLATE.format(
                        q=Common.urlenc(params.customized['query']),
                        ps=page,
                        pn=SinaS2Query.DEFAULT_PAGE_SIZE,
                        pf=self.querylastdays)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist, SinaS2Query.SINA_S2QUERY_EACH_PAGE, {'query': params.customized['query']})
        # get video urls from each query page
        elif params.step == SinaS2Query.SINA_S2QUERY_EACH_PAGE:
            jsdata = json.loads(params.content)
            searchlist = jsdata['list']
            urllist = []
            try:
                for search in searchlist:
                    # if params.customized['query'].decode(CHARSET_UTF8) in search['videoname']:
                    title = search['videoname']
                    if re.search('<.*>[\s\S]*?</.*>',title):
                        soup = BeautifulSoup(title,'html5lib')
                        title = soup.get_text()
                        title = ''.join(title.strip().split())
                        Logger.getlogging().debug(title)
                    #if Common.checktitle(params.customized['query'], search['videoname']):
                    if Common.checktitle(params.customized['query'], title):
                        urllist.append(search['url'])

                if len(urllist) > 0:
                    self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
            except:
                Logger.printexception()
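Some 'videoname' values in the Sina response still contain markup, so the code round-trips them through BeautifulSoup and collapses whitespace before matching. A standalone sketch of that cleanup, using the stdlib 'html.parser' backend instead of html5lib.

from bs4 import BeautifulSoup

def strip_markup(title):
    # drop any tags, then remove all whitespace, as the loop above does
    text = BeautifulSoup(title, 'html.parser').get_text()
    return ''.join(text.strip().split())

# strip_markup('<em>foo</em> bar') -> 'foobar'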
 def step2(self, params):
     """"""
     query = params.customized['query']
     soup = BeautifulSoup(params.content, 'html.parser')
     trs = soup.select('#schend')
     if not trs:
         return
     urllist = []
     for tr in trs:
         title = tr.select_one('.sb14b').get_text()
         content = etree.HTML(str(tr))
         publicTimes = content.xpath(
             '//*[@id="schend"]/table[1]/tr/td[3]/text()')[-1].strip()
         href = tr.select_one('.sb14b').get('href')
         id = re.findall('id=(\d+)&', href)[0]
         url = 'http://forum.home.news.cn/detail/' + id + '/1.html'
         if not compareNow(getuniformtime(publicTimes), self.querylastdays):
             continue
         if not Common.checktitle(Common.trydecode(query),
                                  Common.trydecode(title)):
             continue
         urllist.append(url)
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
 def step2(self, params):
     """"""
     info = params.customized['query']
     soup = BeautifulSoup(params.content, 'html5lib')
     if soup.select_one('.sk_null > .sorry'):
         Logger.getlogging().warning('{0}:40000 No urllist!'.format(
             params.url))
         return
     html = etree.HTML(params.content)
     hrefs = html.xpath('//*[@class="v-link"]/a/@href')
     titles = html.xpath('//*[@class="v-link"]/a/@title')
     urllist = []
     for index in range(0, len(titles), 1):
         if titles[index]:
             Logger.getlogging().debug(titles[index])
             # match title
             if not Common.checktitle(Common.urldec(info), titles[index]):
                 Logger.getlogging().debug(
                     'http:+{url}:40000 checktitle,out of title'.format(
                         url=hrefs[index]))
                 continue
             urllist.append('http:' + hrefs[index])
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example 23
    def pageprocess(self, params):
        # Step3: from the json returned by Step2, get
        # title: jsondata['data'][0..19]['title']
        # link: jsondata['data'][0..19]['url']
        # publish time: jsondata['data'][0..19]['modifydate'], truncated to the first 10 chars so only the date is compared

        info = params.customized['query']
        jsondata = json.loads(params.content)
        searchresult = jsondata['data']

        urllist = []
        for result in searchresult:
            title = result['title']
            url = result['url']
            pubtime = result['modifydate']
            # if not info in title:
            if compareNow(getuniformtime(pubtime), self.querylastdays):
                if Common.checktitle(info, title):
                    urllist.append(self.MAIN_DOMAIN + url)
                else:
                    Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
            else:
                Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
        self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example 24
    def processVideo(self, params):
        if params.step == MofangS2Query.S2QUERY_FIRST_PAGE:
            # Step2: from the returned json, comments['totalnums'] gives the total number of videos
            # each response carries 20 items, so divide the total by 20 for the page count and write it into the page parameter
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            try:
                jsondata = json.loads(params.content)
                comments_count = jsondata['totalnums']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return
            # nothing found, return
            if int(comments_count) == 0:
                return

            page_count = int(
                math.ceil(float(comments_count) / self.DEFAULT_PAGE_SIZE))
            # build all search-result urls from page_count (latest week only)
            querylist = []
            if page_count > 0:
                for page in range(1, page_count + 1, 1):
                    url = MofangS2Query.QUERY_TEMPLATE.format(
                        key=keyvalue,
                        pageno=page,
                        pagesize=self.DEFAULT_PAGE_SIZE)
                    Logger.getlogging().debug(url)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist,
                                           MofangS2Query.S2QUERY_EACH_PAGE,
                                           {'query': info})

        elif params.step == MofangS2Query.S2QUERY_EACH_PAGE:
            # Step3: from the json returned by Step2, get
            # title: comments['data'][0..19]['title']
            # link: comments['data'][0..19]['url']
            # publish time: comments['data'][0..19]['inputtime'], truncated to the first 10 chars so only the date is compared

            info = params.customized['query']
            try:
                jsondata = json.loads(params.content)
                searchresult = jsondata['data']
            except:
                Logger.getlogging().warning('{}:40000'.format(params.url))
                return

            # get the current date (as a datetime)
            today = datetime.datetime.strptime(TimeUtility.getcurrentdate(),
                                               TimeUtility.DATE_FORMAT_DEFAULT)

            urllist = []
            for index in range(0, len(searchresult), 1):
                #print searchresult[index]['title']
                #print searchresult[index]['inputtime']
                if searchresult[index]['title'] is not None:
                    # the title contains the query keyword, so keep the url
                    # if searchresult[index]['title'].find(info) > -1:
                    if Common.checktitle(info, searchresult[index]['title']):
                        if searchresult[index]['inputtime'] is not None:
                            #inputtime = datetime.datetime.strptime(TimeUtility.getuniformtime2(int(searchresult[index]['inputtime'])), TimeUtility.TIME_FORMAT_DEFAULT)
                            #intervaldays = today - inputtime
                            #if intervaldays.days <= int(self.querylastdays):
                            pubtime = getuniformtime(
                                str(searchresult[index]['inputtime']))

                            if compareNow(pubtime, int(self.querylastdays)):
                                urllist.append(searchresult[index]['url'])
                        else:
                            # no publish time available, assume it is within the window
                            urllist.append(searchresult[index]['url'])

            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
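Here 'inputtime' appears to be a unix timestamp that getuniformtime normalises before compareNow. A stdlib-only sketch of that check, assuming inputtime really is epoch seconds (the project helper may handle other formats as well).

import datetime

def within_window(inputtime, lastdays):
    # convert epoch seconds to a date and compare against the query window
    pubdate = datetime.date.fromtimestamp(int(inputtime))
    return (datetime.date.today() - pubdate).days <= int(lastdays)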
Example 25
 def checktitle(self, query, title):
     return Common.checktitle(query, title)
    def step3(self, params):
        """解析每一页面"""
        try:
            key = Common.urldec(params.customized['key'])
            soup = BeautifulSoup(params.content, 'html.parser')
            urllist = []
            # 1. decide whether the result is a collection or a single episode
            # 2. decide whether it is a TV series, a variety show or a movie
            #    (<span class="type">TV series / variety / movie<span>)
            # 3. check the number of updates
            # collections: TV series use class="result_episode_list cf", variety shows use class="result_link_list cf" [pure variety and episodic variety]
            items_v = soup.find_all(
                attrs={'class': re.compile('result_item_v')})
            if items_v:
                for item in items_v:
                    flag = None
                    title = item.select_one('h2.result_title > a')
                    # if key not in title.get_text():
                    if not Common.checktitle(key, title.get_text()):
                        continue
                    # TV series: flag = 'tv'; the list must be reversed (largest episode first) before trailers are removed
                    videoObj = None
                    tvObj = item.select_one('.result_episode_list')
                    if tvObj:
                        flag = 'tv'
                        videoObj = tvObj
                    # variety show: flag = 'vv' ('variety video')
                    vvObj = item.select_one('.result_link_list')
                    if vvObj:
                        flag = 'vv'
                        videoObj = vvObj
                    # handle flag='tv' and flag='vv'
                    if videoObj:
                        # the url sequence in the correct order
                        tvs = videoObj.select('div.item > a')
                        if flag == 'tv':
                            tvs = reversed(tvs)
                        tvs = [
                            tv for tv in tvs
                            if re.findall('^http[s]{0,1}://', tv.get('href'))
                        ]

                        # count the trailer ('预告') entries
                        imgmarkObjs = videoObj.select('.item > .mark_v > img')
                        altlen = 0
                        if imgmarkObjs:
                            for i in range(len(imgmarkObjs)):
                                alt = imgmarkObjs[i].get('alt')
                                if re.search('预告', alt) or re.search(
                                        u'预告', alt):
                                    altlen += 1
                        # skip the trailers
                        urlitems = tvs[altlen:altlen + 3]
                        for urlitem in urlitems:
                            #print 'uuuuuuuuuuuuuuuuuuuu:',urlitem.get('href')
                            url = urlitem.get('href')
                            if re.search('http://v.qq.com/x/page/\w+\.html',
                                         url):
                                #url = url.replace('page','cover/lvxqk7s7yynbdba')
                                url = url.replace('http://', 'https://')
                            urllist.append(url)
                        continue

                    # otherwise it may be a movie, flag = 'mv'; trailers and extras are not crawled, only the main link
                    otherObj = item.select_one('h2.result_title > a')
                    if otherObj:
                        if re.search('电影', otherObj.get_text()) or re.search(
                                u'电影', otherObj.get_text()):
                            flag = 'mv'
                        url = otherObj.get('href')
                        if re.search('http://v.qq.com/x/page/\w+\.html', url):
                            ##url = url.replace('page','cover/lvxqk7s7yynbdba')
                            url = url.replace('http://', 'https://')
                        urllist.append(url)
                        continue

            # single episodes
            items_h = soup.find_all(
                attrs={'class': re.compile('result_item_h')})
            if items_h:
                for item in items_h:
                    title = item.select_one('h2.result_title > a')
                    # if key not in title.get_text():
                    if not Common.checktitle(key, title.get_text()):
                        continue
                    url = item.select_one('h2.result_title > a').get('href')
                    if re.search('^http[s]{0,1}://', url):
                        if re.search('http://v.qq.com/x/page/\w+\.html', url):
                            url = url.replace('http://', 'https://')
                        urllist.append(url)
            # series
            items_series = soup.find_all(
                attrs={'class': re.compile('result_series')})
            if items_series:
                for item in items_series:
                    title = item.select_one('.figure_title > a')
                    # if key not in title.get_text():
                    if not Common.checktitle(key, title.get_text()):
                        continue
                    urlitems = item.select('.list_item > a')
                    for urlitem in urlitems:
                        url = urlitem.get('href')
                        if re.search('http://v.qq.com/x/page/\w+\.html', url):
                            #url = url.replace('page','cover/lvxqk7s7yynbdba')
                            url = url.replace('http://', 'https://')
                        urllist.append(url)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
        except:
            Logger.printexception()
    def process(self, params):
        if params.step == Cine107S2Query.S2QUERY_FIRST_PAGE:
            # Step2: get the total number of search results from the returned content
            info = params.customized['query']
            soup = BeautifulSoup(params.content, 'html5lib')
            # search results
            #print soup
            results = soup.select('#results')[0]
            # no results, return
            if results.get_text().find('抱歉') > -1:
                return
            else:
                resultStr = results.select('span.support-text-top')[0].get_text().strip()
                resultStr = resultStr[8:resultStr.index('个')]
                if resultStr.find(',') > -1:
                    result_counts = int(resultStr.replace(',', ''))
                else:
                    result_counts = int(resultStr)
                    Logger.getlogging().debug(result_counts)
                # only the first 75 pages [0:74] of results are viewable; cap anything larger at 75 pages
                if result_counts > 750:
                    result_counts = 750

            # compute the number of pages to loop over, page_count
            if result_counts < 10:
                page_count = 0
            else:
                page_count = int(math.ceil(float(result_counts) / Cine107S2Query.DEFAULT_PAGE_SIZE))
            # build all search-result urls from page_count
            querylist = []
            for page in range(0, page_count):
                url = Cine107S2Query.QUERY_TEMPLATE.format(key=info, pageno=page)
                querylist.append(url)
            self.__storeqeuryurllist__(querylist, Cine107S2Query.S2QUERY_EACH_PAGE, {'query': info})

        elif params.step == Cine107S2Query.S2QUERY_EACH_PAGE:
            # Step3: from the Step2 pages, collect the result urls and write them to file
            info = params.customized['query']
            soup = BeautifulSoup(params.content, 'html.parser')
            titles = soup.select('h3.c-title')
            times = soup.select('span.c-showurl')
            urllist = []
            index = 0
            for result in titles[0:]:
                title = result.get_text().strip()
                nodeUrl = result.select('a')[0].get('href')
                timeStr = times[index].get_text().strip()
               
                if timeStr.find('html') > -1:
                    timeStr = timeStr[timeStr.index('html') + 5:]
                elif timeStr.find('...') > -1:
                    timeStr = timeStr[timeStr.index('...')+4:]
                # the title contains the query keyword and the post is within the window, so keep the url
                if self.r.search(ur'(\d+-\d+-\d+)', timeStr):
                    timeStr = self.r.parse(ur'(\d+-\d+-\d+)', timeStr)[0]
                #if title.find(Common.urldec(info)) > -1 and TimeUtility.getuniformtime(timeStr, '%Y-%m-%d') > TimeUtility.getuniformdatebefore(7):
                    #urllist.append(nodeUrl)
                #Logger.getlogging().debug(title)
                #Logger.getlogging().debug(nodeUrl)  
                #Logger.getlogging().debug(Common.urldec(info))
                #Logger.getlogging().debug(title)
                if compareNow(getuniformtime(timeStr), self.querylastdays):
                    if Common.checktitle(Common.urldec(info), title):
                        urllist.append(nodeUrl) 
                index += 1
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
Example 28
    def process(self, params):
        if params.step == HuyaS2Query.S2QUERY_FIRST_PAGE:
            Logger.getlogging().debug("HuyaS2Query.S2QUERY_FIRST_PAGE")
            # Step2: from the returned html, xpath //*[@id="tab1"]/div[1]/div/span/em gives the total number of results
            # from that total, compute the page count (total / 20, rounded up), build the search-result urls and save them to file
            soup = BeautifulSoup(params.content, 'html5lib')
            if soup.select('.search-no-data-wrap'):
                return
            # nothing found, return
            totalstr = soup.select_one('.search-list > .mod-tab-hd > .act')
            if not totalstr:
                return
            # get the total result count (e.g. 160)
            totalstr = totalstr.get_text().replace(',', '')
            count = int(re.findall('\d+', totalstr)[0])

            # build all search-result urls from the count above
            info = params.customized['query']
            keyvalue = Common.urlenc(info)
            querylist = []
            pagecount = float(count) / self.DEFAULT_PAGE_SIZE
            pages = int(math.ceil(pagecount))
            if pages >= self.maxpages:
                pages = self.maxpages
            for page in range(1, pages + 1, 1):
                url = HuyaS2Query.QUERY_TEMPLATE.format(pageno=page,
                                                        key=keyvalue)
                Logger.getlogging().debug(url)
                querylist.append(url)
            self.__storeqeuryurllist__(querylist,
                                       HuyaS2Query.S2QUERY_EACH_PAGE,
                                       {'query': info})

        elif params.step == HuyaS2Query.S2QUERY_EACH_PAGE:
            info = params.customized['query']
            soup = BeautifulSoup(params.content, 'html5lib')
            if soup.select('.search-no-data-wrap'):
                return
            divs = soup.select('ul.video-list')
            if divs:
                divs = divs[-1]
                divs = divs.select('li')
            if not divs:
                return

            urllist = []
            for div in divs:
                video = div.select_one('.video-title > .video-wrap')
                timestr = div.select_one('.result-data')
                times = getuniformtime(timestr.get_text())
                titles = video.get('title')
                url = video.get('href')
                if compareNow(times, self.querylastdays) and Common.checktitle(
                        info, titles):
                    Logger.getlogging().debug(titles)
                    Logger.getlogging().debug(url)
                    urllist.append(url)
                else:
                    Logger.getlogging().debug(
                        titles + ' not match title or out of time')
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
    def process(self, params):
        urllist = []
        if params.step == ZolBbsS2Query.S2QUERY_FIRST_PAGE:
            # Step2: from the returned content, get the total number of results
            info = params.customized['info']
            keyvalue = params.customized['query']
            cdate = params.customized['cdate']

            #keyvalue = Common.urlenc(info)

            html = etree.HTML(params.content)
            totalstr = html.xpath('//span[@class="search-title"]/text()')
            totalcount = re.sub("\D", "", totalstr[1])
            #print totalcount
            # nothing found, return
            if int(totalcount) == 0:
                return

            # build all search-result urls from the count (latest week only)
            querylist = []
            if int(totalcount) > 0:
                for page in range(
                        1,
                        int(
                            math.ceil(
                                float(totalcount) / self.DEFAULT_PAGE_SIZE)) +
                        1, 1):
                    url = ZolBbsS2Query.ZOLBBS_QUERY_TEMPLATE.format(
                        kword=keyvalue, cdate=cdate, page=page)
                    querylist.append(url)
                self.__storeqeuryurllist__(querylist,
                                           ZolBbsS2Query.S2QUERY_EACH_PAGE, {
                                               'query': keyvalue,
                                               'info': info
                                           })

        elif params.step == ZolBbsS2Query.S2QUERY_EACH_PAGE:
            info = params.customized['info']
            keyvalue = params.customized['query']
            html = etree.HTML(params.content)
            soup = BeautifulSoup(params.content, 'lxml')
            results = soup.find_all('ul', 'results-list')

            for result in results:
                restr = str(result.find('a', ''))
                value = self.r.parse(
                    '<a href="(.*)" target="_blank" title="(.*)">.*</a>',
                    restr)[0]
                if value:
                    if value[1] is not None:
                        title = value[1].strip()
                        if Common.checktitle(Common.trydecode(info),
                                             Common.trydecode(title)):
                            href = value[0]
                            # href already starts with http, e.g. http://bbs.zol.com.cn/sjbbs/d34130_156664.htm
                            if self.r.parse(r'^http://.*', href):
                                urllist.append(href)
                            # href starts with /, e.g. /quanzi/d643_841594.html, so prepend http://bbs.zol.com.cn
                            elif self.r.parse(r'^/.*', href):
                                href = self.MAIN_URL + href
                                urllist.append(href)
                            else:
                                return
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
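The last branch normalises relative hrefs by prepending the site root by hand. urljoin covers the same cases (and scheme-relative links) with less code; the base url below is taken from the comments above and is only an example.

try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

def absolutize(href, base='http://bbs.zol.com.cn'):
    # join a possibly-relative href onto the site base url
    return urljoin(base, href)

# absolutize('/quanzi/d643_841594.html') -> 'http://bbs.zol.com.cn/quanzi/d643_841594.html'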