Example no. 1
    def step2(self, params):
        """"""
        try:
            key = params.customized['key']
            soup = BeautifulSoup(params.content, 'html5lib')
            #print soup
            #searchListOne = soup.select('.searchListOne > ul')
            searchListOne = soup.select('.searchListOne > ul > li > div')
            if not searchListOne:
                Logger.getlogging().warning('{}:40000 No urllist'.format(
                    params.originalurl))
                return
            lis = soup.select(
                '.searchListOne > ul > li'
            )[:-1]  # drop the last <li id=search_msg style="display:none"></li>
            urllist = []
            for li in lis:
                url = li.select_one('h3 > a').get('href')
                #print '*********',url
                tm = li.select('.source > span')[0].get_text()
                tm = getuniformtime(tm)
                now = getuniformtime(str(time.time()))
                cmt_num = li.select('.source > span')[-1].get_text()

                title = li.select_one('h3').get_text()
                if Common.checktitle(Common.urldec(key), title):
                    if compareNow(tm, self.querylastdays):
                        urllist.append(url)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
        except:
            #traceback.print_exc()
            Logger.printexception()
            Logger.getlogging().error(
                'extract comment error from {site}'.format(site=params.url))
Example no. 2
 def _create_direcotry(self):
     if self._dbPath:
         directory = os.path.abspath(self._dbPath)
         directory = os.path.split(directory)[0]
         Common.create_dir(directory)
     else:
         Log.error('empty db path')
 def step2(self, params):
     """"""
     q = params.customized['query']
     soup = BeautifulSoup(params.content, 'html5lib')
     divs = soup.select('.videobox')
     if not divs:
         Logger.log(params.originalurl,
                    constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     urllist = []
     for div in divs:
         title = div.select_one('.title').get_text()
         #print title
         tm = getuniformtime(div.select_one('.date').get_text())
         url = div.select_one('.title > a').get('href')
         Logger.getlogging().debug(title)
         if not compareNow(tm, self.querylastdays):
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
             continue
         if not Common.checktitle(Common.urldec(q), title):
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             continue
         urllist.append(url)
          # collect the final url list
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example no. 4
    def request_string(self,
                       url,
                       headers=None,
                       data=None,
                       method=None,
                       encoding=None,
                       cache=None,
                       retryTimes=None):
        if not retryTimes:
            retryTimes = 1
        if cache is None:
            cache = Common.debug()

        response = None
        for _ in range(retryTimes):
            response = self.request_data(url,
                                         headers=headers,
                                         data=data,
                                         method=method,
                                         cache=cache)
            if response is not None:
                break

        s = None
        if response is not None:
            if encoding:
                s = Common.decode_data(response, encoding=encoding)
            else:
                s = Common.decode_data(response, encoding='utf-8')
                if not s:
                    s = Common.decode_data(response, encoding='gbk')
        return s
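A minimal standalone sketch of the same retry-then-decode pattern shown above, using only the standard library; the Common helpers are project-internal, so a hypothetical fetch callable and plain bytes.decode stand in for them:

def fetch_text(fetch, url, retry_times=1, encoding=None):
    # retry the raw request a fixed number of times, as request_string does
    response = None
    for _ in range(retry_times):
        response = fetch(url)  # hypothetical callable returning bytes or None
        if response is not None:
            break
    if response is None:
        return None
    if encoding:
        return response.decode(encoding, errors='replace')
    # no encoding given: try utf-8 first, then fall back to gbk
    try:
        return response.decode('utf-8')
    except UnicodeDecodeError:
        return response.decode('gbk', errors='replace')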
Example no. 5
 def analysis(self, line, method):
     try:
         js = json.loads(line)
         param = ProcessParam()
         param.crawler_time = TimeUtility.getuniformtime(js['crawler_time'])
         param.url = Common.urldec(js['foundin'])
         param.content = js['html']
         if method == constant.REQUEST_TYPE_POST:
             param.data = js['data']
         if js['html'][:3] == constant.GZIP_CODE:
             param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
         # decode
         content = Common.urldec(param.content)
         charset = RegexUtility.getid('charset', content)
         content = Common.trydecode(content, charset)
         param.content = content
         return param
     except:
         line = line.replace('\n', '').strip()
         if not line or line[0] == '#':
             return
         Logger.getlogging().debug(line)
         param = ProcessParam()
         param.url = line
         if method == constant.REQUEST_TYPE_POST:
             js = json.loads(line)
             param.url = js['url']
             param.data = js['data']
         param.content = HttpCacher.getcontent(line, method)
         if param.content is None:
             return
         return param
Example no. 6
    def step2(self, params):
        info = Common.urldec(params.customized['info'])
        soup = BeautifulSoup(params.content, 'html5lib')
        text_divs = soup.select('.s_r_txt')
        urllist = []

        if text_divs:
            for item in text_divs:
                title = item.select_one('h3 > a').get_text()
                url = item.select_one('h3 > a').get('href')
                curtime = item.select('p')[-1].get_text().strip()
                try:
                    if TimeUtility.compareNow(
                            TimeUtility.getuniformtime(curtime),
                            self.querylastdays):
                        if Common.checktitle(info, title):
                            urllist.append(url)
                        else:
                            Logger.log(
                                url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                    else:
                        Logger.log(url,
                                   constant.ERRORCODE_WARNNING_NOMATCHTIME)
                except:
                    urllist.append(url)
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
 def get(self, url):
     saveJson = {}
     try:
         Logger.getlogging().debug('Downloading: {url}'.format(url=url))
         request = urllib2.Request(url, headers=self.headers)
         response = urllib2.urlopen(request, timeout=self.timeout)
         code = response.getcode()
         info = response.info()
         # check the returned code; return None if it is not 200
         if code == 200:
             html = response.read()
             if (("Content-Encoding" in info) and (info['Content-Encoding'] == "gzip")):
                 html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
             Logger.getlogging().debug('Request Succeeded: {url}'.format(url=url))
         else:
             Logger.getlogging().error('open {url} error, code = {code}'.format(url=url, code=code))
             Logger.getlogging().error('Request Failed: {url}'.format(url=url))
             return None
     except:
         Logger.getlogging().error('Request Failed: {url}'.format(url=url))
         Logger.printexception()
         return None
     charset = RegexUtility.getid('charset', html)
     html = Common.trydecode(html, charset)
     saveJson['foundin'] = Common.urlenc(url)
     saveJson['html'] = Common.urlenc(html.encode(constant.CHARSET_UTF8))
     saveJson['crawler_time'] = int(time.time())
     jsonStr = json.dumps(saveJson)
     return jsonStr
 def storecmt(url, content, pubdate, user):
     content = Common.strfilter(content)
     user = Common.strfilter(user)
     pubdate = TimeUtility.getuniformtime(pubdate)
     if not CMTStorage.exist(url, content, pubdate, user):
         Logger.getlogging().debug(
             'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.
             format(url=url, content=content, pubdate=pubdate, user=user))
         id = CMTStorage.getid(url, content, pubdate, user)
         data = {
             SQLDAO.SPIDER_TABLE_COMMENTS_ID:
             id,
             SQLDAO.SPIDER_TABLE_COMMENTS_URL:
             url,
             SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE:
             pubdate,
             SQLDAO.SPIDER_TABLE_COMMENTS_USER:
             user,
             SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT:
             content,
             SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE:
             SpiderConfigure.getinstance().starttime()
         }
         SQLDAO.getinstance().insert(
             SQLDAO.SPIDER_TABLE_COMMENTS,
             SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
             SQLDAO.getvaluesfromkeys(data,
                                      SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
Example no. 9
    def __init__(self, files, distdir):
        super(Package, self).__init__()
        self.files = files
        self.distdir = distdir
        Common.create_dir(distdir)

        Log.debug('packages: %s' % files)
        Log.debug('distdir: %s' % distdir)
 def query(self,info):
     Logger.getlogging().info("query")
     querykey=Common.urlenc(Common.trydecode(info).encode('gbk'))
     #querykey = Common.urlenc(info)
     query_url = [S2Query.S2_URL % ('1',querykey)]
     Logger.getlogging().debug(query_url[0])
     self.__storeqeuryurllist__(query_url, self.STEP_1, 
                                {'key':querykey})
 def getid(url):
     idformat = '{machine}_{query}_{url}_{starttime}'
     id = idformat.format(
         machine=NewsStorage.LOCALMACHINEFLAG,
         query=Common.urlenc(SpiderConfigure.getinstance().getquery()),
         url=Common.urlenc(url),
         starttime=SpiderConfigure.getinstance().starttime())
     return Common.md5(id)
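Common.md5 above presumably wraps hashlib; a minimal sketch of the same id-composition idea with the standard library only (the field values below are made up):

import hashlib

def make_id(machine, query, url, starttime):
    # compose the same '{machine}_{query}_{url}_{starttime}' key and hash it
    raw = '{machine}_{query}_{url}_{starttime}'.format(
        machine=machine, query=query, url=url, starttime=starttime)
    return hashlib.md5(raw.encode('utf-8')).hexdigest()

print(make_id('m01', 'some+query', 'http%3A%2F%2Fexample.com', 1500000000))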
Example no. 12
def backup_with_config(config_path):
    content = Common.read_file(config_path)
    arr = Common.str2json(content)
    if arr:
        for x in arr:
            backup(x.get('src'),
                   x.get('dst_dir'),
                   x.get('retemtion_days'),
                   hours_last_day=x.get('hours_last_day'),
                   ignore_hours=x.get('ignore_hours'))
Example no. 13
 def query(self, info):
     Logger.getlogging().info("query")
     #keyvalue = Common.urlenc(info)
     keyvalue = Common.urlenc(Common.trydecode(info))
     # step1: build the url below from the key
     # http://q1.fun.tv/ajax/filter_videos/?c=0&p={pageno}&word={key}
     url = FunS2Query.QUERY_TEMPLATE.format(pageno=1, key=keyvalue)
     urls = [url]
     Logger.getlogging().debug(urls[0])
     self.__storeqeuryurllist__(urls, self.S2QUERY_FIRST_PAGE,
                                {'query': info})
 def process(self, params):
     if params.step == S2Query.STEP_1:
         html=etree.HTML(params.content)
         #try:
             #quit=html.xpath['//div[@id="results"]/text()']
             #totalpage='0'
         #except:
             #totalpage=html.xpath('//div[@class="page"]/span/text()')[0]
             #totalpage= totalpage.split("/")[-1]
             #totalpage=re.sub("\D", "",totalpage)
         results = html.xpath('//*[@id="results"]')
         if not results:
             return
         totalpage=html.xpath('//*[@id="div_3"]/*[@class="page"]/span/text()')
         if totalpage:
             totalpage = self.r.parse('(\d+)',totalpage[0].split('/')[-1])[0]
         else:
             Logger.getlogging().info("there are no results you want!")
             return
             
         urllist=[]
         if int(totalpage) >= self.maxpages:
             totalpage = self.maxpages
         if totalpage != '0':
             for pages in range(0,int(totalpage)):
                 searchurl = S2Query.S2_URL % (pages+1,params.customized['key'])
                 urllist.append(searchurl)
                 self.__storeqeuryurllist__(urllist, S2Query.STEP_2,{'key':params.customized['key']})
         else:
             return
     elif params.step == S2Query.STEP_2:
         comquerkey=Common.urldec(params.customized['key']).decode('gbk').encode('utf-8')
         soup = BeautifulSoup(params.content,'html5lib')
         urllist = []
         divs = soup.find_all(attrs={'class':'result f s0'})
         if not divs:
             return
         for div in divs:
             title = div.select_one('h3.c-title').get_text()
             title = ''.join(title.strip().split())
             url_tm = div.select_one('.c-showurl').get_text()
             
             tm = getuniformtime(url_tm.split('/')[-1])
             url = 'http://'+'/'.join(url_tm.split('/')[0:-1])
             Logger.getlogging().debug(title)
             #Logger.getlogging().debug(url_tm)
             if not Common.checktitle(comquerkey, title):
                 Logger.getlogging().warning('{url}:40000 out of range, the title!'.format(url=params.originalurl))
                 continue
             if not compareNow(tm, self.querylastdays):
                 Logger.getlogging().warning('{url}:40000 out of range, the time!'.format(url=params.originalurl))
                 continue
             urllist.append(url)
         self.__storeurllist__(urllist,SPIDER_S2_WEBSITE_VIDEO)
Example no. 15
    def scale_image_file(cls, src, dst, newWidth):
        if not Common.isfile(src):
            return

        Common.remove(dst)

        img = Image.open(src)
        size = img.size
        print(size)

        newHeight = newWidth * size[1] / size[0]
        img.resize((newWidth, int(newHeight)), Image.ANTIALIAS).save(dst)
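For newer Pillow releases, where Image.ANTIALIAS has been removed in favour of Image.Resampling.LANCZOS, the same proportional resize looks roughly like this (src and dst are placeholder paths):

from PIL import Image

def scale_to_width(src, dst, new_width):
    img = Image.open(src)
    w, h = img.size
    new_height = int(new_width * h / w)  # keep the original aspect ratio
    img.resize((new_width, new_height), Image.Resampling.LANCZOS).save(dst)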
 def exist(url, content, pubdate, user):
     content = Common.strfilter(content)
     user = Common.strfilter(user)
     pubdate = TimeUtility.getuniformtime(pubdate)
     id = CMTStorage.getid(url, content, pubdate, user)
     if id in CMTStorage.__cidset:
         return True
     if SQLDAO.getinstance().exists(SQLDAO.SPIDER_TABLE_COMMENTS,
                                    {SQLDAO.SPIDER_TABLE_COMMENTS_ID: id}):
         CMTStorage.__cidset.add(id)
         return True
     return False
    def seturlinfos(params):
        id = NewsStorage.getid(params.url)
        if NewsStorage.exist(params.url):
            doc = NewsStorage.getdoc(params.url)
            data = {}
            #data[SQLDAO.SPIDER_TABLE_NEWS_TYPE] = params.type
            data[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(
                params.title)
            if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
                data[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(
                    params.body)
            if doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                       TimeUtility.getintformtime(
                           0)) == TimeUtility.getintformtime(0):
                data[
                    SQLDAO.
                    SPIDER_TABLE_NEWS_PUBLISH_DATE] = TimeUtility.getuniformtime(
                        params.pubtime)
            data[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
            data[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
            data[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
            data[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
            data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
            SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                        {SQLDAO.SPIDER_TABLE_NEWS_ID: id},
                                        data)
        else:
            data = {}
            data[SQLDAO.SPIDER_TABLE_NEWS_TYPE] = params.type
            data[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(
                params.title)
            if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
                data[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(
                    params.body)
            data[SQLDAO.
                 SPIDER_TABLE_NEWS_PUBLISH_DATE] = TimeUtility.getuniformtime(
                     params.pubtime)
            data[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
            data[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
            data[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
            data[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
            data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()

            data[SQLDAO.SPIDER_TABLE_NEWS_ID] = id
            data[SQLDAO.SPIDER_TABLE_NEWS_URL] = params.url
            data[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = params.query
            data[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = params.channel
            data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = params.createtime
            data[SQLDAO.
                 SPIDER_TABLE_NEWS_MACHINEFLAG] = NewsStorage.LOCALMACHINEFLAG
            SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                        SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                        SQLDAO.getvaluesfromkeys(data))
Example no. 18
    def pageprocess(self, params):
        # get the text
        xparser = XPathUtility(params.content)
        # get the hyperlinks on this page
        hreflist = xparser.xpath('//h3/a/@href')
        hrefs = []
        for mid_url in hreflist:
            mid = self.preprocess(mid_url)
            if mid is not None:
                hrefs.append(mid)

        # get all publish times on this page
        publictime = xparser.xpath('//*[@class="scontent"]/text()[1]')
        publicTimes = []
        for timeindex in publictime:
            middle = str(timeindex).replace('\n', '').replace('\t', '').strip()
            publicTimes.append(
                str(str(middle).split(' ')[0]) + ' ' +
                str(str(middle).split(' ')[1]))
        # get all titles on this page
        titles = []
        titles_list = xparser.getlist('//h3')
        for title in titles_list:
            mid_title = str(title).replace('\n', '').replace('\t', '').strip()
            titles.append(mid_title)
        # get the keyword
        KEY_mid = params.customized['KEY']
        KEY = Common.urldec(KEY_mid)
        # title regex pattern
        titlePatten = KEY
        # date one week ago (self.inputtime days back)
        today = datetime.datetime.now()
        before_days = today + datetime.timedelta(-self.inputtime)
        before_arr = str(before_days).split('.')
        before_time = before_arr[0]

        urllist = []
        len_hrefs = len(hrefs)
        number = 0
        for index in publicTimes[:len_hrefs]:
            # whether the title matches
            # mid_value = re.compile(titlePatten)
            # flg = mid_value.search(str(titles[number]))
            flg = Common.checktitle(titlePatten, str(titles[number]))
            # keep items published within the last week whose title matched
            if index > before_time and flg:
                url = hrefs[number]
                urllist.append(url)
            number = number + 1

        # collect the final url list
        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
Example no. 19
    def _cleanBuildInfo(self):
        for path in self.files:
            directory = Common.split_path(path)[0]

            Common.remove(Common.join_paths(directory, '__pycache__'))
            Common.remove(Common.join_paths(directory, 'build'))

            specs = [
                x for x in os.listdir(directory)
                if os.path.isfile(x) and os.path.splitext(x)[1] == '.spec'
            ]
            for x in specs:
                path = Common.join_paths(directory, x)
                Common.remove(path)
Example no. 20
def analysis(line):
    param = ProcessParam()
    js = json.loads(line)
  
    param.url = js['foundin']
    param.content = js['html']
    if js['html'][:3] == constant.GZIP_CODE:
        param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
    # decode
    content = Common.urldec(param.content)
    charset = RegexUtility.getid('charset', content)
    content = Common.trydecode(content, charset)
    param.content = content
    return param
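The 16 + zlib.MAX_WBITS argument makes zlib expect a gzip header, which is why the raw js['html'] payload can be inflated without the gzip module; a self-contained check of that behaviour (Python 3, standard library only):

import gzip
import zlib

raw = b'<html><head><meta charset="utf-8"></head></html>'
blob = gzip.compress(raw)  # stand-in for a gzip-compressed js['html'] payload
assert zlib.decompress(blob, 16 + zlib.MAX_WBITS) == raw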
Example no. 21
    def _buildPys(self):
        for path in self.files:
            f1 = os.path.splitext(
                Common.join_paths(self.distdir,
                                  Common.split_path(path)[-1]))[0]
            f2 = f1 + '.exe'
            Common.remove(f1)
            Common.remove(f2)

            cmd = 'pyinstaller %s -F --distpath %s' % (path, self.distdir)
            Common.system_cmd(cmd, directory=Common.split_path(path)[0])
Example no. 22
    def getpagecomments(self, params):
        info = params.customized['query']

        xpath = XPathUtility(html=params.content)
        hrefs = xpath.xpath('//*[@class="sosResult"]/strong/a/@href')
        titles = xpath.getlist('//*[@class="sosResult"]/strong/a')
        pubtimes = xpath.xpath('//*[@class="sosResult"]/span/cite[3]')

        today = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        urllist = []
        for index in range(0, len(titles), 1):
            # the title contains the query keyword
            # if titles[index].find(info) > -1:
            if Common.checktitle(info, titles[index]):
                pubtimestr = TimeUtility.getuniformtime(
                    pubtimes[index].text).split(' ')[0]
                pubtime = datetime.datetime.strptime(
                    pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
                # pubtime = datetime.datetime.strptime(pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT)
                interval = today - pubtime
                # the time falls within the given window
                if interval.days <= int(self.querylastdays):
                    newurl = self.preprocess(hrefs[index])
                    if newurl is not None:
                        urllist.append(newurl)

        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
Example no. 23
 def s2query(self):
     self.conf.setchannel(SPIDER_CHANNEL_S2)
     s2file = SpiderConfigure.getinstance().gets2file()
     file = FileUtility.getfilename(s2file)
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
     if FileUtility.exists(s2temppath):
         with open(s2temppath, 'r') as fp:
             querylist = []
             firstline = True
             for strquery in fp.readlines():
                 if firstline:
                     firstline = False
                     if strquery[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                         strquery = strquery[3:]
                 strquery = Common.strip(strquery)
                 if not strquery:
                     continue
                 Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                 self.conf.setquery(strquery)
                 URLStorage.updaterecycle()
                 querylist.append(strquery)
                 for site in self.factory.getall():
                     site.s2query(strquery.replace('&', ' '))
             sitelist = []
             for site in self.factory.getall():
                 if site.exists2():
                     sitelist.append(site)
             SpiderReport.loadquery(querylist)
             SpiderReport.loadsites(sitelist)
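The BOM check in s2query compares the first three bytes of the query file against codecs.BOM_UTF8; a tiny standalone illustration of that strip (standard library only):

import codecs

line = codecs.BOM_UTF8 + b'some query'
if line[:3] == codecs.BOM_UTF8:
    line = line[3:]
assert line == b'some query'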
Example no. 24
    def get_list(self, u_id, u_mobile, order_id, ctime_st, ctime_ed,
                 order_type, page, count):
        if order_type == 'all':
            orders, total = yield self.context_repos.order_repo.select_for_background_all(
                u_id, u_mobile, order_id, ctime_st, ctime_ed, page, count)
        else:
            if order_type == 'need_pay':
                state = 0
            elif order_type == 'need_send':
                state = 1
            elif order_type == 'need_receive':
                state = 2
            elif order_type == 'complete':
                state = 3
            elif order_type == 'cancel':
                state = 4
            elif order_type == 'overtime':
                state = 5
            orders, total = yield self.context_repos.order_repo.select_for_background(
                u_id, u_mobile, order_id, state, ctime_st, ctime_ed, page,
                count)

        res = {
            'orders': orders,
            'pagination': Common().pagination(total, page, count)
        }
        raise gen.Return(res)
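A note on the order_type branch above: if an unexpected value slips through, state is never assigned and the later repo call raises UnboundLocalError; a dict lookup with an explicit failure keeps that case visible (a sketch reusing the same names):

ORDER_STATES = {
    'need_pay': 0,
    'need_send': 1,
    'need_receive': 2,
    'complete': 3,
    'cancel': 4,
    'overtime': 5,
}

def order_state(order_type):
    state = ORDER_STATES.get(order_type)
    if state is None:
        raise ValueError('unknown order_type: %s' % order_type)
    return state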
 def pageprocess(self, params):
     # Step3: from the returned html, get the result titles via xpath //*[@class="scout_anim_titletext"]
     # and the result urls via //*[@class="scout_anim_title"]/div/a/@href
     #Logger.getlogging().debug(params.content)
     indexstart = params.content.find('(')
     indexstop = params.content.rfind(')')
     if indexstart > -1 and indexstop > -1:
         jsonvalue = params.content[indexstart + 1:indexstop]
         jsondata = json.loads(jsonvalue)
         info = params.customized['query']
         soup = BeautifulSoup(jsondata['content'], 'html5lib')
         uls = soup.select('.scout_anim_odd > .scout_anim_odd_ul')
         if uls:
             for ul in uls:
                 #titles = ul.select_one('.scout_anim_titletext')
                 titles = ul.select_one('.scout_anim_titletext').get_text()
                 Logger.getlogging().debug(titles)
                 # if info not in titles:
                 if not Common.checktitle(info, titles):
                     return
                 content = ul.select('.scout_anim_content > div > ul > li')
                 if content:
                     if len(content) > 3:
                         content = content[-3:]
                     urllist = [
                         'https://donghua.dmzj.com' +
                         item.find('a').get('href') for item in content
                     ]
                     self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
Example no. 26
    def query(self, info):
        Logger.getlogging().info("AngeeksS2Query.query")
        keyvalue = Common.urlenc(info)

        # step1: build the url below from the key
        if int(self.querylastdays) <= 7:
            datevalue = self.WEEKLY
        elif int(self.querylastdays) <= 30:
            datevalue = self.MONTHLY
        else:
            datevalue = None

        if datevalue is None:
            urls = [
                AngeeksS2Query.QUERY_TEMPLATE_ALL.format(key=keyvalue, page=0)
            ]
        else:
            urls = [
                AngeeksS2Query.QUERY_TEMPLATE.format(key=keyvalue,
                                                     page=0,
                                                     date=datevalue)
            ]

        Logger.getlogging().debug(urls[0])
        self.__storeqeuryurllist__(urls, self.S2QUERY_FIRST_PAGE, {
            'query': info,
            'date': datevalue
        })
Example no. 27
 def gets2url(self, params):
      # get the text
     contents = json.loads(params.content)
     query = Common.urldec(params.customized['query'])
     urllist = []
     for item in contents['video_list']:
         try:
             vid = item['vid']
             if item.get('categoryName', '') == u"体育":
                 url = 'http://sports.le.com/video/{vid}.html'.format(
                     vid=vid)
             else:
                 url = 'http://www.le.com/ptv/vplay/{vid}.html'.format(
                     vid=vid)
             curtime = item['ctime']
             #print TimeUtility.getuniformtime(curtime)
             title = item['name']
             if self.compareNow(curtime):
                 if self.checktitle(query, title):
                     #Logger.getlogging().info(title)
                     urllist.append(url)
                 else:
                     Logger.log(url,
                                constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             else:
                 Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
         except:
             Logger.printexception()
      # collect the final url list
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
 def step1(self, params):
      # single video on the search page
     info = params.customized['query']
     keyvalue = Common.trydecode(info)
     soup = BeautifulSoup(params.content, 'html5lib')
     page_numlist = soup.select('#sort > .page > a')
     if soup.select_one('.no-result'):
         Logger.log(params.originalurl,
                    constant.ERRORCODE_WARNNING_NORESULTS)
         return
     if page_numlist:
         page_num = int(page_numlist[-2].get_text())
     else:
         page_num = 1
     if page_num >= self.maxpages:
         page_num = self.maxpages
     querylist = []
     for page in range(1, page_num + 1):
         if page == 1:
             self.step2(params)
             continue
         url = S2Query.S2_URL.format(key=keyvalue, page=page)
         querylist.append(url)
     self.__storeqeuryurllist__(querylist, S2Query.STEP_2, {
         'query': info,
         'page_num': page_num
     })
Example no. 29
    def step2(self, params):
        keyword = params.customized['keyword']
        query = Common.urldec(keyword)
        jsondata = json.loads(params.content)
        # get the page count
        html = jsondata['html']
        soup = bs(html, 'html5lib')
        videoUrlList = []

        videoList = soup.select('li.video')
        for video in videoList:
            try:
                videoUrl = 'https:' + video.select_one('a').get('href')
                videoUrl = videoUrl.split('?')[0] + '/'
                title = video.select_one('a').get('title')
                pubtime = video.find(attrs={
                    'class': 'so-icon time'
                }).get_text().strip()
                if self.compareNow(TimeUtility.getuniformtime(pubtime)):
                    if self.checktitle(query, title):
                        videoUrlList.append(videoUrl)
                        self.__storeurl__(videoUrl, pubtime,
                                          SPIDER_S2_WEBSITE_VIDEO)
                    else:
                        Logger.log(videoUrl,
                                   constant.ERRORCODE_WARNNING_NOMATCHTITLE)
                else:
                    Logger.log(videoUrl,
                               constant.ERRORCODE_WARNNING_NOMATCHTIME)
            except:
                Logger.printexception()
 def query(self, info):
     q = Common.urlenc(info)
     urls = [One7173S2Query.V17173_QUERY_P.format(q=q, ps=0)]
     self.__storeqeuryurllist__(urls, self.FIRST, {
         'query': q,
         'pages_num': 0
     })