Example 1
    def contentHandler(self, url, count):
        url = urljoin(self.host, url)
        Logger.info('(%s%%) get content data from %s' % (round(100 * count / len(self.contentDictList), 2), url))
        data = None
        filetype = ''
        name = None
        # Retry until the content is fetched or the retry budget is exhausted.
        for i in range(configs.spider.retry):
            response = self.httpHandler.getResponseByUrl(url)
            if response is None:
                Logger.warning('cannot get url %s. please check httphandler...' % url)
                return ('disabled', 'disabled')
            response = EsResponse(response)
            try:
                data, filetype = self.contentResponseHandle(response)
                if data is None:
                    Logger.debug('data is None')
                    raise ValueError('contentResponseHandle() returned no data')
                name = self.contentFileName(response)
            except Exception:
                Logger.error('an error occurred in contentResponseHandle(). if this happens often, please check your code')
                self.httpHandler.nextHandler()
                if i == configs.spider.retry - 1:
                    # All retries failed: record the url so it is not retried forever.
                    self.uncatchableUrlList.append(url)
                    self.saveUncatchableUrl()
                continue
            break
        if data is None:
            return ('disabled', 'disabled')
        if name is None:
            name = '%s.%s' % (count, filetype)
        if not os.path.exists(configs.spider.contentdatapath):
            os.makedirs(configs.spider.contentdatapath)
        # The MD5 of the (parsed) content is used later to detect updates.
        if self.parser is None:
            MD5 = buildMD5String(data)
        else:
            try:
                parsedData = '%s' % self.parser.parseContent(data)
                MD5 = buildMD5String(parsedData)
            except Exception:
                Logger.error('an error occurred in parseContent()! please check your code. espider will use the whole file as the update md5')
                MD5 = buildMD5String(data)
        filepath = os.path.join(configs.spider.contentdatapath, name)
        try:
            if filetype in ('html', 'xml', 'json', 'js', 'css'):
                # Text-like content is written as utf-8 text.
                with open(filepath, 'w+', encoding='utf8') as f:
                    f.write(data)
            else:
                # Everything else (images, audio, video, ...) is written as raw bytes.
                with open(filepath, 'wb+') as f:
                    f.write(data)
        except OSError:
            Logger.error('an error occurred when opening %s' % filepath)
        return (MD5, filepath)
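
The buildMD5String helper is not shown in these examples. A minimal sketch of what it might look like, assuming it accepts either str or bytes and returns the hex digest used as the update fingerprint:

    import hashlib

    def buildMD5String(data):
        # Hypothetical reconstruction; the real helper is not shown in the example.
        # Accept both str (text content) and bytes (binary content).
        if isinstance(data, str):
            data = data.encode('utf8')
        return hashlib.md5(data).hexdigest()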
Example 2
    def catalogueUrlRecursion(self, url):
        # Stop once the configured catalogue limit has been reached.
        if configs.spider.catalogueLimit != 'inf':
            if self.catalogueCount >= configs.spider.catalogueLimit:
                return
        url = urljoin(self.host, url)
        urllistContent = []
        urllistCatalogue = []
        for i in range(configs.spider.retry):
            response = self.httpHandler.getResponseByUrl(url)
            if response is None:
                Logger.warning('cannot get url %s. please check httphandler...' % url)
                return
            response = EsResponse(response)
            try:
                urllistCatalogue, urllistContent = self.getUrlList(response)
                break
            except ValueError:
                Logger.critical('please verify that your getUrlList() returns 2 lists. espider is shutting down...')
                exit(1)
            except Exception:
                Logger.error('an error occurred in getUrlList(). if this happens often, please check your code')
                self.httpHandler.nextHandler()
                if i == configs.spider.retry - 1:
                    self.uncatchableUrlList.append(url)
                    self.saveUncatchableUrl()
        # Record newly discovered content urls, respecting the content limit.
        for item in urllistContent:
            self.contentCount += 1
            if configs.spider.contentLimit != 'inf':
                if self.contentCount > configs.spider.contentLimit:
                    break
            if not keyValueInDictList('contentUrl', item, self.contentDictList):
                Logger.debug('discover content url %s' % item)
                self.contentDictList.append({'contentUrl': item})
        # Recurse into catalogue urls that have not been visited yet.
        for item in urllistCatalogue:
            if item not in self.catalogueUrl:
                if configs.spider.catalogueLimit != 'inf':
                    if self.catalogueCount >= configs.spider.catalogueLimit:
                        return
                Logger.info('get catalogue url %s' % item)
                self.catalogueUrl.add(item)
                self.catalogueCount += 1
                # Random delay between requests so the server is not hammered.
                time.sleep(random.random() * configs.http.sleeptime)
                self.catalogueUrlRecursion(item)
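
keyValueInDictList is an external helper that is not shown here. Judging from the call site, a minimal sketch might be (the signature is an assumption):

    def keyValueInDictList(key, value, dictList):
        # Hypothetical reconstruction: True if any dict in dictList maps key to value.
        return any(d.get(key) == value for d in dictList)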
Example 3
    def getResponseByUrl(self, url, headers=None):
        """
        url is the website you want to request.
        headers is a dict of extra headers added dynamically on top of those in configs.
        """
        begin = time.perf_counter()
        if urlparse(url).hostname is None:
            Logger.error('url of request is illegal: %s' % url)
            return None
        req = urllib.request.Request(url)
        # Headers from configs; a list value means "pick one entry at random"
        # (useful for rotating User-Agent strings).
        for k, v in configs.urlrequest.items():
            if isinstance(v, list):
                v = v[randint(0, len(v) - 1)]
            req.add_header(k, v)
        for k, v in (headers or {}).items():
            req.add_header(k, v)
        flag = False
        for i in range(configs.http.retry):
            Logger.debug('attempt %s' % (i + 1))
            try:
                # Prefer the selenium handler, then the proxy, then a plain request.
                if self.selephan is not None:
                    response = self.selephan.getReqWithSel(req)
                    if response is None:
                        continue
                    flag = True
                    break
                if self.proxy is not None:
                    response = self.proxy.getReqWithProxy(req, timeout=configs.proxy.timeout)
                    if response is None:
                        continue
                    flag = True
                    break
                response = urllib.request.urlopen(req, timeout=configs.http.timeout)
                flag = True
                break
            except Exception:
                continue
        end = time.perf_counter()
        Logger.debug('HTTP request time: %ss' % (end - begin))
        if flag:
            return response
        return None
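
A minimal usage sketch, assuming a handler object whose selephan and proxy attributes may be None (how the handler is constructed is not shown in the example, so the constructor below is hypothetical):

    handler = HttpHandler()  # hypothetical constructor; only the method above is shown
    response = handler.getResponseByUrl('https://example.com/',
                                        headers={'Referer': 'https://example.com/'})
    if response is not None:
        print(response.read()[:200])  # plain urlopen responses support read()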