Exemple #1
0
 def __init__(self, startUrl=None):
     """Load, filter and rank the proxy list.

     Args:
         startUrl: value stored on ``self.startUrl``; ``None`` selects
             'http://www.baidu.com'.

     The directory part of ``configs.proxy.srcname`` is created when it is
     missing. ``configs.proxy.rescrab`` / ``configs.proxy.retest`` trigger
     ``getFreeProxy()`` / ``testProxy()`` before the list is loaded.
     Only entries whose 'available' field is 1 or -1 and whose 'ping' is
     below 2 are kept; if none remain, a critical message is logged and the
     process exits with status 1.
     """
     self.startUrl = 'http://www.baidu.com' if startUrl is None else startUrl
     # os.makedirs('') raises FileNotFoundError, so skip directory creation
     # when srcname is a bare file name; exist_ok avoids a check-then-create
     # race.
     srcDir = os.path.dirname(configs.proxy.srcname)
     if srcDir:
         os.makedirs(srcDir, exist_ok=True)
     if configs.proxy.rescrab:
         Logger.info('rescrab proxylist...')
         self.getFreeProxy()
     if configs.proxy.retest:
         Logger.info('retest proxy list...')
         self.testProxy()
     if os.path.exists(configs.proxy.srcname):
         self.proxyList = self.loadProxy()
     else:
         # NOTE(review): presumably loadDefaultProxy() fills self.proxyList
         # itself, since nothing is assigned here — confirm.
         self.loadDefaultProxy()
     self.proxyList = [
         proxy for proxy in self.proxyList
         if abs(int(proxy['available'])) == 1 and float(proxy['ping']) < 2
     ]
     if not self.proxyList:
         Logger.critical(
             'There is no available proxy! espider is shuting down...')
         exit(1)
     # A ping of -1 passes the "< 2" filter above; rank such entries last.
     self.proxyList.sort(
         key=lambda proxy: 1000
         if float(proxy['ping']) == -1 else float(proxy['ping']))
     self.proxyCount = 0
Exemple #2
0
 def loadProxy(self):
     """Read the proxy list file into a list of dicts.

     Each line returned by ``readLinesFile(configs.proxy.srcname)`` is split
     on tabs and the fields are mapped, in order, to the keys 'type', 'ip',
     'port', 'available' and 'ping'. Values stay strings. Blank lines are
     ignored.

     Returns:
         list of dict, one per non-blank line.

     Logs a critical message and exits with status 1 when
     ``readLinesFile()`` returns ``None``.
     """
     data = readLinesFile(configs.proxy.srcname)
     if data is None:
         Logger.critical('cannot load proxy list, espider is shuting down...')
         exit(1)
     fields = ('type', 'ip', 'port', 'available', 'ping')
     # A blank line (e.g. a trailing newline) would yield a record without
     # the 'available'/'ping' keys that consumers index directly.
     return [
         dict(zip(fields, line.split('\t')))
         for line in data
         if line.strip()
     ]
Exemple #3
0
 def catalogueUrlRecursion(self, url):
     """Crawl one catalogue page and recurse into the catalogue URLs it lists.

     Args:
         url: catalogue URL; it is joined onto ``self.host`` before fetching.

     The page is fetched up to ``configs.spider.retry`` times, and
     ``getUrlList()`` must return ``(catalogueUrls, contentUrls)``.
     Content URLs not yet recorded are appended to ``self.contentDictList``
     as ``{'contentUrl': url}``. Each unseen catalogue URL is added to
     ``self.catalogueUrl`` and crawled recursively after a random pause.
     Both phases stop once the matching limit in ``configs.spider`` is
     reached; the value 'inf' disables a limit.

     Exits the process with status 1 when ``getUrlList()`` raises
     ``ValueError``, which is read as "did not return two lists".
     """
     if (configs.spider.catalogueLimit != 'inf'
             and self.catalogueCount >= configs.spider.catalogueLimit):
         return
     url = urljoin(self.host, url)
     urllistContent = []
     urllistCatalogue = []
     for attempt in range(configs.spider.retry):
         response = self.httpHandler.getResponseByUrl(url)
         if response is None:
             # A missing response aborts immediately; only parsing failures
             # are retried.
             Logger.warning(
                 'cannot get url %s. please check httphandler...' % url)
             return
         response = EsResponse(response)
         try:
             urllistCatalogue, urllistContent = self.getUrlList(response)
             break
         except ValueError:
             Logger.critical(
                 'please verify your getUrlList() return 2 lists. espider is shutting down...'
             )
             exit(1)
         except Exception:
             Logger.error(
                 'an error occured in getUrlList(). if this take place often, please check your code'
             )
             # Switch handler before the next attempt.
             self.httpHandler.nextHandler()
             if attempt == configs.spider.retry - 1:
                 # Last attempt failed: remember the URL as uncatchable.
                 self.uncatchableUrlList.append(url)
                 self.saveUncatchableUrl()
     for item in urllistContent:
         # NOTE(review): the counter advances before the duplicate check, so
         # already-known URLs count towards contentLimit as well.
         self.contentCount += 1
         if (configs.spider.contentLimit != 'inf'
                 and self.contentCount > configs.spider.contentLimit):
             break
         if not keyValueInDictList('contentUrl', item, self.contentDictList):
             Logger.debug('discover content url %s' % item)
             self.contentDictList.append({'contentUrl': item})
     for item in urllistCatalogue:
         if item in self.catalogueUrl:
             continue
         if (configs.spider.catalogueLimit != 'inf'
                 and self.catalogueCount >= configs.spider.catalogueLimit):
             return
         Logger.info('get catalogue url %s' % item)
         self.catalogueUrl.add(item)
         self.catalogueCount += 1
         time.sleep(random.random() * configs.http.sleeptime)
         self.catalogueUrlRecursion(item)
Exemple #4
0
 def __init__(self):
     """Validate ``self.startUrl``, derive ``self.host`` and build the HTTP handler.

     The start URL is normalised with 'http' as the default scheme.
     ``self.host`` becomes ``scheme://hostname``, plus ``:port`` when the URL
     names one, so a non-default port is not lost. ``checkUrlQuery()`` is
     then run and an ``HttpHandler`` is created for the host.

     A critical message is logged and the process exits with status 1 when
     ``startUrl`` is empty or has no host name.
     """
     Logger.info('Espider %s initiating...' % self.espiderName)
     if self.startUrl == '':
         Logger.critical('Your espider should have a startUrl! Espider is shutting down...')
         exit(1)
     self.startUrl = urlunparse(urlparse(self.startUrl, 'http'))
     # Parse once and reuse the result for every component.
     parsed = urlparse(self.startUrl)
     if parsed.hostname is None:
         Logger.critical('Illegal url! Please make sure url like "http://www.baidu.com". Espider will be closed...')
         exit(1)
     try:
         port = parsed.port
     except ValueError:
         # Malformed or out-of-range port: fall back to the bare host name.
         port = None
     self.host = parsed.scheme + '://' + parsed.hostname
     if port is not None:
         # hostname alone drops an explicit port such as ":8080".
         self.host += ':%d' % port
     self.checkUrlQuery()
     self.httpHandler = HttpHandler(self.host)
Exemple #5
0
 def loadProxy(self):
     """Parse the proxy source file into one dict per proxy.

     Lines come from ``readLinesFile(configs.proxy.srcname)``. Every
     non-blank line is split on tab characters and its fields are paired, in
     order, with the keys 'type', 'ip', 'port', 'available' and 'ping'.
     All values remain strings.

     Returns:
         list of dict describing the proxies found in the file.

     Logs a critical message and exits with status 1 when
     ``readLinesFile()`` returns ``None``.
     """
     data = readLinesFile(configs.proxy.srcname)
     if data is None:
         Logger.critical(
             'cannot load proxy list, espider is shuting down...')
         exit(1)
     keys = ('type', 'ip', 'port', 'available', 'ping')
     proxyList = []
     for line in data:
         if not line.strip():
             # Blank lines would produce a record lacking 'available' and
             # 'ping', which consumers index directly.
             continue
         proxyList.append(dict(zip(keys, line.split('\t'))))
     return proxyList
Exemple #6
0
 def catalogueUrlRecursion(self, url):
     """Visit a catalogue page, collect content URLs and follow new catalogue URLs.

     Args:
         url: catalogue URL, resolved against ``self.host`` before the request.

     Up to ``configs.spider.retry`` attempts are made to fetch the page and
     run ``getUrlList()``, which has to return two lists: catalogue URLs and
     content URLs. New content URLs are stored in ``self.contentDictList``
     under the key 'contentUrl'. New catalogue URLs are recorded in
     ``self.catalogueUrl`` and visited recursively, with a random sleep
     before each one. ``configs.spider.catalogueLimit`` and
     ``configs.spider.contentLimit`` cap the work unless set to 'inf'.

     Exits the process with status 1 if ``getUrlList()`` raises
     ``ValueError``, which is treated as a wrong return shape.
     """
     if (configs.spider.catalogueLimit != 'inf'
             and self.catalogueCount >= configs.spider.catalogueLimit):
         return
     url = urljoin(self.host, url)
     urllistContent = []
     urllistCatalogue = []
     for attempt in range(configs.spider.retry):
         response = self.httpHandler.getResponseByUrl(url)
         if response is None:
             # No response means no retry: give up on this URL right away.
             Logger.warning('cannot get url %s. please check httphandler...' % url)
             return
         response = EsResponse(response)
         try:
             urllistCatalogue, urllistContent = self.getUrlList(response)
             break
         except ValueError:
             Logger.critical('please verify your getUrlList() return 2 lists. espider is shutting down...')
             exit(1)
         except Exception:
             Logger.error('an error occured in getUrlList(). if this take place often, please check your code')
             # Move on to the next handler before retrying.
             self.httpHandler.nextHandler()
             if attempt == configs.spider.retry - 1:
                 # Retries exhausted: record the URL as uncatchable.
                 self.uncatchableUrlList.append(url)
                 self.saveUncatchableUrl()
     for item in urllistContent:
         # NOTE(review): contentCount grows even for URLs that turn out to be
         # duplicates, so duplicates consume the contentLimit budget too.
         self.contentCount += 1
         if (configs.spider.contentLimit != 'inf'
                 and self.contentCount > configs.spider.contentLimit):
             break
         if not keyValueInDictList('contentUrl', item, self.contentDictList):
             Logger.debug('discover content url %s' % item)
             self.contentDictList.append({'contentUrl': item})
     for item in urllistCatalogue:
         if item in self.catalogueUrl:
             continue
         if (configs.spider.catalogueLimit != 'inf'
                 and self.catalogueCount >= configs.spider.catalogueLimit):
             return
         Logger.info('get catalogue url %s' % item)
         self.catalogueUrl.add(item)
         self.catalogueCount += 1
         time.sleep(random.random() * configs.http.sleeptime)
         self.catalogueUrlRecursion(item)
Exemple #7
0
 def catalogueUrlRecursion(self, param, path, level):
     """Fetch every query of one level for ``param`` and recurse into the next level.

     Args:
         param: dict of parameters accumulated so far; it is passed to
             ``buildUrl()`` and copied, never mutated, for child calls.
         path: output directory prefix for this node (used by plain string
             concatenation, so it is expected to end with a separator).
         level: 1-based level; selects ``self.queryList[level - 1]`` and
             ``self.parameterList[level]`` for the children.

     For each query the response body is written to
     ``<path>data_query=<query>.<ext>``. Below the last level
     (``self.level``), ``contentHandler()`` supplies the next parameters,
     which are saved to ``<path>param_query=<query>.txt`` and crawled
     recursively after a random pause.

     Exits the process with status 1 when ``contentHandler()`` returns
     something other than a list of dicts.
     """
     # exist_ok avoids a check-then-create race on the output directory.
     os.makedirs(path, exist_ok=True)
     Logger.info('(level %s)start to scrab param:%s' % (level, param))
     # A single query is normalised in place to a one-element list.
     if not isinstance(self.queryList[level - 1], list):
         self.queryList[level - 1] = [self.queryList[level - 1]]
     for query in self.queryList[level - 1]:
         url = self.buildUrl(query, param)
         url, headers = self.buildExtraHeaders(url)
         response = self.httpHandler.getResponseByUrl(url, headers=headers)
         data, suffix = self.contentResponseHandle(response)
         with open(path + 'data_query=' + query + '.' + suffix,
                   'w+',
                   encoding='utf8') as f:
             f.write(data)
         if level == self.level:
             # Deepest level: nothing to recurse into, but the remaining
             # queries of this level still have to be fetched.
             continue
         try:
             nextParamList = self.contentHandler(data)
         except Exception:
             Logger.error(
                 'an error occured in contentHandler(). If this take place often, please shut espider down...'
             )
             nextParamList = None
         if nextParamList is None or nextParamList == []:
             # No children for this query; carry on with the next one.
             continue
         if not isinstance(nextParamList, list):
             Logger.critical(
                 'contentHandler() should return a list. Espider is shutting down...'
             )
             exit(1)
         # Validate every element, not only the first, before using .items().
         if not all(isinstance(entry, dict) for entry in nextParamList):
             Logger.critical(
                 'contentHandler() should return list made by dict of each element. Espider is shutting down...'
             )
             exit(1)
         writeLinesFile(path + 'param_query=' + query + '.txt',
                        nextParamList)
         for nextParam in nextParamList:
             for k, v in nextParam.items():
                 if k not in self.parameterList[level]:
                     continue
                 nextParamDict = dict(param)
                 nextParamDict[k] = v
                 # str() keeps non-string values (e.g. numbers) from
                 # breaking the path concatenation.
                 nextPath = path + k + '=' + str(v) + '/'
                 time.sleep(random.random() * configs.http.sleeptime)
                 self.catalogueUrlRecursion(nextParamDict, nextPath,
                                            level + 1)
Exemple #8
0
 def startEspider(self):
     """Start a crawl for every starting parameter listed in param.txt.

     ``param.txt`` is read from ``configs.spider.contentdatapath`` and each
     non-blank line is decoded as a JSON object. Every key found in
     ``self.parameterList[0]`` starts ``catalogueUrlRecursion()`` at level 1
     with output directory ``<contentdatapath><key>=<value>/``; any other key
     is reported as an error.

     Logs a critical message and exits with status 1 when the file cannot be
     read (``readLinesFile()`` returns ``None``).
     """
     Logger.info('starting espider...')
     paramFile = configs.spider.contentdatapath + 'param.txt'
     paramList = readLinesFile(paramFile)
     if paramList is None:
         Logger.critical('You should create starting parameters in %s' % paramFile)
         exit(1)
     for line in paramList:
         if not line.strip():
             # json.loads() raises on an empty document; skip blank lines.
             continue
         for k, v in json.loads(line).items():
             if k in self.parameterList[0]:
                 # str() lets JSON numbers/booleans be used in the path.
                 path = configs.spider.contentdatapath + k + '=' + str(v) + '/'
                 self.catalogueUrlRecursion({k: v}, path, 1)
             else:
                 Logger.error('param.txt gives an incorrect key compared to self.paramterList...')
Exemple #9
0
 def __init__(self):
     """Check the start URL, compute ``self.host`` and create the HTTP handler.

     ``self.startUrl`` is re-assembled with 'http' as the fallback scheme.
     ``self.host`` is ``scheme://hostname`` followed by ``:port`` when the
     URL carries an explicit port. After that ``checkUrlQuery()`` runs and
     ``self.httpHandler`` is built from the host.

     Logs a critical message and exits with status 1 if ``startUrl`` is the
     empty string or yields no host name.
     """
     Logger.info('Espider %s initiating...' % self.espiderName)
     if self.startUrl == '':
         Logger.critical(
             'Your espider should have a startUrl! Espider is shutting down...'
         )
         exit(1)
     self.startUrl = urlunparse(urlparse(self.startUrl, 'http'))
     # One parse result serves every component lookup below.
     parts = urlparse(self.startUrl)
     if parts.hostname is None:
         Logger.critical(
             'Illegal url! Please make sure url like "http://www.baidu.com". Espider will be closed...'
         )
         exit(1)
     try:
         port = parts.port
     except ValueError:
         # Unparseable port: keep only the host name.
         port = None
     self.host = parts.scheme + '://' + parts.hostname
     if port is not None:
         # hostname never includes the port, so add it back explicitly.
         self.host += ':%d' % port
     self.checkUrlQuery()
     self.httpHandler = HttpHandler(self.host)
Exemple #10
0
 def __init__(self):
     """Validate the spider's identity and start URL, then reset crawl state.

     ``self.startUrl`` is normalised with 'http' as the default scheme and
     ``self.host`` is set to ``scheme://hostname`` (plus ``:port`` when the
     URL names one). An ``HttpHandler`` is created for the host, the
     directory ``configs.spider.pipelinepath`` is created if missing, and the
     catalogue/content bookkeeping attributes are initialised empty.

     Logs a critical message and exits with status 1 when ``startUrl`` or
     ``espiderName`` is empty, or when the URL has no host name.
     """
     Logger.info('espider %s initiating...' % self.espiderName)
     if self.startUrl == '' or self.espiderName == '':
         Logger.critical('Your espider should have an espiderName and a startUrl! Espider is shutting down...')
         exit(1)
     self.startUrl = urlunparse(urlparse(self.startUrl, 'http'))
     # Parse once and reuse the result for every component.
     parsed = urlparse(self.startUrl)
     if parsed.hostname is None:
         Logger.critical('Illegal url! Please make sure url like "http://www.baidu.com". Espider will be closed...')
         exit(1)
     try:
         port = parsed.port
     except ValueError:
         # Malformed or out-of-range port: fall back to the bare host name.
         port = None
     self.host = parsed.scheme + '://' + parsed.hostname
     if port is not None:
         # hostname alone drops an explicit port such as ":8080".
         self.host += ':%d' % port
     self.httpHandler = HttpHandler(self.host)
     # exist_ok avoids a check-then-create race on the pipeline directory.
     os.makedirs(configs.spider.pipelinepath, exist_ok=True)
     self.catalogueUrl = set()
     self.catalogueCount = 0
     self.contentCount = 0
     self.contentDictList = []
     self.uncatchableUrlList = []
Exemple #11
0
 def startEspider(self):
     """Kick off one crawl per starting parameter found in param.txt.

     The file ``param.txt`` under ``configs.spider.contentdatapath`` holds
     one JSON object per line; blank lines are skipped. For each key that
     appears in ``self.parameterList[0]``, ``catalogueUrlRecursion()`` is
     called at level 1 with ``{key: value}`` and the output directory
     ``<contentdatapath><key>=<value>/``. Unknown keys are logged as errors.

     Logs a critical message and exits with status 1 when
     ``readLinesFile()`` returns ``None``.
     """
     Logger.info('starting espider...')
     paramFile = configs.spider.contentdatapath + 'param.txt'
     paramList = readLinesFile(paramFile)
     if paramList is None:
         Logger.critical('You should create starting parameters in %s' %
                         paramFile)
         exit(1)
     for rawLine in paramList:
         if not rawLine.strip():
             # An empty line is not valid JSON; ignore it instead of crashing.
             continue
         startParams = json.loads(rawLine)
         for k, v in startParams.items():
             if k not in self.parameterList[0]:
                 Logger.error(
                     'param.txt gives an incorrect key compared to self.paramterList...'
                 )
                 continue
             # str() keeps non-string JSON values usable in the path.
             path = configs.spider.contentdatapath + k + '=' + str(v) + '/'
             self.catalogueUrlRecursion({k: v}, path, 1)
Exemple #12
0
 def addDataItem(self, item, primaryKey):
     """Flatten ``item`` and append it to ``self.dataList``, de-duplicating by primary key.

     Args:
         item: mapping of field name to value. A list value is reduced to
             its first element, or to '' when the list is empty.
         primaryKey: name of the field identifying the item, or ``None``.

     When ``primaryKey`` is ``None`` or absent from the item, the item is
     always appended. Otherwise it is appended only if its primary value is
     not yet in ``self.primaryValue``; the value is then recorded there and
     ``self.primaryKey`` is set on first use.

     Logs a critical message and exits with status 1 when a new item uses a
     primary key different from the one already recorded.
     """
     itemtemp = OrderedDict()
     for k, v in item.items():
         if isinstance(v, list):
             itemtemp[k] = v[0] if v else ''
         else:
             itemtemp[k] = v
     if primaryKey is None or primaryKey not in itemtemp:
         # No usable key: store unconditionally.
         self.dataList.append(itemtemp)
         return
     if itemtemp[primaryKey] in self.primaryValue:
         # Duplicate primary value: drop silently.
         return
     if self.primaryKey is None:
         self.primaryKey = primaryKey
     elif self.primaryKey != primaryKey:
         Logger.critical('different primary key found in returned data. espider is shutting down...')
         exit(1)
     self.primaryValue.append(itemtemp[primaryKey])
     self.dataList.append(itemtemp)
Exemple #13
0
 def checkUrlQuery(self):
     """Validate the query configuration and store the number of levels.

     Checks, in order, that ``self.queryList`` and ``self.parameterList`` are
     non-empty lists, that ``self.extraParameter`` is an ``OrderedDict`` and
     that both lists have the same length. The first failed check logs a
     critical message and exits the process with status 1. On success
     ``self.level`` is set to ``len(self.queryList)``.
     """
     queryListOk = isinstance(self.queryList, list) and len(self.queryList) != 0
     if not queryListOk:
         Logger.critical('Please define queryList as a non-empty list! Espider is shutting down...')
         exit(1)

     parameterListOk = (isinstance(self.parameterList, list)
                        and len(self.parameterList) != 0)
     if not parameterListOk:
         Logger.critical('Please define parameterList as a non-empth list! Espider is shutting down...')
         exit(1)

     extraParameterOk = isinstance(self.extraParameter, OrderedDict)
     if not extraParameterOk:
         Logger.critical('extraParameter should be OrderedDict! Espider is shutting down')
         exit(1)

     depth = len(self.queryList)
     if depth != len(self.parameterList):
         Logger.critical('Different length of queryList and parameterList, please make sure they match each other. Espider is shutting down...')
         exit(1)
     self.level = depth
Exemple #14
0
 def catalogueUrlRecursion(self, param, path, level):
     """Download all queries of ``level`` for ``param`` and descend to the next level.

     Args:
         param: dict of parameters gathered so far; handed to ``buildUrl()``
             and copied for each child call.
         path: output directory prefix for this node (joined by string
             concatenation, so it is expected to end with a separator).
         level: 1-based level; indexes ``self.queryList[level - 1]`` and, for
             the children, ``self.parameterList[level]``.

     Each query's response body is saved as
     ``<path>data_query=<query>.<ext>``. Unless ``level`` equals
     ``self.level``, ``contentHandler()`` produces the next parameter dicts,
     which are written to ``<path>param_query=<query>.txt`` and crawled
     recursively after a random pause.

     Exits the process with status 1 when ``contentHandler()`` does not
     return a list of dicts.
     """
     # exist_ok avoids a check-then-create race on the output directory.
     os.makedirs(path, exist_ok=True)
     Logger.info('(level %s)start to scrab param:%s' % (level, param))
     # A scalar query is wrapped in place into a one-element list.
     if not isinstance(self.queryList[level - 1], list):
         self.queryList[level - 1] = [self.queryList[level - 1]]
     for query in self.queryList[level - 1]:
         url = self.buildUrl(query, param)
         url, headers = self.buildExtraHeaders(url)
         response = self.httpHandler.getResponseByUrl(url, headers=headers)
         data, fileExt = self.contentResponseHandle(response)
         with open(path + 'data_query=' + query + '.' + fileExt, 'w+', encoding='utf8') as f:
             f.write(data)
         if level == self.level:
             # Last level: no recursion, but the other queries of this level
             # must still be downloaded.
             continue
         try:
             nextParamList = self.contentHandler(data)
         except Exception:
             Logger.error('an error occured in contentHandler(). If this take place often, please shut espider down...')
             nextParamList = None
         if nextParamList is None or nextParamList == []:
             # Nothing to descend into for this query; try the next one.
             continue
         if not isinstance(nextParamList, list):
             Logger.critical('contentHandler() should return a list. Espider is shutting down...')
             exit(1)
         # Every element is iterated with .items() below, so check them all.
         if not all(isinstance(entry, dict) for entry in nextParamList):
             Logger.critical('contentHandler() should return list made by dict of each element. Espider is shutting down...')
             exit(1)
         writeLinesFile(path + 'param_query=' + query + '.txt', nextParamList)
         for nextParam in nextParamList:
             for k, v in nextParam.items():
                 if k not in self.parameterList[level]:
                     continue
                 nextParamDict = dict(param)
                 nextParamDict[k] = v
                 # str() prevents a TypeError when the value is not a string.
                 nextPath = path + k + '=' + str(v) + '/'
                 time.sleep(random.random() * configs.http.sleeptime)
                 self.catalogueUrlRecursion(nextParamDict, nextPath, level + 1)
Exemple #15
0
 def __init__(self):
     """Check name and start URL, create the HTTP handler and reset counters.

     ``self.startUrl`` is re-assembled with 'http' as the fallback scheme.
     ``self.host`` becomes ``scheme://hostname``, followed by ``:port`` when
     the URL carries an explicit port. ``self.httpHandler`` is built from the
     host, ``configs.spider.pipelinepath`` is created when absent, and the
     catalogue/content tracking attributes start out empty.

     Logs a critical message and exits with status 1 if ``startUrl`` or
     ``espiderName`` is the empty string, or if the URL has no host name.
     """
     Logger.info('espider %s initiating...' % self.espiderName)
     if self.startUrl == '' or self.espiderName == '':
         Logger.critical(
             'Your espider should have an espiderName and a startUrl! Espider is shutting down...'
         )
         exit(1)
     self.startUrl = urlunparse(urlparse(self.startUrl, 'http'))
     # One parse result serves every component lookup below.
     parts = urlparse(self.startUrl)
     if parts.hostname is None:
         Logger.critical(
             'Illegal url! Please make sure url like "http://www.baidu.com". Espider will be closed...'
         )
         exit(1)
     try:
         port = parts.port
     except ValueError:
         # Unparseable port: keep only the host name.
         port = None
     self.host = parts.scheme + '://' + parts.hostname
     if port is not None:
         # hostname never includes the port, so add it back explicitly.
         self.host += ':%d' % port
     self.httpHandler = HttpHandler(self.host)
     # exist_ok avoids a check-then-create race on the pipeline directory.
     os.makedirs(configs.spider.pipelinepath, exist_ok=True)
     self.catalogueUrl = set()
     self.catalogueCount = 0
     self.contentCount = 0
     self.contentDictList = []
     self.uncatchableUrlList = []
Exemple #16
0
 def __init__(self, startUrl=None):
     """Prepare the list of usable proxies, ordered by ping.

     Args:
         startUrl: value kept on ``self.startUrl``; when ``None`` the
             default 'http://www.baidu.com' is used.

     Creates the directory part of ``configs.proxy.srcname`` if needed,
     runs ``getFreeProxy()`` when ``configs.proxy.rescrab`` is set and
     ``testProxy()`` when ``configs.proxy.retest`` is set, then loads the
     list. Entries are kept only if 'available' is 1 or -1 and 'ping' is
     below 2. With no entry left, a critical message is logged and the
     process exits with status 1.
     """
     self.startUrl = 'http://www.baidu.com' if startUrl is None else startUrl
     # A bare file name has an empty directory part and os.makedirs('')
     # raises FileNotFoundError; exist_ok avoids a check-then-create race.
     srcDir = os.path.dirname(configs.proxy.srcname)
     if srcDir:
         os.makedirs(srcDir, exist_ok=True)
     if configs.proxy.rescrab:
         Logger.info('rescrab proxylist...')
         self.getFreeProxy()
     if configs.proxy.retest:
         Logger.info('retest proxy list...')
         self.testProxy()
     if os.path.exists(configs.proxy.srcname):
         self.proxyList = self.loadProxy()
     else:
         # NOTE(review): loadDefaultProxy() is presumably responsible for
         # setting self.proxyList, as nothing is assigned here — confirm.
         self.loadDefaultProxy()
     self.proxyList = [
         entry for entry in self.proxyList
         if abs(int(entry['available'])) == 1 and float(entry['ping']) < 2
     ]
     if not self.proxyList:
         Logger.critical('There is no available proxy! espider is shuting down...')
         exit(1)
     # Entries with ping -1 survive the "< 2" filter; sort them to the end.
     self.proxyList.sort(
         key=lambda entry: 1000
         if float(entry['ping']) == -1 else float(entry['ping']))
     self.proxyCount = 0
Exemple #17
0
 def checkUrlQuery(self):
     """Verify the query settings and remember how many levels there are.

     ``self.queryList`` and ``self.parameterList`` must each be a non-empty
     list, ``self.extraParameter`` must be an ``OrderedDict``, and the two
     lists must be equally long. A failing check logs a critical message and
     exits the process with status 1. Otherwise ``self.level`` receives the
     length of ``self.queryList``.
     """
     if not (isinstance(self.queryList, list) and len(self.queryList) > 0):
         Logger.critical(
             'Please define queryList as a non-empty list! Espider is shutting down...'
         )
         exit(1)

     if not (isinstance(self.parameterList, list)
             and len(self.parameterList) > 0):
         Logger.critical(
             'Please define parameterList as a non-empth list! Espider is shutting down...'
         )
         exit(1)

     if isinstance(self.extraParameter, OrderedDict) is False:
         Logger.critical(
             'extraParameter should be OrderedDict! Espider is shutting down'
         )
         exit(1)

     levelCount = len(self.queryList)
     if levelCount != len(self.parameterList):
         Logger.critical(
             'Different length of queryList and parameterList, please make sure they match each other. Espider is shutting down...'
         )
         exit(1)
     self.level = levelCount
Exemple #18
0
 def parseContent(self, data):
     """
         Placeholder to be overridden.

         data is a bytes object; an override should return a list in which
         every element is a dict. This default only logs a critical
         message and exits the process with status 1.
     """
     message = 'parseContent() without override! espider is shuting down...'
     Logger.critical(message)
     exit(1)
Exemple #19
0
 def getUrlList(self, response):
     """Placeholder to be overridden; logs a critical message and exits with status 1."""
     message = 'getUrlList() without override! espider is shuting down...'
     Logger.critical(message)
     exit(1)
Exemple #20
0
 def contentHandler(self, data):
     """Placeholder to be overridden; logs a critical message and exits with status 1."""
     message = 'contentHandler() without override! espider is shuting down...'
     Logger.critical(message)
     exit(1)
Exemple #21
0
 def contentHandler(self, data):
     """Default implementation: report the missing override and exit with status 1."""
     notOverridden = 'contentHandler() without override! espider is shuting down...'
     Logger.critical(notOverridden)
     exit(1)
Exemple #22
0
 def getUrlList(self, response):
     """Default implementation: report the missing override and exit with status 1."""
     notOverridden = 'getUrlList() without override! espider is shuting down...'
     Logger.critical(notOverridden)
     exit(1)