def startParseContent(self):
    self.fileList = getFileList(self.contentPath)
    if len(self.fileList) == 0:
        Logger.warning('there is no %s file in %s, please check' % (self.contentType, self.contentPath))
        return
    self.fileListFilter()
    Logger.info('start parsing content...')
    if len(self.fileList) == 0:
        return
    for filePath in self.fileList:
        try:
            with open(filePath, self.openMethod, encoding=self.openEncoding) as f:
                data = f.read()
            dataDict = self.parseContent(data)
        except OSError:
            Logger.error('an error occurred when opening %s' % filePath)
            continue
        except Exception:
            Logger.error('an error occurred when parsing content. If this happens very often, please check your parseContent()...')
            continue
        if not isinstance(dataDict, list):
            Logger.error('please make sure parseContent() returns a list-like object')
            continue
        if len(dataDict) == 0:
            continue
        for item in dataDict:
            if not isinstance(item, dict):
                # Skip malformed records instead of storing them.
                Logger.error('please make sure parseContent() returns a dict for each element of the list. If this happens often, please terminate the program...')
                continue
            self.addDataItem(item, self.primaryKey)
    self.saveData()
    Logger.info('parsing content done')
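# A minimal sketch of the parseContent() hook that startParseContent() expects:
# it receives the raw file data and must return a list of dicts, one per record.
# The JSON input layout and the 'id'/'title' keys below are illustrative
# assumptions, not part of Espider itself.
def parseContent(self, data):
    import json
    records = json.loads(data)
    return [{'id': r.get('id'), 'title': r.get('title')} for r in records]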
def setParser(self, parser):
    if configs.spider.mode == 'override':
        Logger.warning('Spider mode is override in configs. setParser() will be ignored. If you want to use update mode, change it in config_override')
        return
    if not isinstance(parser, BaseParser):
        Logger.error('setParser() expects a BaseParser-like object. Spider will run in override mode instead of update mode')
        return
    self.parser = parser
def contentHandler(self, url, count):
    url = urljoin(self.host, url)
    Logger.info('(%s%%) get content data from %s' % (round(100 * count / len(self.contentDictList), 2), url))
    data = None
    contentType = ''
    name = None
    for i in range(configs.spider.retry):
        response = self.httpHandler.getResponseByUrl(url)
        if response is None:
            Logger.warning('cannot get url %s. please check httphandler...' % url)
            return ('disabled', 'disabled')
        response = EsResponse(response)
        try:
            data, contentType = self.contentResponseHandle(response)
            if data is None:
                Logger.debug('data is None')
                raise Exception
            name = self.contentFileName(response)
        except Exception:
            Logger.error('an error occurred in contentResponseHandle() or contentFileName(). If this happens very often, please check your code')
            self.httpHandler.nextHandler()
            if i == configs.spider.retry - 1:
                self.uncatchableUrlList.append(url)
                self.saveUncatchableUrl()
            continue
        break
    if data is None:
        return ('disabled', 'disabled')
    if name is None:
        name = '%s.%s' % (count, contentType)
    if not os.path.exists(configs.spider.contentdatapath):
        os.makedirs(configs.spider.contentdatapath)
    if self.parser is None:
        MD5 = buildMD5String(data)
    else:
        try:
            parsedData = '%s' % self.parser.parseContent(data)
            MD5 = buildMD5String(parsedData)
        except Exception:
            Logger.error('an error occurred in parseContent()! Please check your code. Espider will use the whole file to build the update MD5')
            MD5 = buildMD5String(data)
    filepath = configs.spider.contentdatapath + name
    try:
        if contentType in ('html', 'xml', 'json', 'js', 'css'):
            # Text-like content is written as UTF-8.
            with open(filepath, 'w+', encoding='utf8') as f:
                f.write(data)
        else:
            # Media and any other content types are written as raw bytes.
            with open(filepath, 'wb+') as f:
                f.write(data)
    except OSError:
        Logger.error('an error occurred when opening %s' % filepath)
    return (MD5, filepath)
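# Sketches of the two hooks contentHandler() calls. contentResponseHandle()
# should return a (data, contentType) pair, and contentFileName() a file name
# or None (None falls back to '<count>.<type>'). The EsResponse attributes
# used here (content, url) are assumptions based on how the response is
# consumed above, not a confirmed API.
def contentResponseHandle(self, response):
    # Treat every page as HTML; a real spider would branch on the content type.
    return response.content.decode('utf8'), 'html'

def contentFileName(self, response):
    # Derive a flat file name from the last URL path segment; None uses the default.
    name = response.url.rstrip('/').split('/')[-1]
    return name if name else None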
def catalogueUrlRecursion(self, url):
    if configs.spider.catalogueLimit != 'inf':
        if self.catalogueCount >= configs.spider.catalogueLimit:
            return
    url = urljoin(self.host, url)
    urllistContent = []
    urllistCatalogue = []
    for i in range(configs.spider.retry):
        response = self.httpHandler.getResponseByUrl(url)
        if response is None:
            Logger.warning('cannot get url %s. please check httphandler...' % url)
            return
        response = EsResponse(response)
        try:
            urllistCatalogue, urllistContent = self.getUrlList(response)
            break
        except ValueError:
            Logger.critical('please make sure getUrlList() returns two lists. Espider is shutting down...')
            exit(1)
        except Exception:
            Logger.error('an error occurred in getUrlList(). If this happens often, please check your code')
            self.httpHandler.nextHandler()
            if i == configs.spider.retry - 1:
                self.uncatchableUrlList.append(url)
                self.saveUncatchableUrl()
    if len(urllistContent) != 0:
        for item in urllistContent:
            self.contentCount = self.contentCount + 1
            if configs.spider.contentLimit != 'inf':
                if self.contentCount > configs.spider.contentLimit:
                    break
            if not keyValueInDictList('contentUrl', item, self.contentDictList):
                Logger.debug('discover content url %s' % item)
                dictTemp = {}
                dictTemp['contentUrl'] = item
                self.contentDictList.append(dictTemp)
    if len(urllistCatalogue) == 0:
        return
    for item in urllistCatalogue:
        if item not in self.catalogueUrl:
            if configs.spider.catalogueLimit != 'inf':
                if self.catalogueCount >= configs.spider.catalogueLimit:
                    return
            Logger.info('get catalogue url %s' % item)
            self.catalogueUrl.add(item)
            self.catalogueCount = self.catalogueCount + 1
            time.sleep(random.random() * configs.http.sleeptime)
            self.catalogueUrlRecursion(item)
    return
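# A minimal getUrlList() sketch for catalogueUrlRecursion(): it must return two
# lists, (catalogue urls, content urls), or the spider shuts down. The
# regex-based link extraction and the '/page/' and '/article/' URL patterns are
# illustrative assumptions only.
def getUrlList(self, response):
    import re
    html = response.content.decode('utf8')
    links = re.findall(r'href="([^"]+)"', html)
    catalogue = [u for u in links if '/page/' in u]
    content = [u for u in links if '/article/' in u]
    return catalogue, content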
def __init__(self, contentType, primaryKey=None, contentPath=configs.spider.contentdatapath, openMethod='rb', openEncoding=None):
    if self.parserName == '':
        Logger.critical('You should define parserName for your parser! Espider is shutting down...')
        exit(1)
    self.contentType = contentType
    self.contentPath = contentPath
    self.openMethod = openMethod
    self.openEncoding = openEncoding
    self.dataList = []
    self.primaryValue = []
    self.primaryKey = primaryKey
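# Hypothetical usage: subclass the parser, give it a parserName, and attach it
# to a spider so update mode can diff parsed records. MyParser and 'spider' are
# illustrative names, and inheriting this __init__ from BaseParser is an
# assumption, not a confirmed part of Espider.
class MyParser(BaseParser):
    parserName = 'myParser'

    def parseContent(self, data):
        # Must return a list of dicts; 'url' doubles as the primary key here.
        return [{'url': 'http://example.com', 'raw': data}]

parser = MyParser(contentType='html', primaryKey='url')
spider.setParser(parser)  # 'spider' is an assumed, already-constructed spider
parser.startParseContent()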