def setHeaders(self, headers):
    if not isinstance(headers, dict):
        Logger.error('PhantomJsResponse setHeaders() error: headers is not a dict...')
        return
    self.headers = headers

def saveData(self):
    if configs.parse.file:
        try:
            dataList = []
            try:
                if os.path.exists(configs.parse.contentpath + configs.parse.contentfile):
                    dataList = readLinesFile(configs.parse.contentpath + configs.parse.contentfile)
                    for i in range(len(dataList)):
                        try:
                            self.addDataItem(json.loads(dataList[i]), self.primaryKey)
                        except Exception:
                            pass
            except Exception:
                pass
            dataList = []
            for item in self.dataList:
                dataList.append(json.dumps(item, ensure_ascii=False))
            writeLinesFile(configs.parse.contentpath + configs.parse.contentfile, dataList, method=configs.parse.savemethod)
        except Exception as e:
            Logger.error('an error occurred while saving data to file...', e)
    if configs.parse.mysql:
        from espider.mysql import Mysql
        keyList = []
        for k in self.dataList[0]:
            keyList.append(k)
        mySql = Mysql(configs.mysql.table, keyList, primaryKey=self.primaryKey)
        mySql.insertWithUpdate(self.dataList)

def getReqWithSel(self, request):
    if not isinstance(request, urllib.request.Request):
        Logger.error('SelePhan request error: please make sure request is a urllib.request.Request object...')
        return None
    url = request.full_url
    self.driver.get(url)
    response = PhantomJsResponse(self.driver.page_source, {'Content-Type': 'text/html'})
    return response

def setParser(self, parser):
    if configs.spider.mode == 'override':
        Logger.warning('Spider mode is override in configs. setParser() will be ignored. If you want to use update mode, change it in config_override')
        return
    if not isinstance(parser, BaseParser):
        Logger.error('setParser() should receive a BaseParser-like object. Spider will run in override instead of update mode')
        return
    self.parser = parser

def catalogueUrlRecursion(self, url):
    if configs.spider.catalogueLimit != 'inf':
        if self.catalogueCount >= configs.spider.catalogueLimit:
            return
    url = urljoin(self.host, url)
    urllistContent = []
    urllistCatalogue = []
    for i in range(configs.spider.retry):
        response = self.httpHandler.getResponseByUrl(url)
        if response is None:
            Logger.warning('cannot get url %s. please check httphandler...' % url)
            return
        response = EsResponse(response)
        try:
            urllistCatalogue, urllistContent = self.getUrlList(response)
            break
        except ValueError:
            Logger.critical('please verify that your getUrlList() returns 2 lists. espider is shutting down...')
            exit(1)
        except Exception:
            Logger.error('an error occurred in getUrlList(). if this takes place often, please check your code')
            self.httpHandler.nextHandler()
        if i == configs.spider.retry - 1:
            self.uncatchableUrlList.append(url)
            self.saveUncatchableUrl()
    if len(urllistContent) != 0:
        for item in urllistContent:
            self.contentCount = self.contentCount + 1
            if configs.spider.contentLimit != 'inf':
                if self.contentCount > configs.spider.contentLimit:
                    break
            if not keyValueInDictList('contentUrl', item, self.contentDictList):
                Logger.debug('discover content url %s' % item)
                dictTemp = {}
                dictTemp['contentUrl'] = item
                self.contentDictList.append(dictTemp)
    if len(urllistCatalogue) == 0:
        return
    for item in urllistCatalogue:
        if item not in self.catalogueUrl:
            if configs.spider.catalogueLimit != 'inf':
                if self.catalogueCount >= configs.spider.catalogueLimit:
                    return
            Logger.info('get catalogue url %s' % item)
            self.catalogueUrl.add(item)
            self.catalogueCount = self.catalogueCount + 1
            time.sleep(random.random() * configs.http.sleeptime)
            self.catalogueUrlRecursion(item)
    return

def backupUpdate(self):
    if not os.path.exists(configs.spider.contentfilename):
        return
    if not os.path.exists(configs.spider.contentbackuppath):
        os.makedirs(configs.spider.contentbackuppath)
    now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S_')
    dstfilename = os.path.join(configs.spider.contentbackuppath, now + os.path.split(configs.spider.contentupdatefilename)[1])
    try:
        shutil.copy2(configs.spider.contentupdatefilename, dstfilename)
    except IOError:
        Logger.error('Cannot copy file to update path...')

def checkKeyList(self, keyList):
    flag = True
    if len(self.keyList) != len(keyList):
        Logger.error('keyList length does not match...')
        return False
    for item in keyList:
        if item not in self.keyList:
            Logger.error('keyList element does not match')
            flag = False
            break
    return flag

def __init__(self, response):
    self.data = b''
    self.headers = []
    self.code = ''
    if response is None:
        return
    if not isinstance(response, HTTPResponse):
        Logger.error('EsResponse error: wrong type of response')
        return
    self.data = response.read()
    self.headers = response.getheaders()
    self.code = response.getcode()
    self.url = response.geturl()

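# Usage sketch (hedged): EsResponse snapshots a urllib HTTPResponse so the body
# can be reused after the underlying socket has been read. The flow below
# mirrors how the spider code wraps handler results; only EsResponse itself is
# taken from this excerpt, the rest is illustrative.
#
# raw = self.httpHandler.getResponseByUrl(url)
# if raw is not None:
#     res = EsResponse(raw)
#     html = res.data.decode('utf8', errors='replace')   # res.data is bytes
#     print(res.code, res.url, len(html))
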
def loadContentUpdateFileList(self):
    if not os.path.exists(configs.spider.contentupdatefilename):
        return []
    dataList = readLinesFile(configs.spider.contentupdatefilename)
    fileList = []
    try:
        for item in dataList:
            if item.startswith('#'):
                continue
            data = item.split('\t')
            fileList.append(data[3])
    except IndexError:
        Logger.error('Loading contentupdatefile error!')
    return fileList

def startParseContent(self):
    self.fileList = getFileList(self.contentPath)
    if len(self.fileList) == 0:
        Logger.warning('There is no %s file in %s, please have a check' % (self.contentType, self.contentPath))
        return
    self.fileListFilter()
    Logger.info('starting parsing content...')
    if len(self.fileList) == 0:
        return
    dataDict = []
    for i in range(len(self.fileList)):
        try:
            try:
                with open(self.fileList[i], self.openMethod, encoding=self.openEncoding) as f:
                    data = f.read()
                dataDict = self.parseContent(data)
            except OSError:
                Logger.error('an error occurred when opening %s' % self.fileList[i])
                continue
        except Exception:
            Logger.error('an error occurred when parsing content. If this takes place very often, please check your parseContent()...')
            continue
        if not isinstance(dataDict, list):
            Logger.error('please make sure parseContent() returns a list-like object')
            continue
        if len(dataDict) == 0:
            continue
        for item in dataDict:
            if not isinstance(item, dict):
                Logger.error('please make sure parseContent() returns dict-like objects in each element of the list. if this occurs often, please terminate the process...')
            self.addDataItem(item, self.primaryKey)
    self.saveData()
    Logger.info('parsing content done')

def catalogueUrlRecursion(self, param, path, level):
    if not os.path.exists(path):
        os.makedirs(path)
    Logger.info('(level %s)start to scrape param:%s' % (level, param))
    if not isinstance(self.queryList[level - 1], list):
        self.queryList[level - 1] = [self.queryList[level - 1]]
    for query in self.queryList[level - 1]:
        url = self.buildUrl(query, param)
        url, headers = self.buildExtraHeaders(url)
        response = self.httpHandler.getResponseByUrl(url, headers=headers)
        data, type = self.contentResponseHandle(response)
        with open(path + 'data_query=' + query + '.' + type, 'w+', encoding='utf8') as f:
            f.write(data)
        if level == self.level:
            return
        try:
            nextParamList = self.contentHandler(data)
        except Exception:
            Logger.error('an error occurred in contentHandler(). If this takes place often, please shut espider down...')
            nextParamList = None
        if nextParamList is None or nextParamList == []:
            return
        if not isinstance(nextParamList, list):
            Logger.critical('contentHandler() should return a list. Espider is shutting down...')
            exit(1)
        if not isinstance(nextParamList[0], dict):
            Logger.critical('contentHandler() should return a list whose elements are dicts. Espider is shutting down...')
            exit(1)
        writeLinesFile(path + 'param_query=' + query + '.txt', nextParamList)
        for nextParam in nextParamList:
            for k, v in nextParam.items():
                if k in self.parameterList[level]:
                    nextParamDict = dict(param)
                    nextParamDict[k] = v
                    nextPath = path + k + '=' + v + '/'
                    time.sleep(random.random() * configs.http.sleeptime)
                    self.catalogueUrlRecursion(nextParamDict, nextPath, level + 1)

def startEspider(self):
    if configs.spider.mode != 'override' and configs.spider.mode != 'update':
        Logger.error('Please verify spider.mode is override or update in configs. Spider will run in default mode (override)')
    if configs.spider.mode == 'update' and self.parser is None:
        Logger.error('Spider cannot run in update mode without a parser set by setParser().')
    Logger.info('Espider running in %s mode' % ('override' if self.parser is None else 'update'))
    if self.parser is not None:
        # update mode
        self.backupUpdate()
        self.contentDictList = self.loadContentDictList()
    Logger.info('start to get catalogue urls...')
    if configs.spider.loadurllistfromfile:
        self.loadCatalogueList()
        self.contentDictList = self.loadContentDictList()
    else:
        self.catalogueUrlRecursion(self.startUrl)
        writeLinesFile(configs.spider.cataloguefilename, self.catalogueUrl, method='w+')
    count = 0
    for item in self.contentDictList:
        count = count + 1
        MD5, filepath = self.contentHandler(item['contentUrl'], count)
        item['filepath'] = filepath
        if 'MD5' in item:
            if self.parser is None:
                item['update'] = 'disabled'
            elif item['MD5'] == MD5:
                item['update'] = 'false'
            else:
                item['update'] = 'true'
            item['MD5'] = MD5
        else:
            if self.parser is None:
                item['update'] = 'disabled'
            else:
                item['update'] = 'true'
            item['MD5'] = MD5
        self.saveContentUrlDictList()
        self.saveContentUrlUpdate()
    Logger.info('espider completed the task!')

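# Update-mode flow sketch (hedged): startEspider() writes one record per content
# url with an MD5 of the parsed content, and the 'update' flag becomes 'true'
# only when that MD5 changes between runs, so attaching a parser via setParser()
# is what switches a run from override to update behaviour. 'MySpider' and
# 'MyParser' are hypothetical subclasses, not names from this excerpt.
#
# spider = MySpider()           # subclass implementing getUrlList() etc.
# spider.setParser(MyParser())  # MyParser subclasses BaseParser
# spider.startEspider()
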
def readLinesFile(filename, method='r'):
    """
    Read from a file and extract each line to the element of a list.
    """
    dataInLine = []
    try:
        with open(filename, method, encoding='utf8') as f:
            data = f.readlines()
            for i in range(len(data)):
                data[i] = data[i].strip()
            dataInLine = data
    except OSError:
        Logger.error('an error occurred when opening %s' % filename)
        return None
    return dataInLine

def getResponseByUrl(self, url, headers={}):
    """
    url is the website you want.
    headers is the dict you add dynamically apart from that in configs.
    """
    begin = time.perf_counter()
    if urlparse(url).hostname is None:
        Logger.error('url of request illegal! which is %s' % url)
        return None
    req = urllib.request.Request(url)
    for k, v in configs.urlrequest.items():
        if isinstance(v, list):
            v = v[randint(0, len(v) - 1)]
        req.add_header(k, v)
    for k, v in headers.items():
        req.add_header(k, v)
    flag = False
    for i in range(configs.http.retry):
        Logger.debug('%s attempt' % (i + 1))
        try:
            if self.selephan is not None:
                response = self.selephan.getReqWithSel(req)
                if response is None:
                    continue
                flag = True
                break
            if self.proxy is not None:
                response = self.proxy.getReqWithProxy(req, timeout=configs.proxy.timeout)
                if response is None:
                    continue
                flag = True
                break
            response = urllib.request.urlopen(req, timeout=configs.http.timeout)
            flag = True
            break
        except Exception:
            continue
    end = time.perf_counter()
    Logger.debug('HTTP request time: %ss' % (end - begin))
    if flag:
        return response
    return None

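# Usage sketch (hedged): getResponseByUrl() layers per-call headers on top of
# those configured in configs.urlrequest and falls back through the selenium
# wrapper, the proxy, or plain urllib depending on how the handler was built.
# 'HttpHandler()' is an assumed constructor; it is not shown in this excerpt.
#
# handler = HttpHandler()
# raw = handler.getResponseByUrl('http://example.com/list.html',
#                                headers={'Referer': 'http://example.com/'})
# if raw is not None:
#     page = EsResponse(raw)
#     print(page.code, len(page.data))
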
def __sqlExecute(self, sql):
    data = None
    try:
        connection = pymysql.connect(host=configs.mysql.host, port=configs.mysql.port,
                                     user=configs.mysql.user, password=configs.mysql.password,
                                     db=configs.mysql.db, charset='utf8')
        try:
            with connection.cursor() as cur:
                cur.execute(sql)
                data = cur.fetchall()
                connection.commit()
        except Exception:
            Logger.error('sql statement execute error: %s' % sql)
        finally:
            connection.close()
    except Exception:
        Logger.error('an error occurred while opening the mysql database...')
    return data

def startEspider(self):
    Logger.info('starting espider...')
    paramList = readLinesFile(configs.spider.contentdatapath + 'param.txt')
    if paramList is None:
        Logger.critical('You should create starting parameters in %s' % (configs.spider.contentdatapath + 'param.txt'))
        exit(1)
    for i in range(len(paramList)):
        paramList[i] = json.loads(paramList[i])
        for k, v in paramList[i].items():
            if k in self.parameterList[0]:
                param = {}
                param[k] = v
                path = configs.spider.contentdatapath + k + '=' + v + '/'
                self.catalogueUrlRecursion(param, path, 1)
            else:
                Logger.error('param.txt gives an incorrect key compared to self.parameterList...')

def writeLinesFile(filename, dataInLine, method='w'):
    """
    Write a list to the file. One element to one line.
    """
    if not isinstance(dataInLine, Iterable):
        Logger.error('input illegal')
        return
    if not os.path.exists(os.path.split(filename)[0]):
        os.makedirs(os.path.split(filename)[0])
    dataInLine = [str(line) + '\n' for line in dataInLine]
    try:
        with open(filename, method, encoding='utf8') as f:
            f.writelines(dataInLine)
    except OSError:
        Logger.error('an error occurred when opening %s' % filename)
    return

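# Round-trip sketch for the two helpers above: writeLinesFile() stringifies each
# element and appends a newline, readLinesFile() strips it back off, so a list
# of JSON strings survives a write/read cycle unchanged. The file name below is
# illustrative only.
#
# rows = [json.dumps({'id': i}, ensure_ascii=False) for i in range(3)]
# writeLinesFile('data/demo.txt', rows, method='w')
# assert readLinesFile('data/demo.txt') == rows
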
def createTable(self):
    if not isinstance(self.keyList, list):
        Logger.error('key list error when creating table %s' % self.table)
        return
    if len(self.keyList) == 0:
        Logger.error('key list is empty, cannot create table %s' % self.table)
        return
    keyList = list()
    for i in range(len(self.keyList)):
        keyList.append(self.keyList[i] + ' VARCHAR(255)')
    if self.primaryKey is not None:
        if self.primaryKey == self.keyList[0]:
            keyList[0] = keyList[0] + ' PRIMARY KEY'
    temp = ','.join(keyList)
    sql = "CREATE TABLE IF NOT EXISTS %s(%s)" % (self.table, temp)
    self.__sqlExecute(sql)
    return

def loadContentDictList(self):
    if not os.path.exists(configs.spider.contentfilename):
        return []
    dataList = readLinesFile(configs.spider.contentfilename)
    dataDictList = []
    try:
        for item in dataList:
            if item.startswith('#'):
                continue
            t = {}
            data = item.split('\t')
            t['contentUrl'] = data[0]
            t['MD5'] = data[1]
            t['update'] = data[2]
            t['filepath'] = data[3]
            dataDictList.append(t)
    except IndexError:
        Logger.error('Loading contentfile error!')
    return dataDictList

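# Assumed on-disk format of contentfilename, inferred from the split('\t') above
# and from the fields written back in startEspider(): one record per line, lines
# starting with '#' are comments, fields tab-separated in this order:
#
#   contentUrl \t MD5 \t update \t filepath
#
# e.g.  http://example.com/a.html \t <md5 of parsed content> \t true \t data/1.html
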
def formProxy(self, count):
    if len(self.proxyList) == 0:
        self.proxy = None
        return
    if count >= len(self.proxyList):
        Logger.error('SelePhan proxy form error: out of range in proxyList...')
        self.proxy = None
        return
    proxy = self.proxyList[count]
    ipport = proxy['ip'] + ':' + proxy['port']
    proxyDict = {'proxyType': ProxyType.MANUAL}
    if proxy['type'] == 'http':
        proxyDict['httpProxy'] = ipport
    elif proxy['type'] == 'socks':
        proxyDict['socksProxy'] = ipport
    else:
        self.proxy = None
        return
    self.proxy = seleProxy(proxyDict)
    return

def select(self, selectKeyList=None):
    temp = []
    if selectKeyList is None:
        temp = '*'
    else:
        if not isinstance(selectKeyList, list):
            Logger.error('selectKeyList error because it is not a list...')
            return None
        temp = ','.join(selectKeyList)
    sql = 'SELECT %s FROM %s' % (temp, self.table)
    result = list(self.__sqlExecute(sql))
    if selectKeyList is None:
        sql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = '%s' and table_schema = '%s'" % (self.table, configs.mysql.db)
        selectKeyList = list(self.__sqlExecute(sql))
        for i in range(len(selectKeyList)):
            selectKeyList[i] = selectKeyList[i][0]
    ret = []
    for item in result:
        ret.append(dict(zip(selectKeyList, item)))
    return ret

def insertWithUpdate(self, insertList):
    if not isinstance(insertList, list):
        Logger.error('insert list error because it is not a list...')
        return
    if len(insertList) == 0:
        return
    keyList = []
    for k in insertList[0]:
        keyList.append(k)
    if not self.checkKeyList(keyList):
        return
    temp1 = ','.join(self.keyList)
    for item in insertList:
        valueList = []
        for k in self.keyList:
            valueList.append("'" + item[k] + "'")
        temp2 = ','.join(valueList)
        keyValueList = []
        for i in range(1, len(self.keyList)):
            keyValueList.append(self.keyList[i] + '=' + "VALUES(" + self.keyList[i] + ")")
        temp3 = ','.join(keyValueList)
        sql = "INSERT INTO %s(%s)VALUES(%s)ON DUPLICATE KEY UPDATE %s" % (self.table, temp1, temp2, temp3)
        self.__sqlExecute(sql)

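# Usage sketch for the Mysql helper (hedged): the constructor signature is
# inferred from saveData() above, which calls Mysql(table, keyList,
# primaryKey=...). createTable() stores every column as VARCHAR(255), and
# insertWithUpdate() relies on the primary key for ON DUPLICATE KEY UPDATE.
#
# db = Mysql(configs.mysql.table, ['url', 'title'], primaryKey='url')
# db.createTable()
# db.insertWithUpdate([{'url': 'http://example.com/1', 'title': 'first'},
#                      {'url': 'http://example.com/2', 'title': 'second'}])
# print(db.select(['url', 'title']))
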
def contentHandler(self, url, count):
    url = urljoin(self.host, url)
    Logger.info('(%s%%)get content data from %s' % (round(100 * count / len(self.contentDictList), 2), url))
    data = None
    type = ''
    name = None
    for i in range(configs.spider.retry):
        response = self.httpHandler.getResponseByUrl(url)
        if response is None:
            Logger.warning('cannot get url %s. please check httphandler...' % url)
            return ('disabled', 'disabled')
        response = EsResponse(response)
        try:
            data, type = self.contentResponseHandle(response)
            if data is None:
                Logger.debug('data == None')
                raise Exception
            name = self.contentFileName(response)
        except Exception:
            Logger.error('an error occurred in contentResponseHandle() or contentFileName(). if this takes place very often, please check your code')
            self.httpHandler.nextHandler()
            if i == configs.spider.retry - 1:
                self.uncatchableUrlList.append(url)
                self.saveUncatchableUrl()
            continue
        break
    if data is None:
        return ('disabled', 'disabled')
    if name is None:
        name = '%s.' % count + type
    if not os.path.exists(configs.spider.contentdatapath):
        os.makedirs(configs.spider.contentdatapath)
    if self.parser is None:
        MD5 = buildMD5String(data)
    else:
        try:
            parsedData = '%s' % self.parser.parseContent(data)
            MD5 = buildMD5String(parsedData)
        except Exception:
            Logger.error('An error occurred in parseContent()! Please check your code. Espider will use the whole file as the update md5')
            MD5 = buildMD5String(data)
    filepath = configs.spider.contentdatapath + name
    try:
        if type in ('html', 'xml', 'json', 'js', 'css'):
            with open(filepath, 'w+', encoding='utf8') as f:
                f.write(data)
            return (MD5, filepath)
        if type in ('jpg', 'tif', 'ico', 'png', 'bmp', 'mp3', 'avi', 'mp4'):
            with open(filepath, 'wb+') as f:
                f.write(data)
            return (MD5, filepath)
        with open(filepath, 'wb+') as f:
            f.write(data)
    except OSError:
        Logger.error('an error occurred when opening %s' % (configs.spider.contentdatapath + name))
    return (MD5, filepath)

def setData(self, data):
    if not isinstance(data, str):
        Logger.error('PhantomJsResponse setData() error: data is not a str...')
        return
    self.data = bytes(data, encoding='utf8')

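# Usage sketch (hedged): PhantomJsResponse mirrors the small surface of
# EsResponse (bytes in .data, headers in .headers), which is why getReqWithSel()
# can hand it back in place of a urllib response. Whether the empty construction
# below is supported is an assumption; getReqWithSel() shows the two-argument
# form actually used in this excerpt.
#
# pjs = PhantomJsResponse('', {})
# pjs.setData('<html><body>rendered page</body></html>')
# pjs.setHeaders({'Content-Type': 'text/html; charset=utf-8'})
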