def doWork(self):
    '''Override the thread-execution method of the WorkRequest class; it runs in the thread pool.
    Purpose: take urls from the download queue assigned to this worker and download them.
    '''
    logger.debug('Start downloader`s doWork...')
    # self.test()
    while True:
        if self.__dlQueue.qsize() > 0:
            urlNode = self.__dlQueue.get()
            self.__downloadingFlag += 1
            page = self.__downloadPage(urlNode.url)
            if len(page) == 0:
                self.__downloadingFlag -= 1
                continue
            # logger.debug('download page success, url: %s', urlNode.url)
            # Wrap the downloaded html page in the internal data format and
            # put it on the html queue for the parser module to consume
            htmlNode = HtmlModel(urlNode.url, page, timestamp(), urlNode.depth)
            self.__htmlQueue.put(htmlNode)
            self.__downloadingFlag -= 1
        # Check for the exit event
        if self.__exitEvent.is_set():
            logger.info('Download model quit...')
            return
        # Pause between downloads
        time.sleep(FETCH_TIME_INTERVAL)
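# A self-contained sketch (not from the original source) of the queue + exit-event
# loop pattern that doWork uses above. The worker body is a stand-in for the real
# downloader, and the sleep interval stands in for FETCH_TIME_INTERVAL; shown with
# Python 3's queue/threading modules.
import threading
import time
from queue import Queue

dl_queue, html_queue = Queue(), Queue()
exit_event = threading.Event()

def work():
    while True:
        if dl_queue.qsize() > 0:
            url = dl_queue.get()
            html_queue.put((url, '<html>...</html>'))  # stand-in for __downloadPage
        if exit_event.is_set():  # same exit check as doWork above
            return
        time.sleep(0.1)  # stand-in for FETCH_TIME_INTERVAL

t = threading.Thread(target=work)
t.start()
dl_queue.put('http://example.com/')
time.sleep(0.5)
exit_event.set()
t.join()
print(html_queue.get())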
def sign_up_user(name, surname, codice_fiscale, birthday, language):
    # Create the users file with a header row if it does not exist yet
    if not users_file.is_file():
        h.append_new_line(users_file, 'userid,name,surname,cf,birthday,language')
    userid = h.generate_new_username()
    new_user = (userid + ',' + name + ',' + surname + ',' + codice_fiscale +
                ',' + birthday + ',' + language)
    h.append_new_line(users_file, new_user)
    new_user_log = ('user:' + userid + ', userName:' + name + ', userSurname:' + surname +
                    ', user_CF:' + codice_fiscale + ', birthday:' + birthday +
                    ', language:' + language)
    h.append_new_line(
        logs_file, str(h.timestamp()) + ' [DEBUG] - New user - ' + new_user_log)
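# Hedged usage sketch (not from the original source): with the helpers above,
#   sign_up_user('Ada', 'Lovelace', 'LVLDAA36S10H501X', '1815-12-10', 'en')
# appends one CSV row to users_file shaped like
#   <generated-userid>,Ada,Lovelace,LVLDAA36S10H501X,1815-12-10,en
# (the id format produced by h.generate_new_username is an assumption) and one
# matching '[DEBUG] - New user' line to logs_file.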
def createLogger(self):
    '''Create and configure the logger'''
    self.__createLogDir()
    logger = logging.getLogger('mylogger')
    logger.setLevel(logging.DEBUG)
    # File log handler
    fh = logging.FileHandler(self.logDir + timestamp() + '_zspider.log')
    # Console log handler
    ch = logging.StreamHandler()
    # Level for each of the two output paths
    fh.setLevel(logging.DEBUG)
    ch.setLevel(logging.DEBUG)
    # Log output format
    formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    return logger
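# Design note (illustrative, not from the original source): getLogger('mylogger')
# returns the same object on every call, so calling createLogger twice would attach
# duplicate handlers and double every log line. A common guard, using the same
# format string as above:
import logging

def get_logger(name='mylogger'):
    logger = logging.getLogger(name)
    if not logger.handlers:  # only configure on the first call
        logger.setLevel(logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setFormatter(logging.Formatter(
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s'))
        logger.addHandler(ch)
    return logger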
class Parser(WorkRequest):
    '''Inherits from the thread pool's WorkRequest class and implements the thread-execution method.
    Purpose:
        Filter html pages, decide whether each meets the storage criteria, and put qualifying pages on the data queue.
        Parse html pages, filter out qualifying urls, and put them on the url queue.
    '''

    def __init__(self, depth, startUrls, keyword, htmlQueue, dataQueue, urlQueue, exitEvent):
        self.__htmlQueue = htmlQueue
        self.__dataQueue = dataQueue
        self.__urlQueue = urlQueue
        self.__keyword = keyword
        self.__depth = depth
        self.__startUrls = startUrls
        self.__exitEvent = exitEvent
        # pageFilter decides whether a page should be stored
        self.__myPageFilter = PageFilter(keyword)
        # urlFilter decides whether a url should be downloaded
        self.__myUrlFilter = UrlFilter(self.__startUrls)

    def getRepeatSetSize(self):
        return self.__myUrlFilter.getRepeatSetSize()

    def __parsePage(self):
        '''Core parsing routine of the parser module'''
        htmlNode = self.__htmlQueue.get()
        # Filter the page: decide whether it should be stored
        if self.__myPageFilter.isGood(htmlNode.html):
            dataNode = HtmlModel(htmlNode.url, '', htmlNode.time, htmlNode.depth)
            self.__dataQueue.put(dataNode)
        # Crawl-depth control: stop following links once the configured depth is reached
        if htmlNode.depth >= self.__depth:
            return
        linkList = []
        try:
            # Extract all links from the html page using the lxml module
            doc = lxml.html.document_fromstring(htmlNode.html)
            doc.make_links_absolute(htmlNode.url)
            links = doc.iterlinks()
            for link in links:
                linkList.append(link[2])
        except Exception as e:
            logger.warning('Parse page exception: %s', str(e))
            return
        if len(linkList) == 0:
            logger.warning('Parse page success, but link is null: %s', htmlNode.url)
            return
        # Filter urls: drop duplicates, unwanted suffixes, and off-site links
        linkList = self.__myUrlFilter.urlfilter(linkList)
        # Put qualifying urls back on the url queue
        for url in linkList:
            urlNode = UrlModel(url, htmlNode.url, timestamp(), htmlNode.depth + 1)
            self.__urlQueue.put(urlNode)
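# A self-contained sketch of the lxml link extraction used in __parsePage above;
# the html string and base url here are illustrative only. iterlinks() yields
# (element, attribute, link, pos) tuples, which is why __parsePage takes link[2].
import lxml.html

html = '<html><body><a href="/a">a</a> <a href="http://example.com/b">b</a></body></html>'
doc = lxml.html.document_fromstring(html)
doc.make_links_absolute('http://example.com/')
links = [link for (_el, _attr, link, _pos) in doc.iterlinks()]
print(links)  # ['http://example.com/a', 'http://example.com/b']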
def log_out_user(userid, storecode):
    h.append_new_line(
        logs_file,
        str(h.timestamp()) + ' [DEBUG] - User logout - ' + 'user:' + userid + ', store:' + storecode)
def sign_in_user(userid, storecode):
    h.append_new_line(
        logs_file,
        str(h.timestamp()) + ' [DEBUG] - User login - ' + 'user:' + userid + ', store:' + storecode)
def log_timestamp():
    h.append_new_line(logs_file, '### ' + str(h.timestamp()) + ' ###')
def __initUrlQueue(self, urlList):
    '''Wrap the seed urls in the internal data format and queue them'''
    for url in urlList:
        urlNode = UrlModel(url, '', timestamp(), 0)
        self.__urlQueue.put(urlNode)
def order(userid, storecode, productids):
    order_id = str(next_product_id_for_store(storecode))
    err = random_error(order_id, storecode)
    if not err:
        h.append_new_line(
            file_logs,
            str(h.timestamp()) + ' [INFO] - New order - ' + 'user:' + userid +
            ', products:' + productids + ', store:' + storecode +
            ', country:' + h.countrycode() + ', totPrice:' + str(h.price()) + h.currency() +
            ', paymentMethod:' + h.payment_method() + ', orderId:' + order_id)
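# Hedged usage sketch (helper behavior inferred from the calls above, not from
# the original source):
#   order('u0042', 'IT01', '42;17')
# would, when random_error reports no error, append one line to file_logs shaped like
#   <timestamp> [INFO] - New order - user:u0042, products:42;17, store:IT01, country:<cc>, totPrice:<n><cur>, paymentMethod:<pm>, orderId:<id>
# where country, price, currency, and payment method come from the h helpers and
# next_product_id_for_store supplies the order id.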