Example #1
    def doWork(self):
        '''Override of the WorkRequest thread execution function; it runs in
        the thread pool. Purpose: take urls from the download queue assigned
        to this worker and download them.
        '''
        logger.debug("Start downloader's doWork...")
        while True:
            if self.__dlQueue.qsize() > 0:
                urlNode = self.__dlQueue.get()
                self.__downloadingFlag += 1
                page = self.__downloadPage(urlNode.url)
                if len(page) == 0:
                    self.__downloadingFlag -= 1
                    continue
                # Wrap the downloaded html page in the internal data format and
                # put it on the html queue for the parser module.
                htmlNode = HtmlModel(urlNode.url, page, timestamp(), urlNode.depth)
                self.__htmlQueue.put(htmlNode)
                self.__downloadingFlag -= 1
            # Check for the exit event.
            if self.__exitEvent.is_set():
                logger.info('Download model quit...')
                return
            # Delay between downloads.
            time.sleep(FETCH_TIME_INTERVAL)
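
The zspider snippets here (Examples #1, #3, #4, and #8) call a bare timestamp() helper whose definition is not shown. A minimal sketch of such a helper, assuming it returns a compact local-time string (the format is a guess):

import time

def timestamp():
    # Hypothetical helper, not part of the original source: return the
    # current local time as a compact string usable in log file names.
    return time.strftime('%Y%m%d%H%M%S', time.localtime())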
Example #2
def sign_up_user(name, surname, codice_fiscale, birthday, language):
    # Create the users file with a CSV header if it does not exist yet.
    if not users_file.is_file():
        h.append_new_line(users_file,
                          'userid,name,surname,cf,birthday,language')

    new_username = h.generate_new_username()
    new_user = (new_username + ',' + name + ',' + surname + ',' +
                codice_fiscale + ',' + birthday + ',' + language)
    h.append_new_line(users_file, new_user)

    new_user_log = ('user:' + new_username + ', userName:' + name +
                    ', userSurname:' + surname + ', user_CF:' + codice_fiscale +
                    ', birthday:' + birthday + ', language:' + language)
    h.append_new_line(
        logs_file,
        str(h.timestamp()) + ' [DEBUG] - New user - ' + new_user_log)
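
Examples #2, #5, #6, #7, and #9 all rely on an append_new_line helper from a module imported as h. Its definition is not shown; a minimal sketch consistent with the call sites (a path-like first argument, one line of text appended) might be:

def append_new_line(path, text):
    # Hypothetical sketch of h.append_new_line: append one line to the file,
    # creating the file if it does not exist yet.
    with open(str(path), 'a') as f:
        f.write(text + '\n')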
Example #3
    def createLogger(self):
        '''Create and configure the logger.'''
        self.__createLogDir()

        logger = logging.getLogger('mylogger')
        logger.setLevel(logging.DEBUG)

        # File log handler.
        fh = logging.FileHandler(self.logDir + timestamp() + '_zspider.log')
        # Console log handler.
        ch = logging.StreamHandler()

        # Control the level of each output channel separately.
        fh.setLevel(logging.DEBUG)
        ch.setLevel(logging.DEBUG)

        # Log output format.
        formatter = logging.Formatter(
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)

        logger.addHandler(fh)
        logger.addHandler(ch)
        return logger
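
Note that logging.getLogger('mylogger') always returns the same logger object, so calling createLogger twice would attach duplicate handlers and double every log line. A common guard, sketched here (not part of the original code):

import logging

logger = logging.getLogger('mylogger')
if not logger.handlers:
    # Attach handlers only on the first call; later calls reuse the
    # already-configured logger instead of duplicating output.
    logger.addHandler(logging.StreamHandler())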
Example #4
class Parser(WorkRequest):
    '''Subclass of the thread pool's WorkRequest class; implements the thread
       execution function.
       Purpose: filter html pages, putting those that meet the storage
                criteria on the data queue;
                parse html pages, putting the urls that pass filtering on
                the url queue.
    '''
    def __init__(self, depth, startUrls, keyword, htmlQueue, dataQueue,
                 urlQueue, exitEvent):
        self.__htmlQueue = htmlQueue
        self.__dataQueue = dataQueue
        self.__urlQueue = urlQueue
        self.__keyword = keyword
        self.__depth = depth
        self.__startUrls = startUrls
        self.__exitEvent = exitEvent
        # pageFilter decides whether a page should be stored.
        self.__myPageFilter = PageFilter(keyword)
        # urlFilter decides whether a url should be downloaded.
        self.__myUrlFilter = UrlFilter(self.__startUrls)

    def getRepeatSetSize(self):
        return self.__myUrlFilter.getRepeatSetSize()

    def __parsePage(self):
        '''Parsing function; implements the core work of the parser module.'''
        htmlNode = self.__htmlQueue.get()

        # Filter the page and decide whether it should be stored.
        if self.__myPageFilter.isGood(htmlNode.html):
            dataNode = HtmlModel(htmlNode.url, '', htmlNode.time,
                                 htmlNode.depth)
            self.__dataQueue.put(dataNode)

        # Depth control: once the crawl reaches the configured depth, do not
        # follow the links found on this page.
        if htmlNode.depth >= self.__depth:
            return

        linkList = []
        try:
            # Extract all links from the html page using the lxml module.
            doc = lxml.html.document_fromstring(htmlNode.html)
            doc.make_links_absolute(htmlNode.url)
            links = doc.iterlinks()
            for link in links:
                linkList.append(link[2])
        except Exception as e:
            logger.warning('Parse page exception: %s', str(e))
            return

        if len(linkList) == 0:
            logger.warning('Page parsed, but no links were found: %s',
                           htmlNode.url)
            return

        # Filter the urls: drop duplicates, unwanted suffixes, and off-site
        # links.
        linkList = self.__myUrlFilter.urlfilter(linkList)

        # Put the qualifying urls back on the url queue.
        for url in linkList:
            urlNode = UrlModel(url, htmlNode.url, timestamp(),
                               htmlNode.depth + 1)
            self.__urlQueue.put(urlNode)
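
The link-extraction step in __parsePage is easy to try in isolation. A self-contained sketch using the same lxml calls (the function name and sample markup are illustrative):

import lxml.html

def extract_links(html, base_url):
    # Parse the page, resolve relative hrefs against base_url, and return
    # the link targets; iterlinks() yields (element, attribute, link, pos).
    doc = lxml.html.document_fromstring(html)
    doc.make_links_absolute(base_url)
    return [link for element, attribute, link, pos in doc.iterlinks()]

print(extract_links('<a href="/about">about</a>', 'http://example.com/'))
# ['http://example.com/about']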
Example #5
def log_out_user(userid, storecode):
    h.append_new_line(
        logs_file,
        str(h.timestamp()) + ' [DEBUG] - User logout - ' + 'user:' + userid +
        ', store:' + storecode)
Example #6
def sign_in_user(userid, storecode):
    h.append_new_line(
        logs_file,
        str(h.timestamp()) + ' [DEBUG] - User login - ' + 'user:' + userid +
        ', store:' + storecode)
Example #7
def log_timestamp():
    h.append_new_line(logs_file, '### ' + str(h.timestamp()) + ' ###')
Example #8
    def __initUrlQueue(self, urlList):
        '''Wrap each url in the internal data format.'''
        for url in urlList:
            urlNode = UrlModel(url, '', timestamp(), 0)
            self.__urlQueue.put(urlNode)
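
UrlModel itself does not appear in these snippets; judging from the call sites (UrlModel(url, '', timestamp(), 0) here, and UrlModel(url, htmlNode.url, timestamp(), htmlNode.depth + 1) in Example #4) it is a simple container. A hypothetical sketch, with field names inferred rather than taken from the source:

class UrlModel(object):
    # Hypothetical sketch; the field names (especially srcUrl) are guesses
    # based on the constructor arguments used in the examples.
    def __init__(self, url, srcUrl, time, depth):
        self.url = url
        self.srcUrl = srcUrl
        self.time = time
        self.depth = depth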
Example #9
def order(userid, storecode, productids):
    order_id = str(next_product_id_for_store(storecode))
    err = random_error(order_id, storecode)
    if not err:
        h.append_new_line(
            file_logs,
            str(h.timestamp()) + ' [INFO] - New order - ' + 'user:' + userid +
            ', products:' + productids + ', store:' + storecode +
            ', country:' + h.countrycode() + ', totPrice:' + str(h.price()) +
            h.currency() + ', paymentMethod:' + h.payment_method() +
            ', orderId:' + order_id)
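
Long concatenation chains like the one above are easy to get wrong. A purely illustrative alternative that builds the same line with str.format (the helper name is an assumption):

def format_order_log(userid, storecode, productids, order_id):
    # Illustrative helper, not from the original source; the fields mirror
    # the log line written by order().
    return ('user:{}, products:{}, store:{}, country:{}, totPrice:{}{}, '
            'paymentMethod:{}, orderId:{}').format(
                userid, productids, storecode, h.countrycode(), h.price(),
                h.currency(), h.payment_method(), order_id)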