def __init__(self):
    '''
    Initialize the current depth; under normal circumstances it starts at 0.
    '''
    logger.info("init process~")
    print u"=== Current progress:"
    self.deep = 0
    self._pbar = click.progressbar(length=1, label="deep 0 : ")
def _finish_pbar(self):
    '''
    Finish the previous progress bar.
    :return: None
    '''
    logger.info("deep %s over!" % (self.deep - 1))
    self._pbar.finish()
    self._pbar.render_progress()
    self._pbar.render_finish()
def run(self): """ Take jobs in the queue (url to query), parse it and save results """ global linksVisited global websiteIndex global logger dataLock = self.dataLock while True: # Get the job(link) from the queue and parse link queueItem = self.queue.get() currentLink = urlparse(queueItem) # Make sure link wasn't already visited and add it to the list of visited links with dataLock: if currentLink.path in linksVisited: self.queue.task_done() continue linksVisited.add(currentLink.path) logger.info("Thread with id " + str(self.id) + " starts crawling " + currentLink.path) # Query page - Add some headers so that websites such as Monzo.com aren't afraid and answer ;) try: req = urllib.request.Request(urllib.parse.urljoin(self.domain, currentLink.path), headers={'User-Agent': 'Mozilla/5.0'}) webPage = urllib.request.urlopen( req ) except urllib.error.HTTPError as e: # Whoops it wasn't a 200 logger.error("Error - Thread with id " + str(self.id) + " while crawling " + urllib.parse.urljoin(self.domain, currentLink.path) + ": " + str(e)) self.queue.task_done() continue # Create instance of HTML parser try: htmlParser = Parser(self.domain) htmlParser.feed(str(webPage.read())) htmlParser.close() except UnboundLocalError: logger.error("Error - Thread with id " + str(self.id) + " while parsing " + urllib.parse.urljoin(self.domain, currentLink.path)) self.queue.task_done() continue # Find remaining links to visit (again syncrhonised so that link aren't handled twice) with self.dataLock: # Save links and assets as a tuple websiteIndex[currentLink.path] = (htmlParser.links,htmlParser.staticAssets) linksNotVisitedYet = htmlParser.links.difference(linksVisited) # Add links to visit for link in linksNotVisitedYet: self.queue.put(link) self.queue.task_done()
def build_thread_pool(self, f):
    '''
    Build the thread pool.
    :param f: worker function (not generic: only single-argument functions are supported)
    :return: None; the worker threads are appended to self.pool
    '''
    logger.info("build thread pool, and the number is " + str(self.num))
    for x in xrange(self.num):
        t = SiThread(self._queue, self._result, f)
        t.start()
        self.pool.append(t)
def _del(self):
    global stop
    logger.info("destroy thread pool")
    # One 'stop' sentinel per worker thread, then wait for the workers to exit
    for x in self.pool:
        self._queue.put('stop')
    for t in self.pool:
        t.join()
    # Finally stop the database thread the same way
    self._dbqueue.put('stop')
    self.dbt.join()
    logger.debug("destroy thread pool succeeded")
def run(self):
    global stop
    while True:
        r = self._queue.get()
        if r == 'stop':
            logger.info("database thread stopped")
            break
        if r['type'] == 'html':
            operate['db'].insert(r['html'], r['url'])
        else:
            logger.warn("not an html page")
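# Hedged sketch of the producer side this DbThread expects: judging from the
# loop above, queue items are dicts with 'type', 'html' and 'url' keys, or the
# literal 'stop' sentinel. The queue and page values here are illustrative only.
import Queue

dbqueue = Queue.Queue()
dbqueue.put({'type': 'html',
             'html': '<html>...</html>',     # page source to persist
             'url': 'http://example.com/'})  # where the page came from
dbqueue.put('stop')  # tells the database thread to break out of its loop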
def run(self):
    global stop
    logger.debug("run a thread")
    while True:
        url = self._queue.get()
        if url == 'stop':
            logger.info(str(threading.currentThread().ident) + " stopped")
            break
        r = self._f(url)
        self._result.put(r)
def _run(self, length, queue):
    '''
    Start the progress bar for the next depth.
    :param length: number of URLs to crawl at the current depth
    :return: None
    '''
    self.deep += 1
    logger.info("begin run deep %s process bar, the length is %s" % (self.deep, length))
    label = "deep %s: " % self.deep
    self._queue = queue
    self.length = length
    # Finish the previous progress bar
    self._finish_pbar()
    # Create a new progress bar
    self._pbar = click.progressbar(length=length, label=label, show_percent=False, show_pos=True)
    # Refresh the progress display every 10 seconds
    self.t = threading.Thread(target=self.timer)
    self.t.start()
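# Hedged sketch of the timer() callback that _run() starts; the real method is
# not shown in this dump. It assumes self._queue holds the URLs still pending
# at this depth (so progress is length - qsize()) and uses click's ProgressBar
# update()/pos API.
import time

def timer(self):
    while not self._queue.empty():
        done = self.length - self._queue.qsize()
        self._pbar.update(done - self._pbar.pos)  # advance by the delta only
        time.sleep(10)  # matches the "every 10 seconds" comment above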
def __init__(self, num, func):
    '''
    self._queue   task (argument) queue
    self._result  result queue
    self.pool     list of worker threads
    :param num: number of threads
    :param func: worker function
    '''
    logger.info("init a thread pool class")
    self._queue = Queue.Queue()
    self._result = Queue.Queue()
    self._dbqueue = Queue.Queue()
    self.pool = []
    self.num = num
    # Worker thread pool
    self.build_thread_pool(func)
    # Database insert thread
    self.dbt = DbThread(self._dbqueue)
    self.dbt.start()
    # Progress display
    self.process_bar = ShowProcess()
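# Hedged usage sketch of the pool class above; the class name ThreadPool is an
# assumption (this dump omits it), and fetch() stands in for any
# single-argument worker function.
def fetch(url):
    # single-argument worker, as build_thread_pool requires
    return {'type': 'html', 'html': '<html>...</html>', 'url': url}

pool = ThreadPool(4, fetch)                 # 4 worker threads plus 1 DB thread
for url in ('http://example.com/a', 'http://example.com/b'):
    pool._queue.put(url)                    # workers pick URLs off this queue
pool._del()  # one 'stop' sentinel per worker, then the DB thread is stopped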
def __init__(self, queue1, queue2, f):
    '''
    Worker thread: takes arguments from queue1, applies f, and puts the results on queue2.
    '''
    threading.Thread.__init__(self)
    self._queue = queue1
    self._result = queue2
    self._f = f
    logger.info('init a thread')