def _worker(self):
    """Consume URLs from ``self.queue`` forever, fetching and processing each.

    For every URL not already in ``self.visited`` the fetch is dispatched
    via ``get_html.delay(url)`` and the result is awaited for up to five
    seconds.  On success the page is handed to ``self.process_html`` and
    ``self._add_links_to_queue``.  The URL is recorded in ``self.visited``
    whether or not the fetch succeeded, so failed URLs are not retried.

    This method never returns; it is meant to run as a worker loop.
    """
    while True:
        url = self.queue.get()
        try:
            if url in self.visited:
                continue

            # Presumably a Celery task; delay() hands back an async result
            # handle -- TODO confirm against get_html's definition.
            result = get_html.delay(url)

            # Reset on every iteration so a failed fetch can neither hit an
            # unbound name nor reuse the previous URL's page.
            html = None
            try:
                html = result.get(timeout=5)
            except Exception as exc:
                # Best effort: report the failure and move on to the next URL.
                print(url)
                print(exc)

            if html:
                self.process_html(html)
                self._add_links_to_queue(url, html)

            self.visited[url] = True
        finally:
            # Every get() needs a matching task_done(), including the
            # already-visited path, otherwise queue.join() blocks forever.
            self.queue.task_done()
def _worker(self):
    """Consume URLs from ``self.queue`` forever, fetching and processing each.

    For every URL not already in ``self.visited`` the fetch is dispatched
    via ``get_html.delay(url)`` and the result is awaited for up to five
    seconds.  A non-empty page is handed to ``self.process_html`` and
    ``self._add_links_to_queue``.  The URL is recorded in ``self.visited``
    whether or not the fetch succeeded, so failed URLs are not retried.

    This method never returns; it is meant to run as a worker loop.
    """
    while True:
        url = self.queue.get()
        try:
            if url in self.visited:
                continue

            # Presumably a Celery task; delay() hands back an async result
            # handle -- TODO confirm against get_html's definition.
            result = get_html.delay(url)

            html = None
            try:
                html = result.get(timeout=5)
            except Exception as exc:
                # Best effort: report the failure and move on to the next URL.
                print(url)
                print(exc)

            if html:
                # Process the crawled page.
                self.process_html(html)
                self._add_links_to_queue(url, html)

            self.visited[url] = True
        finally:
            # Every get() needs a matching task_done(), including the
            # already-visited path, otherwise queue.join() blocks forever.
            self.queue.task_done()
def _worker(self):
    """Consume URLs from ``self.queue`` forever, fetching and processing each.

    Every dequeued URL is printed.  For URLs not already in
    ``self.visited`` the fetch is dispatched via ``get_html.delay(url)`` and
    the result is awaited for up to sixty seconds.  A non-empty page is
    handed to ``self.process_html`` and ``self._add_links_to_queue``.  The
    URL is recorded in ``self.visited`` whether or not the fetch succeeded,
    so failed URLs are not retried.

    This method never returns; it is meant to run as a worker loop.
    """
    while True:
        url = self.queue.get()
        try:
            # Function-call form prints the same text on Python 2 and 3.
            print(url)
            if url in self.visited:
                continue

            # Send the task to Celery; delay() returns an AsyncResult.
            result = get_html.delay(url)

            html = None
            try:
                html = result.get(timeout=60)
            except Exception as exc:
                # Best effort: report the failure and move on to the next URL.
                print(url)
                print(exc)

            if html:
                self.process_html(html)
                self._add_links_to_queue(url, html)

            self.visited[url] = True
        finally:
            # Every get() needs a matching task_done(), including the
            # already-visited path, otherwise queue.join() blocks forever.
            self.queue.task_done()