Beispiel #1
0
    def _worker(self):
        """Consume URLs from ``self.queue`` forever, handling each URL once.

        For every URL that is not yet in ``self.visited`` the fetch is
        dispatched with ``get_html.delay(url)`` and its result is awaited for
        at most 5 seconds. A page that was fetched is handed to
        ``self.process_html`` and ``self._add_links_to_queue``. The URL is
        recorded in ``self.visited`` whether or not the fetch succeeded, so a
        failing URL is attempted only once.

        This method contains no exit condition and does not return on its own.
        """
        while True:
            url = self.queue.get()
            if url in self.visited:
                # Each get() must be balanced by task_done(); skipping it here
                # would leave the queue's unfinished-task count above zero and
                # anything waiting on queue.join() would block forever.
                self.queue.task_done()
                continue

            result = get_html.delay(url)
            # Reset per URL: without this a failed fetch would either hit an
            # unbound local or silently reuse the previous URL's HTML.
            html = None
            try:
                html = result.get(timeout=5)
            except Exception as e:
                # Best-effort crawl: report the failing URL and keep going.
                print(url)
                print(e)

            # Only process pages that were actually fetched.
            if html:
                self.process_html(html)
                self._add_links_to_queue(url, html)

            self.visited[url] = True
            self.queue.task_done()
Beispiel #2
0
    def _worker(self):
        """Consume URLs from ``self.queue`` forever, handling each URL once.

        URLs already present in ``self.visited`` are acknowledged and skipped.
        Otherwise the fetch is dispatched with ``get_html.delay(url)`` and its
        result is awaited for at most 5 seconds; a non-empty page is passed to
        ``self.process_html`` and ``self._add_links_to_queue``. The URL is
        marked visited even when the fetch failed, so it is not retried.

        This method contains no exit condition and does not return on its own.
        """
        while True:
            url = self.queue.get()
            if url in self.visited:
                # Each get() must be balanced by task_done(); otherwise the
                # queue's unfinished-task count never reaches zero and a
                # caller blocked in queue.join() would wait forever.
                self.queue.task_done()
                continue

            result = get_html.delay(url)
            html = None
            try:
                html = result.get(timeout=5)
            except Exception as e:
                # Best-effort crawl: report the failing URL and keep going.
                print(url)
                print(e)

            if html:
                self.process_html(html)  # Process the crawled page.
                self._add_links_to_queue(url, html)

            self.visited[url] = True
            self.queue.task_done()
Beispiel #3
0
 def _worker(self):
     """Consume URLs from ``self.queue`` forever, processing fetched pages.

     Every dequeued URL is printed. URLs already in ``self.visited`` are
     acknowledged and skipped. Otherwise the fetch is sent to Celery via
     ``get_html.delay(url)`` and the result is awaited for at most 60
     seconds; a non-empty page is passed to ``self.process_html`` and
     ``self._add_links_to_queue``.

     A URL is added to ``self.visited`` only after a successful fetch, so a
     URL whose fetch failed is attempted again if it is queued again.

     This method contains no exit condition and does not return on its own.
     """
     while True:
         url = self.queue.get()
         # Function-call form works on Python 2 and 3 and matches the other
         # print calls below.
         print(url)
         if url in self.visited:
             # Each get() must be balanced by task_done(); otherwise the
             # queue's unfinished-task count never reaches zero and a caller
             # blocked in queue.join() would wait forever.
             self.queue.task_done()
             continue

         # The task is sent to Celery here; delay() returns an AsyncResult.
         result = get_html.delay(url)
         html = None
         try:
             html = result.get(timeout=60)
         except Exception as e:
             # Best-effort crawl: report the failing URL and keep going.
             print(url)
             print(e)

         if html:
             self.process_html(html)
             self._add_links_to_queue(url, html)
             # Marked visited only on success (see docstring).
             self.visited[url] = True
         self.queue.task_done()