Example #1
def __init__(self):
    # Patch the standard library so blocking I/O cooperates with gevent greenlets.
    monkey.patch_all()
    self.queue = queue.Queue()
    # Greenlet pool sized from the "spider" section of the configuration.
    self.pool = pool.Pool(
        int(CoreConfigure().get_config_section_map("spider")
            ['concurrency']))
    self.url_table = UrlTable()
    # Watchdog timer that calls self.stop() once the configured timeout expires.
    self.timer = Timer(
        int(CoreConfigure().get_config_section_map("spider")['timeout']),
        self.stop)
    self._stop = event.Event()
    self.greenlet_finished = event.Event()
    self.root = None  # url_object
    self.initialize_db()
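The import lines are not part of the snippets, so the following is a minimal sketch of what this constructor appears to rely on: gevent's monkey, queue, pool and event modules, threading.Timer, and pymongo for the database client. CoreConfigure, UrlTable, UrlObj and Handler are project-specific helpers and are left out here.

# Assumed imports for the Spider examples (a sketch, not taken from the source project):
import time
import logging
from threading import Timer

from gevent import monkey, queue, pool, event
from pymongo import MongoClient

# CoreConfigure, UrlTable, UrlObj and Handler come from the crawler project itself.

logger = logging.getLogger(__name__)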
Example #2
class Spider(object):
    def __init__(self):
        monkey.patch_all()
        self.queue = queue.Queue()
        self.pool = pool.Pool(
            int(CoreConfigure().get_config_section_map("spider")
                ['concurrency']))
        self.url_table = UrlTable()
        self.timer = Timer(
            int(CoreConfigure().get_config_section_map("spider")['timeout']),
            self.stop)
        self._stop = event.Event()
        self.greenlet_finished = event.Event()
        self.root = None  # url_object
        self.initialize_db()

    def initialize_db(self):
        host = CoreConfigure().get_config_section_map('db')['host']
        port = CoreConfigure().get_config_section_map('db')['port']
        db_name = CoreConfigure().get_config_section_map('db')['database']
        collect_name = CoreConfigure().get_config_section_map(
            'db')['collection']
        self.db_cli = MongoClient(host, int(port))
        # mongodb collection
        self.collection = self.db_cli[db_name][collect_name]

    def set_root(self, url):
        # Accept either a plain URL string or a ready-made UrlObj.
        if isinstance(url, basestring):
            url = UrlObj(url, type=0)
        self.root = url
        self.push_task(self.root)
        self.url_table.insert(self.root)

    def push_task(self, url):
        # type(url) is UrlObj
        if url not in self.url_table:
            self.queue.put(url)

    def run(self, url=None):
        begin = time.time()
        if url is None:
            # read from configure file for default value
            url = CoreConfigure().get_config_section_map('content')['root_url']
        self.set_root(url)
        self.timer.start()
        logger.info("spider begin crawl")
        while not self.stopped() and self.timer.isAlive():
            for greenlet in list(self.pool):
                if greenlet.dead:
                    self.pool.discard(greenlet)
            try:
                url = self.queue.get_nowait()
            except queue.Empty:
                if self.pool.free_count() != self.pool.size:
                    # wait until one greenlet finishes to flush the queue
                    self.greenlet_finished.wait()
                    self.greenlet_finished.clear()
                    continue
                else:
                    # no pending URLs and no running greenlets: the crawl is done
                    self.stop()
                    break
            greenlet = Handler(url, self)
            self.pool.start(greenlet)
        logger.info("total time elapsed %0.2f" % (time.time() - begin))

    def stopped(self):
        return self._stop.is_set()

    def stop(self):
        logger.info("spider finish, totally catched (%d) urls" %
                    len(self.url_table))
        self.timer.cancel()
        self._stop.set()
        self.pool.join()
        self.queue.put(StopIteration)
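For context, the get_config_section_map() lookups in the class imply an INI-style file with "spider", "db" and "content" sections, and pool.start(Handler(url, self)) implies that Handler is a gevent Greenlet which signals greenlet_finished when it finishes. The sketch below shows one plausible configuration layout, a minimal Handler skeleton, and how the crawler might be launched; the section and key names follow the code, but the values, the Handler body and the root URL are assumptions, not part of the original project.

# Hypothetical configuration matching the get_config_section_map() calls above:
#
# [spider]
# concurrency = 10
# timeout = 300
#
# [db]
# host = localhost
# port = 27017
# database = crawler
# collection = pages
#
# [content]
# root_url = http://example.com

# Minimal Handler skeleton consistent with pool.start(Handler(url, self)).
import gevent

class Handler(gevent.Greenlet):
    def __init__(self, url, spider):
        super(Handler, self).__init__()
        self.url = url
        self.spider = spider

    def _run(self):
        try:
            pass  # fetch self.url, store the page, spider.push_task() new links
        finally:
            # wake the main loop so it can try to drain the queue again
            self.spider.greenlet_finished.set()

# Launching the crawler; with no argument, run() falls back to [content] root_url.
if __name__ == "__main__":
    Spider().run("http://example.com")

In the real project the handler presumably fetches the page, writes it to self.collection and enqueues outgoing links via push_task(), which is why the main loop only blocks on greenlet_finished when the queue is momentarily empty.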