Example #1
class Test(unittest.TestCase):


    def setUp(self):
        self.dir_ = tempfile.mkdtemp()
        self.addr = '127.0.0.1'
        self.addrs = [self.addr, ]
        
    def tearDown(self):
        try:
            self.mq.shutdown()
        finally:
            shutil.rmtree(self.dir_)


    def testMqProxy(self):
        self.mq = MessageQueue(self.dir_, None, self.addr, self.addrs,
                                  copies=0, n_priorities=1)
        self.proxy = MpMessageQueueClient(self.mq.new_connection('0'))
        
        try:
            test_obj = Url('http://qinxuye.me')
            self.proxy.put(test_obj)
            self.assertEqual(self.proxy.get(), test_obj)
        finally:
            self.mq.shutdown()
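These snippets are shown without their import headers. As a minimal sketch, Example #1 would need roughly the following preamble to run; the cola module paths are assumptions inferred from the class names, not taken from the listing itself.

import shutil
import tempfile
import unittest

# Assumed module paths (hypothetical; adjust to the actual cola layout):
from cola.core.mq import MessageQueue
from cola.core.mq.client import MpMessageQueueClient
from cola.core.unit import Url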
Example #2
 def testAddNode(self):
     data = range(100)
       
     new_port = random.randint(10000, 30000)
     new_node = 'localhost:%s' % new_port
     new_rpc_server = ColaRPCServer(('localhost', new_port))
     thd = threading.Thread(target=new_rpc_server.serve_forever)
     thd.setDaemon(True)
     thd.start()
     new_dir = tempfile.mkdtemp()
     ns = list(self.nodes)
     ns.append(new_node)
     new_mq = MessageQueue(new_dir, new_rpc_server, new_node, ns)
       
     try:
         self.mq0.add_node(new_node)
         self.mq1.add_node(new_node)
         self.mq2.add_node(new_node)
           
         self.mq0.put(data)
           
         self.assertEqual(data, sorted(self.mq0.get(size=100)))
     finally:
         try:
             new_rpc_server.shutdown()
             new_mq.shutdown()
         finally:
             shutil.rmtree(new_dir)
Example #3
    def testAddNode(self):
        data = range(100)

        new_port = random.randint(10000, 30000)
        new_node = 'localhost:%s' % new_port
        new_rpc_server = ColaRPCServer(('localhost', new_port))
        thd = threading.Thread(target=new_rpc_server.serve_forever)
        thd.setDaemon(True)
        thd.start()
        new_dir = tempfile.mkdtemp()
        ns = list(self.nodes)
        ns.append(new_node)
        new_mq = MessageQueue(new_dir, new_rpc_server, new_node, ns)

        try:
            self.mq0.add_node(new_node)
            self.mq1.add_node(new_node)
            self.mq2.add_node(new_node)

            self.mq0.put(data)

            self.assertEqual(data, sorted(self.mq0.get(size=100)))
        finally:
            try:
                new_rpc_server.shutdown()
                new_mq.shutdown()
            finally:
                shutil.rmtree(new_dir)
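Examples #2 and #3 exercise adding a node at runtime. Since the Job class further down (Examples #20/#25) also calls MessageQueue.remove_node, a symmetric check could be appended inside the try block before shutdown; a sketch, not part of the original test:

         # Sketch: undo the topology change before tearing everything down.
         self.mq0.remove_node(new_node)
         self.mq1.remove_node(new_node)
         self.mq2.remove_node(new_node)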
Example #4
class Test(unittest.TestCase):

    def setUp(self):
        self.dir_ = tempfile.mkdtemp()
        self.addr = '127.0.0.1'
        self.addrs = [self.addr, ]
        
    def tearDown(self):
        try:
            self.mq.shutdown()
        finally:
            shutil.rmtree(self.dir_)

    def testMqProxy(self):
        self.mq = MessageQueue(self.dir_, None, self.addr, self.addrs,
                                  copies=0, n_priorities=1)
        self.proxy = MpMessageQueueClient(self.mq.new_connection('0'))
        
        try:
            test_obj = Url(u'http://qinxuye.me/三星')
            self.proxy.put(test_obj)
            self.assertEqual(self.proxy.get(), test_obj)
            test_obj = u'三星'
            self.proxy.put(test_obj)
            self.assertEqual(self.proxy.get(), test_obj)
        finally:
            self.mq.shutdown()
Example #5
 def testMqProxy(self):
     self.mq = MessageQueue(self.dir_, None, self.addr, self.addrs,
                               copies=0, n_priorities=1)
     self.proxy = MpMessageQueueClient(self.mq.new_connection('0'))
     
     try:
         test_obj = Url(u'http://qinxuye.me/三星')
         self.proxy.put(test_obj)
         self.assertEqual(self.proxy.get(), test_obj)
         test_obj = u'三星'
         self.proxy.put(test_obj)
         self.assertEqual(self.proxy.get(), test_obj)
     finally:
         self.mq.shutdown()
Example #6
    def init_mq(self):
        mq_store_dir = os.path.join(self.root, 'store')
        mq_backup_dir = os.path.join(self.root, 'backup')
        if not os.path.exists(mq_store_dir):
            os.makedirs(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.makedirs(mq_backup_dir)

        self.mq = MessageQueue(self.nodes,
                               self.local,
                               self.rpc_server,
                               copies=self.copies)
        self.mq.init_store(mq_store_dir,
                           mq_backup_dir,
                           verify_exists_hook=self._init_bloom_filter())
Example #7
    def init_mq(self):
        mq_dir = os.path.join(self.working_dir, 'mq')
        copies = self.job_desc.settings.job.copies
        n_priorities = self.job_desc.settings.job.priorities

        kw = {
            'app_name': self.job_name,
            'copies': copies,
            'n_priorities': n_priorities,
            'deduper': self.deduper
        }
        self.mq = MessageQueue(mq_dir, self.rpc_server, self.ctx.worker_addr,
                               self.ctx.addrs[:], **kw)
        # register shutdown callback
        self.shutdown_callbacks.append(self.mq.shutdown)
Example #8
class MessageQueueClient(object):
    def __init__(self, nodes, copies=1):
        self.nodes = nodes
        self.hash_ring = HashRing(self.nodes)
        self.copies = max(min(len(self.nodes) - 1, copies), 0)
        self.mq = MessageQueue(nodes, copies=copies)

    def put(self, objs):
        self.mq.put(objs)

    def get(self):
        for n in self.nodes:
            obj = self.mq._get(n)
            if obj is not None:
                return obj
Example #9
class MessageQueueClient(object):
    
    def __init__(self, nodes, copies=1):
        self.nodes = nodes
        self.hash_ring = HashRing(self.nodes)
        self.copies = max(min(len(self.nodes)-1, copies), 0)
        self.mq = MessageQueue(nodes, copies=copies)
        
    def put(self, objs):
        self.mq.put(objs)
        
    def get(self):
        for n in self.nodes:
            obj = self.mq._get(n)
            if obj is not None:
                return obj
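As a usage sketch for the client above: put hands objects to the underlying MessageQueue (presumably distributed via the hash ring), while get polls each node in order and returns the first non-None object. The node addresses below are placeholders.

# Hypothetical usage; node addresses are placeholders.
nodes = ['localhost:11111', 'localhost:11211', 'localhost:11311']
client = MessageQueueClient(nodes, copies=1)
client.put(['obj1', 'obj2'])  # handed to the MessageQueue for distribution
obj = client.get()            # first non-None object found while polling nodes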
Example #10
    def init_mq(self):
        mq_store_dir = os.path.join(self.root, "store")
        mq_backup_dir = os.path.join(self.root, "backup")
        if not os.path.exists(mq_store_dir):
            os.makedirs(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.makedirs(mq_backup_dir)

        self.mq = MessageQueue(self.nodes, self.local, self.rpc_server, copies=self.copies)
        self.mq.init_store(mq_store_dir, mq_backup_dir, verify_exists_hook=self._init_bloom_filter())
Example #11
 def init_mq(self):
     mq_dir = os.path.join(self.working_dir, 'mq')
     copies = self.job_desc.settings.job.copies
     n_priorities = self.job_desc.settings.job.priorities
     
     kw = {'app_name': self.job_name, 'copies': copies, 
           'n_priorities': n_priorities, 'deduper': self.deduper}
     self.mq = MessageQueue(mq_dir, self.rpc_server, self.ctx.worker_addr, 
         self.ctx.addrs[:], **kw)
     # register shutdown callback
     self.shutdown_callbacks.append(self.mq.shutdown)
Example #12
 def testMqProxy(self):
     self.mq = MessageQueue(self.dir_, None, self.addr, self.addrs,
                               copies=0, n_priorities=1)
     self.proxy = MpMessageQueueClient(self.mq.new_connection('0'))
     
     try:
         test_obj = Url('http://qinxuye.me')
         self.proxy.put(test_obj)
         self.assertEqual(self.proxy.get(), test_obj)
     finally:
         self.mq.shutdown()
Example #13
    def init_mq(self,
                nodes,
                local_node,
                loc,
                verify_exists_hook=None,
                copies=1):
        mq_store_dir = os.path.join(loc, 'store')
        mq_backup_dir = os.path.join(loc, 'backup')
        if not os.path.exists(mq_store_dir):
            os.mkdir(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.mkdir(mq_backup_dir)

        # MQ-related setup
        self.mq = MessageQueue(nodes,
                               local_node,
                               self.rpc_server,
                               copies=copies)
        self.mq.init_store(mq_store_dir,
                           mq_backup_dir,
                           verify_exists_hook=verify_exists_hook)
Example #14
    def init_mq(self):
        mq_store_dir = os.path.join(self.root, 'store')
        mq_backup_dir = os.path.join(self.root, 'backup')
        if not os.path.exists(mq_store_dir):
            os.makedirs(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.makedirs(mq_backup_dir)
            
        self.mq = MessageQueue(self.nodes, self.local, self.rpc_server,
            copies=self.copies)
        self.mq.init_store(mq_store_dir, mq_backup_dir,
                           verify_exists_hook=self._init_bloom_filter())

        self.redismq = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)
Example #15
 def setUp(self):
     ports = tuple([random.randint(10000, 30000) for _ in range(3)])
     self.nodes = ['localhost:%s'%port for port in ports]
     self.dirs = [tempfile.mkdtemp() for _ in range(len(ports))]
     self.size = len(ports)
     
     for i in range(self.size):
         setattr(self, 'rpc_server%s'%i, ColaRPCServer(('localhost', ports[i])))
         setattr(self, 'mq%s'%i, 
             MessageQueue(self.dirs[i], getattr(self, 'rpc_server%s'%i), 
                          self.nodes[i], self.nodes[:])
         )
         thd = threading.Thread(target=getattr(self, 'rpc_server%s'%i).serve_forever)
         thd.setDaemon(True)
         thd.start()
         
     self.client = MessageQueueClient(self.nodes)
Example #16
 def init_mq(self, nodes, local_node, loc, 
             verify_exists_hook=None, copies=1):
     mq_store_dir = os.path.join(loc, 'store')
     mq_backup_dir = os.path.join(loc, 'backup')
     if not os.path.exists(mq_store_dir):
         os.mkdir(mq_store_dir)
     if not os.path.exists(mq_backup_dir):
         os.mkdir(mq_backup_dir)
     
     # MQ-related setup
     self.mq = MessageQueue(
         nodes,
         local_node,
         self.rpc_server,
         copies=copies
     )
     self.mq.init_store(mq_store_dir, mq_backup_dir, 
                        verify_exists_hook=verify_exists_hook)
Example #17
    def setUp(self):
        ports = (11111, 11211, 11311)
        self.nodes = ['localhost:%s' % port for port in ports]
        self.dirs = [tempfile.mkdtemp() for _ in range(2 * len(ports))]
        self.size = len(ports)

        for i in range(self.size):
            setattr(self, 'rpc_server%s' % i,
                    ColaRPCServer(('localhost', ports[i])))
            setattr(
                self, 'mq%s' % i,
                MessageQueue(self.nodes[:], self.nodes[i],
                             getattr(self, 'rpc_server%s' % i)))
            getattr(self, 'mq%s' % i).init_store(self.dirs[2 * i],
                                                 self.dirs[2 * i + 1])
            thd = threading.Thread(target=getattr(self, 'rpc_server%s' %
                                                  i).serve_forever)
            thd.setDaemon(True)
            thd.start()

        self.client = MessageQueueClient(self.nodes)
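Example #17 shows only setUp; a matching tearDown is not in the listing, but following the shutdown-then-rmtree pattern of Examples #1 and #2 it might look like this sketch:

    def tearDown(self):
        # Sketch: stop the queues and RPC servers first, then delete the temp dirs.
        try:
            for i in range(self.size):
                getattr(self, 'mq%s' % i).shutdown()
                getattr(self, 'rpc_server%s' % i).shutdown()
        finally:
            for dir_ in self.dirs:
                shutil.rmtree(dir_)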
Example #18
class BasicWorkerJobLoader(JobLoader):
    def __init__(self, job, data_dir, context=None, logger=None,
                 local=None, nodes=None, copies=1, force=False):
        self.job = job
        ctx = context or self.job.context
        
        self.local = local
        if self.local is None:
            host, port = get_ip(), ctx.job.port
            self.local = '%s:%s' % (host, port)
        else:
            host, port = tuple(self.local.split(':', 1))
        self.nodes = nodes
        if self.nodes is None:
            self.nodes = [self.local]
            
        self.logger = logger
        self.info_logger = get_logger(
            name='cola_worker_info_%s'%self.job.real_name)
            
        super(BasicWorkerJobLoader, self).__init__(
            self.job, data_dir, self.local, 
            context=ctx, copies=copies, force=force)
        
        # number of instances that run at the same time
        self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
        # objects currently executing
        self.executings = []
        # count of consecutive exceptions thrown
        self.error_times = 0
        # budget
        self.budget = 0
        
        # counter
        self.pages_size = 0
        
        # lock when not stopped
        self.stop_lock = threading.Lock()
        self.stop_lock.acquire()
        
        self.check()
        # init rpc server
        self.init_rpc_server()
        # init message queue
        self.init_mq()
        
        # register signal
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)
        
        self.rpc_server.register_function(self.stop, name='stop')
        self.rpc_server.register_function(self.add_node, name='add_node')
        self.rpc_server.register_function(self.remove_node, name='remove_node')
        self.rpc_server.register_function(self.run, name='run')
        self.rpc_server.register_function(self.pages, name='pages')
            
    def _init_bloom_filter(self):
        size = self.job.context.job.size
        base = 1 if not self.job.is_bundle else 1000 
        bloom_filter_file = os.path.join(self.root, 'bloomfilter')
        
        if not os.path.exists(bloom_filter_file):
            if size > 0:
                bloom_filter_size = size*10*base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        else:
            if size > 0:
                bloom_filter_size = size*2*base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        return FileBloomFilter(bloom_filter_file, bloom_filter_size)
            
    def init_mq(self):
        mq_store_dir = os.path.join(self.root, 'store')
        mq_backup_dir = os.path.join(self.root, 'backup')
        if not os.path.exists(mq_store_dir):
            os.makedirs(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.makedirs(mq_backup_dir)
            
        self.mq = MessageQueue(self.nodes, self.local, self.rpc_server,
            copies=self.copies)
        self.mq.init_store(mq_store_dir, mq_backup_dir,
                           verify_exists_hook=self._init_bloom_filter())

        self.redismq = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

    def _release_stop_lock(self):
        try:
            self.stop_lock.release()
        except:
            pass
        
    def check(self):
        env_legal = self.check_env(force=self.force)
        if not env_legal:
            raise JobWorkerRunning('A job worker is already running.')
        
    def finish(self):
        if self.logger is not None:
            self.logger.info('Finished visiting pages, count: %s' % self.pages_size)
        self.stopped = True
        self.mq.shutdown()
        try:
            for handler in self.logger.handlers:
                handler.close()
        finally:
            super(BasicWorkerJobLoader, self).finish()
        
    def complete(self, obj):
        if self.logger is not None:
            self.logger.info('Finish %s' % obj)
        if obj in self.executings:
            self.executings.remove(obj)
        
        if self.ctx.job.size <= 0:
            return True
        return False
            
    def error(self, obj):
        if obj in self.executings:
            self.executings.remove(obj)
        
    def stop(self):
        try:
            # self.mq.put(self.executings, force=True)
            self.redismq.rpush(REDIS_UID, *self.executings)
            super(BasicWorkerJobLoader, self).stop()
        finally:
            self._release_stop_lock()
        
    def signal_handler(self, signum, frame):
        self.stop()
        
    def _login(self, opener):
        if self.job.login_hook is not None:
            if 'login' not in self.ctx.job or \
                not isinstance(self.ctx.job.login, list):
                # raise ConfigurationError('If login_hook is set, config files must contain `login`')
                setattr(self.ctx.job, 'login', [])
            # get a new account from redis
            account = json.loads(self.redismq.blpop(REDIS_WEIBO_ACCOUNT)[1])
            self.ctx.job.login.append(account)
            kw = random.choice(self.ctx.job.login)
            login_result = self.job.login_hook(opener, **kw)
            if isinstance(login_result, tuple) and len(login_result) == 2:
                self.logger.error('login failed, reason: %s' % login_result[1])
                return login_result[0]
            elif not login_result:
                self.logger.error('login failed')
            return login_result
        return True
        
    def _log_error(self, obj, err):
        if self.logger is not None:
            self.logger.error('Error when get bundle: %s' % obj)
            self.logger.exception(err)
            
        if self.job.debug:
            raise err
        
    def _require_budget(self, count):
        raise NotImplementedError
    
    def pages(self):
        return self.pages_size
    
    def apply(self):
        raise NotImplementedError
    
    def _execute_bundle(self, obj, opener=None):
        bundle = self.job.unit_cls(obj)
        urls = bundle.urls()
        
        url = None
        try:
            while len(urls) > 0 and not self.stopped:
                url = urls.pop(0)
                self.info_logger.info('get %s url: %s' % (bundle.label, url))

                try:
                    parser_cls, options = self.job.url_patterns.get_parser(url, options=True)
                except TypeError:
                    continue
                if parser_cls is not None:
                    self._require_budget()
                    self.pages_size += 1
                    next_urls, bundles = parser_cls(opener, url, bundle=bundle, logger=self.logger,
                                                    **options).parse()
                    next_urls = list(self.job.url_patterns.matches(next_urls))
                    next_urls.extend(urls)
                    urls = next_urls
                    if bundles:
                        # self.mq.put([str(b) for b in bundles if b.force is False])
                        # self.mq.put([str(b) for b in bundles if b.force is True], force=True)
                        self.redismq.rpush(REDIS_UID, *[str(b) for b in bundles if b.force is False])
                        # self.redismq.rpush(REDIS_KEY, [str(b) for b in bundles if b.force is True])
                    if hasattr(opener, 'close'):
                        opener.close()
            self.error_times = 0
        except LoginFailure as e:
            if not self._login(opener):
                self.error_times += 1
                self._log_error(obj, e)
                self.error(obj)
        except Exception as e:
            self.error_times += 1
            if self.logger is not None and url is not None:
                self.logger.error('Error when fetch url: %s' % url)
            self._log_error(obj, e)
            self.error(obj)
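Example #18 swaps the local MessageQueue hand-off for a Redis list (rpush onto REDIS_UID). A consumer draining that list could look like the sketch below; it reuses the example's own REDIS_HOST, REDIS_PORT, and REDIS_UID names, and the bundle reconstruction via unit_cls is an assumption based on _execute_bundle above.

# Sketch of a consumer for the Redis hand-off in Example #18.
import redis

r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)
while True:
    # blpop blocks until an item is available, returning a (key, value) pair
    _, raw = r.blpop(REDIS_UID)
    bundle = job.unit_cls(raw)  # hypothetical: rebuild the bundle from str(b)
    # ... hand the bundle to a worker here ...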
Example #19
 def __init__(self, nodes, copies=1):
     self.nodes = nodes
     self.hash_ring = HashRing(self.nodes)
     self.copies = max(min(len(self.nodes) - 1, copies), 0)
     self.mq = MessageQueue(nodes, copies=copies)
Example #20
class Job(object):
    def __init__(self,
                 ctx,
                 job_def_path,
                 job_name,
                 job_desc=None,
                 working_dir=None,
                 rpc_server=None,
                 manager=None,
                 job_offset=0):
        self.status = NOTSTARTED
        self.ctx = ctx
        self.shutdown_callbacks = []

        self.stopped = multiprocessing.Event()
        self.nonsuspend = multiprocessing.Event()
        self.nonsuspend.set()

        self.job_def_path = job_def_path
        self.job_name = job_name
        self.working_dir = working_dir or os.path.join(self.ctx.working_dir,
                                                       self.job_name)
        self.logger = get_logger(name='cola_job' + str(time.time()))
        self.job_desc = job_desc or import_job_desc(job_def_path)

        self.settings = self.job_desc.settings
        self.is_bundle = self.settings.job.mode == 'bundle'

        self.rpc_server = rpc_server

        self.n_instances = self.job_desc.settings.job.instances
        self.n_containers = min(get_cpu_count(), max(self.n_instances, 1))
        self.job_offset = job_offset
        self.is_multi_process = self.n_containers > 1
        self.processes = []

        self.idle_statuses = manager.list([False] * self.n_containers)

        self.manager = manager

        if not os.path.exists(self.working_dir):
            os.makedirs(self.working_dir)
        self.inited = False
        self._register_rpc()

    def _register_rpc(self):
        if self.rpc_server:
            self.prefix = get_rpc_prefix(app_name=self.job_name, prefix='job')
            self.rpc_server.register_function(self.shutdown,
                                              name='shutdown',
                                              prefix=self.prefix)
            if self.ctx.is_local_mode:
                self.rpc_server.register_function(lambda: [
                    self.job_name,
                ],
                                                  name='get_jobs')

    def init_deduper(self):
        deduper_cls = import_module(self.settings.job.components.deduper.cls)

        base = 1 if not self.is_bundle else 1000
        size = self.job_desc.settings.job.size
        capacity = UNLIMIT_BLOOM_FILTER_CAPACITY
        if size > 0:
            capacity = max(base * size * 10, capacity)

        params = dict(self.settings.job.components.deduper)
        del params['cls']

        deduper_cls = deduper_cls if not self.is_multi_process \
                        else getattr(self.manager, deduper_cls.__name__)
        self.deduper = deduper_cls(self.working_dir, capacity, **params)
        # register shutdown callback
        self.shutdown_callbacks.append(self.deduper.shutdown)

    def init_mq(self):
        mq_dir = os.path.join(self.working_dir, 'mq')
        copies = self.job_desc.settings.job.copies
        n_priorities = self.job_desc.settings.job.priorities

        kw = {
            'app_name': self.job_name,
            'copies': copies,
            'n_priorities': n_priorities,
            'deduper': self.deduper
        }
        self.mq = MessageQueue(mq_dir, self.rpc_server, self.ctx.worker_addr,
                               self.ctx.addrs[:], **kw)
        # register shutdown callback
        self.shutdown_callbacks.append(self.mq.shutdown)

    def _init_function_servers(self):
        budget_dir = os.path.join(self.working_dir, 'budget')
        budget_cls = BudgetApplyServer if not self.is_multi_process \
                        else self.manager.budget_server
        self.budget_server = budget_cls(budget_dir, self.settings, None,
                                        self.job_name)
        if self.rpc_server:
            BudgetApplyServer.register_rpc(self.budget_server,
                                           self.rpc_server,
                                           app_name=self.job_name)
        self.shutdown_callbacks.append(self.budget_server.shutdown)

        counter_dir = os.path.join(self.working_dir, 'counter')
        counter_cls = CounterServer if not self.is_multi_process \
                        else self.manager.counter_server
        self.counter_server = counter_cls(counter_dir, self.settings, None,
                                          self.job_name)
        if self.rpc_server:
            CounterServer.register_rpc(self.counter_server,
                                       self.rpc_server,
                                       app_name=self.job_name)

        self.shutdown_callbacks.append(self.counter_server.shutdown)

        speed_dir = os.path.join(self.working_dir, 'speed')
        speed_cls = SpeedControlServer if not self.is_multi_process \
                        else self.manager.speed_server
        self.speed_server = speed_cls(speed_dir, self.settings, None,
                                      self.job_name, self.counter_server,
                                      self.ctx.ips)
        if self.rpc_server:
            SpeedControlServer.register_rpc(self.speed_server,
                                            self.rpc_server,
                                            app_name=self.job_name)
        self.shutdown_callbacks.append(self.speed_server.shutdown)

    def init_functions(self):
        if self.ctx.is_local_mode:
            self._init_function_servers()
            self.counter_arg = self.counter_server
            self.budget_arg = self.budget_server
            self.speed_arg = self.speed_server
        else:
            self.counter_arg, self.budget_arg, self.speed_arg = \
                tuple([self.ctx.master_addr for _ in range(3)])

    def init(self):
        if self.inited:
            return

        self.lock_file = os.path.join(self.working_dir, 'lock')

        if os.path.exists(self.lock_file):
            raise JobRunning('The job has already started')
        open(self.lock_file, 'w').close()

        self.init_deduper()
        self.init_mq()
        self.init_functions()

        self.inited = True
        self.status = RUNNING

    def run(self, block=False):
        self.init()
        try:
            self.processes = run_containers(
                self.n_containers,
                self.n_instances,
                self.working_dir,
                self.job_def_path,
                self.job_name,
                self.ctx.env,
                self.mq,
                self.counter_arg,
                self.budget_arg,
                self.speed_arg,
                self.stopped,
                self.nonsuspend,
                self.idle_statuses,
                is_multi_process=self.is_multi_process,
                is_local=self.ctx.is_local_mode,
                master_ip=self.ctx.master_ip,
                offset=self.job_offset)
            if block:
                self.wait_for_stop()
        finally:
            if os.path.exists(self.lock_file):
                os.remove(self.lock_file)

    def wait_for_stop(self):
        [process.join() for process in self.processes]

    def stop_running(self):
        if 'main' not in multiprocessing.current_process().name.lower():
            return

        self.stopped.set()
        self.wait_for_stop()

    def clear_running(self):
        if 'main' not in multiprocessing.current_process().name.lower():
            return

        try:
            # output counters
            if self.ctx.is_local_mode:
                self.logger.debug('Counters during running:')
                self.logger.debug(
                    pprint.pformat(self.counter_server.output(), width=1))
            self.logger.debug('Processing shutting down')

            for cb in self.shutdown_callbacks:
                cb()
            if self.ctx.is_local_mode is True and hasattr(self, 'manager'):
                try:
                    self.manager.shutdown()
                except socket.error:
                    pass
            self.status = FINISHED
            self.logger.debug('Shutdown finished')
        finally:
            if os.path.exists(self.lock_file):
                os.remove(self.lock_file)

    def shutdown(self):
        if 'main' not in multiprocessing.current_process().name.lower():
            return

        try:
            self.stop_running()
        finally:
            self.clear_running()

    def get_status(self):
        if self.ctx.is_local_mode and self.status == RUNNING:
            if self.budget_server.get_status() == ALLFINISHED and \
                self.settings.job.inc is False:
                return FINISHED
            if all(list(self.idle_statuses)):
                return IDLE

        return self.status

    def suspend(self):
        self.nonsuspend.clear()

    def resume(self):
        self.nonsuspend.set()

    def add_node(self, node):
        if hasattr(self, 'mq'):
            self.mq.add_node(node)

    def remove_node(self, node):
        if hasattr(self, 'mq'):
            self.mq.remove_node(node)
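A rough driver for the Job class above; ctx and the job path are placeholders, and a plain multiprocessing.Manager is assumed to satisfy the manager.list() call in __init__.

# Hypothetical driver; ctx and '/path/to/job' are placeholders.
import multiprocessing

manager = multiprocessing.Manager()
job = Job(ctx, '/path/to/job', 'my_job', manager=manager)
try:
    job.run(block=True)  # init() writes the lock file, then runs containers
finally:
    job.shutdown()       # stop_running() + clear_running(); removes the lock file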
Example #21
class JobLoader(object):
    def __init__(self,
                 job,
                 rpc_server,
                 mq=None,
                 logger=None,
                 master=None,
                 context=None):
        self.job = job
        self.rpc_server = rpc_server
        self.mq = mq
        self.master = master
        self.logger = logger

        # stop flag
        self.stopped = False

        self.ctx = context or self.job.context
        self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
        self.size = self.ctx.job.size
        self.budget = 0

        # the unit currently being executed
        self.executing = None

        # register signal
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        rpc_server.register_function(self.stop, name='stop')
        rpc_server.register_function(self.add_node, name='add_node')
        rpc_server.register_function(self.remove_node, name='remove_node')
        rpc_server.register_function(self.run, name='run')

    def init_mq(self,
                nodes,
                local_node,
                loc,
                verify_exists_hook=None,
                copies=1):
        mq_store_dir = os.path.join(loc, 'store')
        mq_backup_dir = os.path.join(loc, 'backup')
        if not os.path.exists(mq_store_dir):
            os.mkdir(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.mkdir(mq_backup_dir)

        # MQ-related setup
        self.mq = MessageQueue(nodes,
                               local_node,
                               self.rpc_server,
                               copies=copies)
        self.mq.init_store(mq_store_dir,
                           mq_backup_dir,
                           verify_exists_hook=verify_exists_hook)

    def stop(self):
        self.stopped = True

        if self.executing is not None:
            self.mq.put(self.executing)

        self.finish()

    def signal_handler(self, signum, frame):
        self.stop()

    def complete(self, obj):
        if self.logger is not None:
            self.logger.info('Finish %s' % obj)

        if self.ctx.job.size <= 0:
            return False

        self.executing = None
        if self.master is not None:
            return client_call(self.master, 'complete', obj)
        else:
            self.size -= 1
            # something to log
            if self.size <= 0:
                self.stopped = True
            return self.stopped

    def finish(self):
        self.mq.shutdown()
        self.stopped = True

    def _require_budget(self):
        if self.master is None or self.ctx.job.limits == 0:
            return

        if self.budget > 0:
            self.budget -= 1
            return

        while self.budget == 0 and not self.stopped:
            self.budget = client_call(self.master, 'require', BUDGET_REQUIRE)

    def _log(self, obj, err):
        if self.logger is not None:
            self.logger.info('Error when get bundle: %s' % obj)
            self.logger.exception(err)

        if self.job.debug:
            raise err

    def _login(self, opener):
        if self.job.login_hook is not None:
            if 'login' not in self.ctx.job or \
                not isinstance(self.ctx.job.login, list):
                raise ConfigurationError(
                    'If login_hook is set, config files must contain `login`')
            kw = random.choice(self.ctx.job.login)
            login_success = self.job.login_hook(opener, **kw)
            if not login_success:
                self.logger.info('login failed')
            return login_success

    def _execute(self, obj, opener=None):
        if opener is None:
            opener = self.job.opener_cls()

        if self.job.is_bundle:
            bundle = self.job.unit_cls(obj)
            urls = bundle.urls()

            try:
                while len(urls) > 0 and not self.stopped:
                    url = urls.pop(0)

                    parser_cls = self.job.url_patterns.get_parser(url)
                    if parser_cls is not None:
                        self._require_budget()
                        next_urls, bundles = parser_cls(opener,
                                                        url,
                                                        bundle=bundle).parse()
                        next_urls = list(
                            self.job.url_patterns.matches(next_urls))
                        next_urls.extend(urls)
                        urls = next_urls
                        if bundles:
                            self.mq.put([str(bundle) for bundle in bundles])
            except LoginFailure:
                if not self._login(opener):
                    return
            except Exception as e:
                self._log(obj, e)

        else:
Example #22
 def __init__(self, nodes, copies=1):
     self.nodes = nodes
     self.hash_ring = HashRing(self.nodes)
     self.copies = max(min(len(self.nodes)-1, copies), 0)
     self.mq = MessageQueue(nodes, copies=copies)
Example #23
class JobLoader(object):
    def __init__(self, job, rpc_server, 
                 mq=None, logger=None, master=None, context=None):
        self.job = job
        self.rpc_server = rpc_server
        self.mq = mq
        self.master = master
        self.logger = logger
        
        # stop flag
        self.stopped = False
        
        self.ctx = context or self.job.context
        self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
        self.size = self.ctx.job.size
        self.budget = 0
        
        # the unit currently being executed
        self.executing = None
        
        # register signal
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)
        
        rpc_server.register_function(self.stop, name='stop')
        rpc_server.register_function(self.add_node, name='add_node')
        rpc_server.register_function(self.remove_node, name='remove_node')
        rpc_server.register_function(self.run, name='run')
        
    def init_mq(self, nodes, local_node, loc, 
                verify_exists_hook=None, copies=1):
        mq_store_dir = os.path.join(loc, 'store')
        mq_backup_dir = os.path.join(loc, 'backup')
        if not os.path.exists(mq_store_dir):
            os.mkdir(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.mkdir(mq_backup_dir)
        
        # MQ-related setup
        self.mq = MessageQueue(
            nodes,
            local_node,
            self.rpc_server,
            copies=copies
        )
        self.mq.init_store(mq_store_dir, mq_backup_dir, 
                           verify_exists_hook=verify_exists_hook)
        
    def stop(self):
        self.stopped = True
        
        if self.executing is not None:
            self.mq.put(self.executing)
        
        self.finish()
        
    def signal_handler(self, signum, frame):
        self.stop()
        
    def complete(self, obj):
        if self.logger is not None:
            self.logger.info('Finish %s' % obj)
        
        if self.ctx.job.size <= 0:
            return False
        
        self.executing = None
        if self.master is not None:
            return client_call(self.master, 'complete', obj)
        else:
            self.size -= 1
            # something to log
            if self.size <= 0:
                self.stopped = True
            return self.stopped
            
    def finish(self):
        self.mq.shutdown()
        self.stopped = True
        
    def _require_budget(self):
        if self.master is None or self.ctx.job.limits == 0:
            return
        
        if self.budget > 0:
            self.budget -= 1
            return
        
        while self.budget == 0 and not self.stopped:
            self.budget = client_call(self.master, 'require', BUDGET_REQUIRE)
            
    def _log(self, obj, err):
        if self.logger is not None:
            self.logger.error('Error when get bundle: %s' % obj)
            self.logger.exception(err)
            
        if self.job.debug:
            raise err
        
    def _login(self, opener):
        if self.job.login_hook is not None:
            if 'login' not in self.ctx.job or \
                not isinstance(self.ctx.job.login, list):
                raise ConfigurationError('If login_hook is set, config files must contain `login`')
            kw = random.choice(self.ctx.job.login)
            login_success = self.job.login_hook(opener, **kw)
            if not login_success:
                self.logger.info('login failed')
            return login_success
        
    def _execute(self, obj, opener=None):
        if opener is None:
            opener = self.job.opener_cls()
            
        if self.job.is_bundle:
            bundle = self.job.unit_cls(obj)
            urls = bundle.urls()
            
            try:
                while len(urls) > 0 and not self.stopped:
                    url = urls.pop(0)
                    self.logger.info('get %s url: %s' % (bundle.label, url))
                    
                    parser_cls = self.job.url_patterns.get_parser(url)
                    if parser_cls is not None:
                        self._require_budget()
                        next_urls, bundles = parser_cls(opener, url, bundle=bundle).parse()
                        next_urls = list(self.job.url_patterns.matches(next_urls))
                        next_urls.extend(urls)
                        urls = next_urls
                        if bundles:
                            self.mq.put([str(b) for b in bundles])
            except LoginFailure:
                if not self._login(opener):
                    return
            except Exception as e:
                self._log(obj, e)
                
        else:
Example #24
class BasicWorkerJobLoader(JobLoader):
    def __init__(self,
                 job,
                 data_dir,
                 context=None,
                 logger=None,
                 local=None,
                 nodes=None,
                 copies=1,
                 force=False):
        self.job = job
        ctx = context or self.job.context

        self.local = local
        if self.local is None:
            host, port = get_ip(), ctx.job.port
            self.local = '%s:%s' % (host, port)
        else:
            host, port = tuple(self.local.split(':', 1))
        self.nodes = nodes
        if self.nodes is None:
            self.nodes = [self.local]

        self.logger = logger
        self.info_logger = get_logger(name='cola_worker_info_%s' %
                                      self.job.real_name)

        super(BasicWorkerJobLoader, self).__init__(self.job,
                                                   data_dir,
                                                   self.local,
                                                   context=ctx,
                                                   copies=copies,
                                                   force=force)

        # number of instances that run at the same time
        self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
        # objects currently executing
        self.executings = []
        # count of consecutive exceptions thrown
        self.error_times = 0
        # budget
        self.budget = 0

        self.check()
        # init rpc server
        self.init_rpc_server()
        # init message queue
        self.init_mq()

        # register signal
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        self.rpc_server.register_function(self.stop, name='stop')
        self.rpc_server.register_function(self.add_node, name='add_node')
        self.rpc_server.register_function(self.remove_node, name='remove_node')
        self.rpc_server.register_function(self.run, name='run')

    def _init_bloom_filter(self):
        size = self.job.context.job.size
        base = 1 if not self.job.is_bundle else 1000
        bloom_filter_file = os.path.join(self.root, 'bloomfilter')

        if not os.path.exists(bloom_filter_file):
            if size > 0:
                bloom_filter_size = size * 10 * base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        else:
            if size > 0:
                bloom_filter_size = size * 2 * base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        return FileBloomFilter(bloom_filter_file, bloom_filter_size)

    def init_mq(self):
        mq_store_dir = os.path.join(self.root, 'store')
        mq_backup_dir = os.path.join(self.root, 'backup')
        if not os.path.exists(mq_store_dir):
            os.makedirs(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.makedirs(mq_backup_dir)

        self.mq = MessageQueue(self.nodes,
                               self.local,
                               self.rpc_server,
                               copies=self.copies)
        self.mq.init_store(mq_store_dir,
                           mq_backup_dir,
                           verify_exists_hook=self._init_bloom_filter())

    def check(self):
        env_legal = self.check_env(force=self.force)
        if not env_legal:
            raise JobWorkerRunning('A job worker is already running.')

    def finish(self):
        self.stopped = True
        self.mq.shutdown()
        try:
            for handler in self.logger.handlers:
                handler.close()
        finally:
            super(BasicWorkerJobLoader, self).finish()

    def complete(self, obj):
        if self.logger is not None:
            self.logger.info('Finish %s' % obj)
        if obj in self.executings:
            self.executings.remove(obj)

        if self.ctx.job.size <= 0:
            return True
        return False

    def error(self, obj):
        if obj in self.executings:
            self.executings.remove(obj)

    def stop(self):
        self.mq.put(self.executings, force=True)
        super(BasicWorkerJobLoader, self).stop()

    def signal_handler(self, signum, frame):
        self.stop()

    def _login(self, opener):
        if self.job.login_hook is not None:
            if 'login' not in self.ctx.job or \
                not isinstance(self.ctx.job.login, list):
                raise ConfigurationError(
                    'If login_hook is set, config files must contain `login`')
            kw = random.choice(self.ctx.job.login)
            login_result = self.job.login_hook(opener, **kw)
            if isinstance(login_result, tuple) and len(login_result) == 2:
                self.logger.error('login failed, reason: %s' % login_result[1])
                return login_result[0]
            elif not login_result:
                self.logger.error('login failed')
            return login_result
        return True

    def _log_error(self, obj, err):
        if self.logger is not None:
            self.logger.error('Error when get bundle: %s' % obj)
            self.logger.exception(err)

        if self.job.debug:
            raise err

    def _require_budget(self, count):
        raise NotImplementedError

    def apply(self):
        raise NotImplementedError

    def _execute_bundle(self, obj, opener=None):
        bundle = self.job.unit_cls(obj)
        urls = bundle.urls()

        url = None
        try:
            while len(urls) > 0 and not self.stopped:
                url = urls.pop(0)
                self.info_logger.info('get %s url: %s' % (bundle.label, url))

                parser_cls, options = self.job.url_patterns.get_parser(
                    url, options=True)
                if parser_cls is not None:
                    self._require_budget()
                    next_urls, bundles = parser_cls(opener,
                                                    url,
                                                    bundle=bundle,
                                                    logger=self.logger,
                                                    **options).parse()
                    next_urls = list(self.job.url_patterns.matches(next_urls))
                    next_urls.extend(urls)
                    urls = next_urls
                    if bundles:
                        self.mq.put(
                            [str(b) for b in bundles if b.force is False])
                        self.mq.put(
                            [str(b) for b in bundles if b.force is True],
                            force=True)
                    if hasattr(opener, 'close'):
                        opener.close()

            self.error_times = 0
        except LoginFailure as e:
            if not self._login(opener):
                self.error_times += 1
                self._log_error(obj, e)
                self.error(obj)
        except Exception as e:
            self.error_times += 1
            if self.logger is not None and url is not None:
                self.logger.error('Error when fetch url: %s' % url)
            self._log_error(obj, e)
            self.error(obj)
Example #25
class Job(object):
    def __init__(self, ctx, job_def_path, job_name, 
                 job_desc=None, working_dir=None, rpc_server=None,
                 manager=None, job_offset=0):
        self.status = NOTSTARTED
        self.ctx = ctx
        self.shutdown_callbacks = []
        
        self.stopped = multiprocessing.Event()
        self.nonsuspend = multiprocessing.Event()
        self.nonsuspend.set()
        
        self.job_def_path = job_def_path
        self.job_name = job_name
        self.working_dir = working_dir or os.path.join(self.ctx.working_dir, 
                                                       self.job_name)
        self.logger = get_logger(name='cola_job'+str(time.time()))
        self.job_desc = job_desc or import_job_desc(job_def_path)
            
        self.settings = self.job_desc.settings
        self.is_bundle = self.settings.job.mode == 'bundle'
                
        self.rpc_server = rpc_server
        
        self.n_instances = self.job_desc.settings.job.instances
        self.n_containers = min(get_cpu_count(), max(self.n_instances, 1))
        self.job_offset = job_offset
        self.is_multi_process = self.n_containers > 1
        self.processes = []
        
        self.idle_statuses = manager.list([False] * self.n_containers)
            
        self.manager = manager
        
        if not os.path.exists(self.working_dir):
            os.makedirs(self.working_dir)
        self.inited = False
        self._register_rpc()
        
    def _register_rpc(self):
        if self.rpc_server:
            self.prefix = get_rpc_prefix(app_name=self.job_name, 
                                         prefix='job')
            self.rpc_server.register_function(self.shutdown, name='shutdown',
                                              prefix=self.prefix)
            if self.ctx.is_local_mode:
                self.rpc_server.register_function(lambda: [self.job_name, ],
                                                  name='get_jobs')
        
    def init_deduper(self):
        deduper_cls = import_module(self.settings.job.components.deduper.cls)
        
        base = 1 if not self.is_bundle else 1000
        size = self.job_desc.settings.job.size
        capacity = UNLIMIT_BLOOM_FILTER_CAPACITY
        if size > 0:
            capacity = max(base * size * 10, capacity)
            
        params = dict(self.settings.job.components.deduper)
        del params['cls']
        
        deduper_cls = deduper_cls if not self.is_multi_process \
                        else getattr(self.manager, deduper_cls.__name__)
        self.deduper = deduper_cls(self.working_dir, capacity, **params)
        # register shutdown callback
        self.shutdown_callbacks.append(self.deduper.shutdown)
        
    def init_mq(self):
        mq_dir = os.path.join(self.working_dir, 'mq')
        copies = self.job_desc.settings.job.copies
        n_priorities = self.job_desc.settings.job.priorities
        
        kw = {'app_name': self.job_name, 'copies': copies, 
              'n_priorities': n_priorities, 'deduper': self.deduper}
        self.mq = MessageQueue(mq_dir, self.rpc_server, self.ctx.worker_addr, 
            self.ctx.addrs[:], **kw)
        # register shutdown callback
        self.shutdown_callbacks.append(self.mq.shutdown)
        
    def _init_function_servers(self):
        budget_dir = os.path.join(self.working_dir, 'budget')
        budget_cls = BudgetApplyServer if not self.is_multi_process \
                        else self.manager.budget_server
        self.budget_server = budget_cls(budget_dir, self.settings, 
                                        None, self.job_name)
        if self.rpc_server:
            BudgetApplyServer.register_rpc(self.budget_server, self.rpc_server, 
                                           app_name=self.job_name)
        self.shutdown_callbacks.append(self.budget_server.shutdown)
        
        counter_dir = os.path.join(self.working_dir, 'counter')
        counter_cls = CounterServer if not self.is_multi_process \
                        else self.manager.counter_server
        self.counter_server = counter_cls(counter_dir, self.settings,
                                          None, self.job_name)
        if self.rpc_server:
            CounterServer.register_rpc(self.counter_server, self.rpc_server, 
                                       app_name=self.job_name)
        
        self.shutdown_callbacks.append(self.counter_server.shutdown)
        
        speed_dir = os.path.join(self.working_dir, 'speed')
        speed_cls = SpeedControlServer if not self.is_multi_process \
                        else self.manager.speed_server
        self.speed_server = speed_cls(speed_dir, self.settings,
                                      None, self.job_name,
                                      self.counter_server, self.ctx.ips)
        if self.rpc_server:
            SpeedControlServer.register_rpc(self.speed_server, self.rpc_server, 
                                            app_name=self.job_name)
        self.shutdown_callbacks.append(self.speed_server.shutdown)
        
    def init_functions(self):
        if self.ctx.is_local_mode:
            self._init_function_servers()
            self.counter_arg = self.counter_server
            self.budget_arg = self.budget_server
            self.speed_arg = self.speed_server
        else:
            self.counter_arg, self.budget_arg, self.speed_arg = \
                tuple([self.ctx.master_addr for _ in range(3)]) 
        
    def init(self):
        if self.inited:
            return
        
        self.lock_file = os.path.join(self.working_dir, 'lock')
        
        if os.path.exists(self.lock_file):
            raise JobRunning('The job has already started')
        open(self.lock_file, 'w').close()
        
        self.init_deduper()
        self.init_mq()
        self.init_functions()
        
        self.inited = True
        self.status = RUNNING
        
    def run(self, block=False):
        self.init()
        try:
            self.processes = run_containers(
                self.n_containers, self.n_instances, self.working_dir, 
                self.job_def_path, self.job_name, self.ctx.env, self.mq,
                self.counter_arg, self.budget_arg, self.speed_arg, 
                self.stopped, self.nonsuspend, self.idle_statuses, 
                is_multi_process=self.is_multi_process,
                is_local=self.ctx.is_local_mode, master_ip=self.ctx.master_ip,
                offset=self.job_offset)
            if block:
                self.wait_for_stop()
        finally:
            if os.path.exists(self.lock_file):
                os.remove(self.lock_file)
            
    def wait_for_stop(self):
        [process.join() for process in self.processes]
        
    def stop_running(self):
        if 'main' not in multiprocessing.current_process().name.lower():
            return
        
        self.stopped.set()
        self.wait_for_stop()
        
    def clear_running(self):
        if 'main' not in multiprocessing.current_process().name.lower():
            return
        
        try:
            # output counters
            if self.ctx.is_local_mode:
                self.logger.debug('Counters during running:')
                self.logger.debug(pprint.pformat(self.counter_server.output(), 
                                                 width=1))
            self.logger.debug('Processing shutting down')
            
            for cb in self.shutdown_callbacks:
                cb()
            if self.ctx.is_local_mode is True and hasattr(self, 'manager'):
                try:
                    self.manager.shutdown()
                except socket.error:
                    pass
            self.status = FINISHED
            self.logger.debug('Shutdown finished')
        finally:
            if os.path.exists(self.lock_file):
                os.remove(self.lock_file)

    def shutdown(self):
        if 'main' not in multiprocessing.current_process().name.lower():
            return
        
        try:
            self.stop_running()
        finally:
            self.clear_running()
                    
    def get_status(self):
        if self.ctx.is_local_mode and self.status == RUNNING:
            if self.budget_server.get_status() == ALLFINISHED and \
                self.settings.job.inc is False:
                return FINISHED
            if all(list(self.idle_statuses)):
                return IDLE
        
        return self.status
            
    def suspend(self):
        self.nonsuspend.clear()
        
    def resume(self):
        self.nonsuspend.set()
        
    def add_node(self, node):
        if hasattr(self, 'mq'):
            self.mq.add_node(node)
            
    def remove_node(self, node):
        if hasattr(self, 'mq'):
            self.mq.remove_node(node)
Example #26
class BasicWorkerJobLoader(JobLoader):
    def __init__(self, job, data_dir, context=None, logger=None, local=None, nodes=None, copies=1, force=False):
        self.job = job
        ctx = context or self.job.context

        self.local = local
        if self.local is None:
            host, port = get_ip(), ctx.job.port
            self.local = "%s:%s" % (host, port)
        else:
            host, port = tuple(self.local.split(":", 1))
        self.nodes = nodes
        if self.nodes is None:
            self.nodes = [self.local]

        self.logger = logger
        self.info_logger = get_logger(name="cola_worker_info_%s" % self.job.real_name)

        super(BasicWorkerJobLoader, self).__init__(
            self.job, data_dir, self.local, context=ctx, copies=copies, force=force
        )

        # number of instances that run at the same time
        self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
        # objects currently executing
        self.executings = []
        # count of consecutive exceptions thrown
        self.error_times = 0
        # budget
        self.budget = 0

        self.check()
        # init rpc server
        self.init_rpc_server()
        # init message queue
        self.init_mq()

        # register signal
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        self.rpc_server.register_function(self.stop, name="stop")
        self.rpc_server.register_function(self.add_node, name="add_node")
        self.rpc_server.register_function(self.remove_node, name="remove_node")
        self.rpc_server.register_function(self.run, name="run")

    def _init_bloom_filter(self):
        size = self.job.context.job.size
        base = 1 if not self.job.is_bundle else 1000
        bloom_filter_file = os.path.join(self.root, "bloomfilter")

        if not os.path.exists(bloom_filter_file):
            if size > 0:
                bloom_filter_size = size * 10 * base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        else:
            if size > 0:
                bloom_filter_size = size * 2 * base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        return FileBloomFilter(bloom_filter_file, bloom_filter_size)

    def init_mq(self):
        mq_store_dir = os.path.join(self.root, "store")
        mq_backup_dir = os.path.join(self.root, "backup")
        if not os.path.exists(mq_store_dir):
            os.makedirs(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.makedirs(mq_backup_dir)

        self.mq = MessageQueue(self.nodes, self.local, self.rpc_server, copies=self.copies)
        self.mq.init_store(mq_store_dir, mq_backup_dir, verify_exists_hook=self._init_bloom_filter())

    def check(self):
        env_legal = self.check_env(force=self.force)
        if not env_legal:
            raise JobWorkerRunning("There has been a running job worker.")

    def finish(self):
        self.stopped = True
        self.mq.shutdown()
        try:
            for handler in self.logger.handlers:
                handler.close()
        finally:
            super(BasicWorkerJobLoader, self).finish()

    def complete(self, obj):
        if self.logger is not None:
            self.logger.info("Finish %s" % obj)
        if obj in self.executings:
            self.executings.remove(obj)

        if self.ctx.job.size <= 0:
            return True
        return False

    def error(self, obj):
        if obj in self.executings:
            self.executings.remove(obj)

    def stop(self):
        self.mq.put(self.executings, force=True)
        super(BasicWorkerJobLoader, self).stop()

    def signal_handler(self, signum, frame):
        self.stop()

    def _login(self, opener):
        if self.job.login_hook is not None:
            if "login" not in self.ctx.job or not isinstance(self.ctx.job.login, list):
                raise ConfigurationError("If login_hook set, config files must contains `login`")
            kw = random.choice(self.ctx.job.login)
            login_success = self.job.login_hook(opener, **kw)
            if not login_success:
                self.logger.info("login fail")
            return login_success
        return True

    def _log_error(self, obj, err):
        if self.logger is not None:
            self.logger.error("Error when get bundle: %s" % obj)
            self.logger.exception(err)

        if self.job.debug:
            raise err

    def _require_budget(self, count):
        raise NotImplementedError

    def apply(self):
        raise NotImplementedError

    def _execute_bundle(self, obj, opener=None):
        bundle = self.job.unit_cls(obj)
        urls = bundle.urls()

        try:
            while len(urls) > 0 and not self.stopped:
                url = urls.pop(0)
                self.info_logger.info("get %s url: %s" % (bundle.label, url))

                parser_cls = self.job.url_patterns.get_parser(url)
                if parser_cls is not None:
                    self._require_budget()
                    next_urls, bundles = parser_cls(opener, url, bundle=bundle).parse()
                    next_urls = list(self.job.url_patterns.matches(next_urls))
                    next_urls.extend(urls)
                    urls = next_urls
                    if bundles:
                        self.mq.put([str(b) for b in bundles])

            self.error_times = 0
        except LoginFailure as e:
            if not self._login(opener):
                self.error_times += 1
                self._log_error(obj, e)
                self.error(obj)
        except Exception as e:
            self.error_times += 1
            self._log_error(obj, e)
            self.error(obj)