class Test(unittest.TestCase):

    def setUp(self):
        self.dir_ = tempfile.mkdtemp()
        self.addr = '127.0.0.1'
        self.addrs = [self.addr, ]

    def tearDown(self):
        try:
            self.mq.shutdown()
        finally:
            shutil.rmtree(self.dir_)

    def testMqProxy(self):
        self.mq = MessageQueue(self.dir_, None, self.addr, self.addrs,
                               copies=0, n_priorities=1)
        self.proxy = MpMessageQueueClient(self.mq.new_connection('0'))
        try:
            test_obj = Url('http://qinxuye.me')
            self.proxy.put(test_obj)
            self.assertEqual(self.proxy.get(), test_obj)
        finally:
            self.mq.shutdown()
def testAddNode(self):
    data = range(100)

    new_port = random.randint(10000, 30000)
    new_node = 'localhost:%s' % new_port
    new_rpc_server = ColaRPCServer(('localhost', new_port))
    thd = threading.Thread(target=new_rpc_server.serve_forever)
    thd.setDaemon(True)
    thd.start()
    new_dir = tempfile.mkdtemp()
    ns = list(self.nodes)
    ns.append(new_node)
    new_mq = MessageQueue(new_dir, new_rpc_server, new_node, ns)

    try:
        self.mq0.add_node(new_node)
        self.mq1.add_node(new_node)
        self.mq2.add_node(new_node)

        self.mq0.put(data)
        self.assertEqual(data, sorted(self.mq0.get(size=100)))
    finally:
        try:
            new_rpc_server.shutdown()
            new_mq.shutdown()
        finally:
            shutil.rmtree(new_dir)
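A symmetric check for node removal is not in the original suite. The sketch below assumes the cluster from the corresponding setUp (mq0 through mq2 and self.nodes) and assumes that remove_node redistributes the departing node's pending messages onto the surviving nodes, mirroring add_node; treat it as illustrative only.

def testRemoveNode(self):
    # hedged sketch, not from the original tests: assumes remove_node
    # rebalances messages back onto the remaining nodes
    data = range(100)
    self.mq0.put(data)

    removed = self.nodes[2]
    self.mq0.remove_node(removed)
    self.mq1.remove_node(removed)

    self.assertEqual(data, sorted(self.mq0.get(size=100)))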
class Test(unittest.TestCase):

    def setUp(self):
        self.dir_ = tempfile.mkdtemp()
        self.addr = '127.0.0.1'
        self.addrs = [self.addr, ]

    def tearDown(self):
        try:
            self.mq.shutdown()
        finally:
            shutil.rmtree(self.dir_)

    def testMqProxy(self):
        self.mq = MessageQueue(self.dir_, None, self.addr, self.addrs,
                               copies=0, n_priorities=1)
        self.proxy = MpMessageQueueClient(self.mq.new_connection('0'))
        try:
            test_obj = Url(u'http://qinxuye.me/三星')
            self.proxy.put(test_obj)
            self.assertEqual(self.proxy.get(), test_obj)

            test_obj = u'三星'
            self.proxy.put(test_obj)
            self.assertEqual(self.proxy.get(), test_obj)
        finally:
            self.mq.shutdown()
def init_mq(self):
    mq_store_dir = os.path.join(self.root, 'store')
    mq_backup_dir = os.path.join(self.root, 'backup')
    if not os.path.exists(mq_store_dir):
        os.makedirs(mq_store_dir)
    if not os.path.exists(mq_backup_dir):
        os.makedirs(mq_backup_dir)

    self.mq = MessageQueue(self.nodes, self.local, self.rpc_server,
                           copies=self.copies)
    self.mq.init_store(mq_store_dir, mq_backup_dir,
                       verify_exists_hook=self._init_bloom_filter())
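For tests it can help to swap the file-backed bloom filter for an in-memory stand-in. The sketch below is assumption-heavy: the exact interface FileBloomFilter exposes is not shown in these snippets, so the verify/sync contract here is guessed and labeled as such.

class SetDedupHook(object):
    """Hypothetical in-memory replacement for FileBloomFilter.

    Assumes the store calls a verify(obj)-style membership check that
    returns True for already-seen objects and records new ones; the
    real hook's interface may differ.
    """

    def __init__(self):
        self._seen = set()

    def verify(self, obj):
        if obj in self._seen:
            return True
        self._seen.add(obj)
        return False

    def sync(self):
        # FileBloomFilter persists to disk; nothing to do in memory
        pass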
def init_mq(self):
    mq_dir = os.path.join(self.working_dir, 'mq')
    copies = self.job_desc.settings.job.copies
    n_priorities = self.job_desc.settings.job.priorities
    kw = {'app_name': self.job_name, 'copies': copies,
          'n_priorities': n_priorities, 'deduper': self.deduper}

    self.mq = MessageQueue(mq_dir, self.rpc_server, self.ctx.worker_addr,
                           self.ctx.addrs[:], **kw)
    # register shutdown callback
    self.shutdown_callbacks.append(self.mq.shutdown)
class MessageQueueClient(object):

    def __init__(self, nodes, copies=1):
        self.nodes = nodes
        self.hash_ring = HashRing(self.nodes)
        # clamp copies to the range [0, len(nodes) - 1]
        self.copies = max(min(len(self.nodes) - 1, copies), 0)
        # note: the unclamped `copies` argument is forwarded here
        self.mq = MessageQueue(nodes, copies=copies)

    def put(self, objs):
        self.mq.put(objs)

    def get(self):
        for n in self.nodes:
            obj = self.mq._get(n)
            if obj is not None:
                return obj
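A put/get round trip through this client, assuming the listed nodes already run MessageQueue instances behind live ColaRPCServers (as in the test setUp snippets in this collection); the addresses are placeholders.

# hedged usage sketch against live mq nodes
client = MessageQueueClient(['localhost:11111', 'localhost:11211'])
client.put(['obj1', 'obj2'])
obj = client.get()  # walks the nodes until one returns a message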
def init_mq(self, nodes, local_node, loc, verify_exists_hook=None,
            copies=1):
    mq_store_dir = os.path.join(loc, 'store')
    mq_backup_dir = os.path.join(loc, 'backup')
    if not os.path.exists(mq_store_dir):
        os.mkdir(mq_store_dir)
    if not os.path.exists(mq_backup_dir):
        os.mkdir(mq_backup_dir)

    # MQ-related setup
    self.mq = MessageQueue(nodes, local_node, self.rpc_server,
                           copies=copies)
    self.mq.init_store(mq_store_dir, mq_backup_dir,
                       verify_exists_hook=verify_exists_hook)
def init_mq(self):
    mq_store_dir = os.path.join(self.root, 'store')
    mq_backup_dir = os.path.join(self.root, 'backup')
    if not os.path.exists(mq_store_dir):
        os.makedirs(mq_store_dir)
    if not os.path.exists(mq_backup_dir):
        os.makedirs(mq_backup_dir)

    self.mq = MessageQueue(self.nodes, self.local, self.rpc_server,
                           copies=self.copies)
    self.mq.init_store(mq_store_dir, mq_backup_dir,
                       verify_exists_hook=self._init_bloom_filter())
    self.redismq = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)
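The redis handle gives this loader a second, shared queue alongside the local mq; the worker snippets later in this collection push in-flight bundles with rpush and block on blpop. The round trip is plain redis-py, shown here with the same REDIS_HOST/REDIS_PORT/REDIS_UID constants the loader uses:

import redis

# hedged sketch of the enqueue/dequeue pattern the loader relies on
r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)
r.rpush(REDIS_UID, 'bundle-1', 'bundle-2')  # enqueue, as stop() does
key, value = r.blpop(REDIS_UID)             # blocking pop; returns (key, value)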
def setUp(self):
    ports = tuple([random.randint(10000, 30000) for _ in range(3)])
    self.nodes = ['localhost:%s' % port for port in ports]
    self.dirs = [tempfile.mkdtemp() for _ in range(len(ports))]
    self.size = len(ports)

    for i in range(self.size):
        setattr(self, 'rpc_server%s' % i,
                ColaRPCServer(('localhost', ports[i])))
        setattr(self, 'mq%s' % i,
                MessageQueue(self.dirs[i],
                             getattr(self, 'rpc_server%s' % i),
                             self.nodes[i], self.nodes[:]))
        thd = threading.Thread(
            target=getattr(self, 'rpc_server%s' % i).serve_forever)
        thd.setDaemon(True)
        thd.start()

    self.client = MessageQueueClient(self.nodes)
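This setUp leaves rpc servers and temp directories behind; a matching tearDown, sketched on the assumption that shutdown order does not matter here, could be:

def tearDown(self):
    # hedged sketch of the cleanup this setUp implies
    for i in range(self.size):
        try:
            getattr(self, 'mq%s' % i).shutdown()
        finally:
            getattr(self, 'rpc_server%s' % i).shutdown()
    for d in self.dirs:
        shutil.rmtree(d)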
def setUp(self):
    ports = (11111, 11211, 11311)
    self.nodes = ['localhost:%s' % port for port in ports]
    self.dirs = [tempfile.mkdtemp() for _ in range(2 * len(ports))]
    self.size = len(ports)

    for i in range(self.size):
        setattr(self, 'rpc_server%s' % i,
                ColaRPCServer(('localhost', ports[i])))
        setattr(self, 'mq%s' % i,
                MessageQueue(self.nodes[:], self.nodes[i],
                             getattr(self, 'rpc_server%s' % i)))
        getattr(self, 'mq%s' % i).init_store(self.dirs[2 * i],
                                             self.dirs[2 * i + 1])
        thd = threading.Thread(
            target=getattr(self, 'rpc_server%s' % i).serve_forever)
        thd.setDaemon(True)
        thd.start()

    self.client = MessageQueueClient(self.nodes)
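With the three-node cluster from this setUp, a round-trip test through the client might look like the following sketch; the method and the drain-by-repeated-get pattern are assumptions, not part of the original suite.

def testClientPutGet(self):
    # hedged sketch: put through the client, then drain one get at a time
    data = range(50)
    self.client.put(data)
    got = [self.client.get() for _ in range(len(data))]
    self.assertEqual(sorted(data), sorted(got))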
class BasicWorkerJobLoader(JobLoader):

    def __init__(self, job, data_dir, context=None, logger=None,
                 local=None, nodes=None, copies=1, force=False):
        self.job = job
        ctx = context or self.job.context

        self.local = local
        if self.local is None:
            host, port = get_ip(), ctx.job.port
            self.local = '%s:%s' % (host, port)
        else:
            host, port = tuple(self.local.split(':', 1))
        self.nodes = nodes
        if self.nodes is None:
            self.nodes = [self.local]

        self.logger = logger
        self.info_logger = get_logger(
            name='cola_worker_info_%s' % self.job.real_name)

        super(BasicWorkerJobLoader, self).__init__(
            self.job, data_dir, self.local, context=ctx,
            copies=copies, force=force)

        # number of instances that run at the same time
        self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
        # objects currently being executed
        self.executings = []
        # number of consecutive exceptions thrown
        self.error_times = 0
        # budget
        self.budget = 0
        # counter
        self.pages_size = 0

        # lock held while not stopped
        self.stop_lock = threading.Lock()
        self.stop_lock.acquire()

        self.check()
        # init rpc server
        self.init_rpc_server()
        # init message queue
        self.init_mq()

        # register signals
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        self.rpc_server.register_function(self.stop, name='stop')
        self.rpc_server.register_function(self.add_node, name='add_node')
        self.rpc_server.register_function(self.remove_node, name='remove_node')
        self.rpc_server.register_function(self.run, name='run')
        self.rpc_server.register_function(self.pages, name='pages')

    def _init_bloom_filter(self):
        size = self.job.context.job.size
        base = 1 if not self.job.is_bundle else 1000
        bloom_filter_file = os.path.join(self.root, 'bloomfilter')
        if not os.path.exists(bloom_filter_file):
            if size > 0:
                bloom_filter_size = size * 10 * base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        else:
            if size > 0:
                bloom_filter_size = size * 2 * base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        return FileBloomFilter(bloom_filter_file, bloom_filter_size)

    def init_mq(self):
        mq_store_dir = os.path.join(self.root, 'store')
        mq_backup_dir = os.path.join(self.root, 'backup')
        if not os.path.exists(mq_store_dir):
            os.makedirs(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.makedirs(mq_backup_dir)

        self.mq = MessageQueue(self.nodes, self.local, self.rpc_server,
                               copies=self.copies)
        self.mq.init_store(mq_store_dir, mq_backup_dir,
                           verify_exists_hook=self._init_bloom_filter())
        self.redismq = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

    def _release_stop_lock(self):
        try:
            self.stop_lock.release()
        except:
            pass

    def check(self):
        env_legal = self.check_env(force=self.force)
        if not env_legal:
            raise JobWorkerRunning('There is already a running job worker.')

    def finish(self):
        if self.logger is not None:
            self.logger.info('Finished; visited pages count: %s'
                             % self.pages_size)
        self.stopped = True
        self.mq.shutdown()
        try:
            for handler in self.logger.handlers:
                handler.close()
        finally:
            super(BasicWorkerJobLoader, self).finish()

    def complete(self, obj):
        if self.logger is not None:
            self.logger.info('Finish %s' % obj)
        if obj in self.executings:
            self.executings.remove(obj)
        if self.ctx.job.size <= 0:
            return True
        return False

    def error(self, obj):
        if obj in self.executings:
            self.executings.remove(obj)

    def stop(self):
        try:
            # self.mq.put(self.executings, force=True)
            # rpush with no values raises in redis-py, so guard the empty case
            if self.executings:
                self.redismq.rpush(REDIS_UID, *self.executings)
            super(BasicWorkerJobLoader, self).stop()
        finally:
            self._release_stop_lock()

    def signal_handler(self, signum, frame):
        self.stop()

    def _login(self, opener):
        if self.job.login_hook is not None:
            if 'login' not in self.ctx.job or \
                    not isinstance(self.ctx.job.login, list):
                # raise ConfigurationError('If login_hook is set, config files must contain `login`')
                setattr(self.ctx.job, 'login', [])
            # get a new account from redis
            account = json.loads(self.redismq.blpop(REDIS_WEIBO_ACCOUNT)[1])
            self.ctx.job.login.append(account)

            kw = random.choice(self.ctx.job.login)
            login_result = self.job.login_hook(opener, **kw)
            if isinstance(login_result, tuple) and len(login_result) == 2:
                self.logger.error('login failed, reason: %s' % login_result[1])
                return login_result[0]
            elif not login_result:
                self.logger.error('login failed')
                return login_result
        return True

    def _log_error(self, obj, err):
        if self.logger is not None:
            self.logger.error('Error when getting bundle: %s' % obj)
            self.logger.exception(err)
        if self.job.debug:
            raise err

    def _require_budget(self, count):
        raise NotImplementedError

    def pages(self):
        return self.pages_size

    def apply(self):
        raise NotImplementedError

    def _execute_bundle(self, obj, opener=None):
        bundle = self.job.unit_cls(obj)
        urls = bundle.urls()

        url = None
        try:
            while len(urls) > 0 and not self.stopped:
                url = urls.pop(0)
                self.info_logger.info('get %s url: %s' % (bundle.label, url))
                try:
                    parser_cls, options = self.job.url_patterns.get_parser(
                        url, options=True)
                except TypeError:
                    continue
                if parser_cls is not None:
                    self._require_budget()
                    self.pages_size += 1
                    next_urls, bundles = parser_cls(
                        opener, url, bundle=bundle,
                        logger=self.logger, **options).parse()
                    next_urls = list(self.job.url_patterns.matches(next_urls))
                    next_urls.extend(urls)
                    urls = next_urls
                    if bundles:
                        # self.mq.put([str(b) for b in bundles if b.force is False])
                        # self.mq.put([str(b) for b in bundles if b.force is True], force=True)
                        not_forced = [str(b) for b in bundles
                                      if b.force is False]
                        if not_forced:
                            self.redismq.rpush(REDIS_UID, *not_forced)
                        # self.redismq.rpush(REDIS_KEY, [str(b) for b in bundles if b.force is True])
            if hasattr(opener, 'close'):
                opener.close()
            self.error_times = 0
        except LoginFailure, e:
            if not self._login(opener):
                self.error_times += 1
                self._log_error(obj, e)
                self.error(obj)
        except Exception, e:
            self.error_times += 1
            if self.logger is not None and url is not None:
                self.logger.error('Error when fetching url: %s' % url)
            self._log_error(obj, e)
            self.error(obj)
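The _login above expects accounts to be waiting in redis as JSON; seeding them is the producer side of that blpop. A sketch, with hypothetical field names (they must match whatever keyword arguments the job's login_hook accepts):

import json
import redis

r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)
# field names are placeholders; align them with the login_hook signature
r.rpush(REDIS_WEIBO_ACCOUNT,
        json.dumps({'username': 'someone', 'passwd': 'secret'}))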
class Job(object):

    def __init__(self, ctx, job_def_path, job_name, job_desc=None,
                 working_dir=None, rpc_server=None, manager=None,
                 job_offset=0):
        self.status = NOTSTARTED
        self.ctx = ctx
        self.shutdown_callbacks = []

        self.stopped = multiprocessing.Event()
        self.nonsuspend = multiprocessing.Event()
        self.nonsuspend.set()

        self.job_def_path = job_def_path
        self.job_name = job_name
        self.working_dir = working_dir or os.path.join(self.ctx.working_dir,
                                                       self.job_name)
        self.logger = get_logger(name='cola_job' + str(time.time()))

        self.job_desc = job_desc or import_job_desc(job_def_path)
        self.settings = self.job_desc.settings
        self.is_bundle = self.settings.job.mode == 'bundle'

        self.rpc_server = rpc_server

        self.n_instances = self.job_desc.settings.job.instances
        self.n_containers = min(get_cpu_count(), max(self.n_instances, 1))
        self.job_offset = job_offset
        self.is_multi_process = self.n_containers > 1
        self.processes = []

        self.idle_statuses = manager.list([False] * self.n_containers)
        self.manager = manager

        if not os.path.exists(self.working_dir):
            os.makedirs(self.working_dir)

        self.inited = False
        self._register_rpc()

    def _register_rpc(self):
        if self.rpc_server:
            self.prefix = get_rpc_prefix(app_name=self.job_name,
                                         prefix='job')
            self.rpc_server.register_function(self.shutdown, name='shutdown',
                                              prefix=self.prefix)
            if self.ctx.is_local_mode:
                self.rpc_server.register_function(
                    lambda: [self.job_name, ], name='get_jobs')

    def init_deduper(self):
        deduper_cls = import_module(self.settings.job.components.deduper.cls)

        base = 1 if not self.is_bundle else 1000
        size = self.job_desc.settings.job.size
        capacity = UNLIMIT_BLOOM_FILTER_CAPACITY
        if size > 0:
            capacity = max(base * size * 10, capacity)

        params = dict(self.settings.job.components.deduper)
        del params['cls']

        deduper_cls = deduper_cls if not self.is_multi_process \
            else getattr(self.manager, deduper_cls.__name__)
        self.deduper = deduper_cls(self.working_dir, capacity, **params)

        # register shutdown callback
        self.shutdown_callbacks.append(self.deduper.shutdown)

    def init_mq(self):
        mq_dir = os.path.join(self.working_dir, 'mq')
        copies = self.job_desc.settings.job.copies
        n_priorities = self.job_desc.settings.job.priorities
        kw = {'app_name': self.job_name, 'copies': copies,
              'n_priorities': n_priorities, 'deduper': self.deduper}

        self.mq = MessageQueue(mq_dir, self.rpc_server,
                               self.ctx.worker_addr, self.ctx.addrs[:],
                               **kw)
        # register shutdown callback
        self.shutdown_callbacks.append(self.mq.shutdown)

    def _init_function_servers(self):
        budget_dir = os.path.join(self.working_dir, 'budget')
        budget_cls = BudgetApplyServer if not self.is_multi_process \
            else self.manager.budget_server
        self.budget_server = budget_cls(budget_dir, self.settings,
                                        None, self.job_name)
        if self.rpc_server:
            BudgetApplyServer.register_rpc(self.budget_server,
                                           self.rpc_server,
                                           app_name=self.job_name)
        self.shutdown_callbacks.append(self.budget_server.shutdown)

        counter_dir = os.path.join(self.working_dir, 'counter')
        counter_cls = CounterServer if not self.is_multi_process \
            else self.manager.counter_server
        self.counter_server = counter_cls(counter_dir, self.settings,
                                          None, self.job_name)
        if self.rpc_server:
            CounterServer.register_rpc(self.counter_server,
                                       self.rpc_server,
                                       app_name=self.job_name)
        self.shutdown_callbacks.append(self.counter_server.shutdown)

        speed_dir = os.path.join(self.working_dir, 'speed')
        speed_cls = SpeedControlServer if not self.is_multi_process \
            else self.manager.speed_server
        self.speed_server = speed_cls(speed_dir, self.settings,
                                      None, self.job_name,
                                      self.counter_server, self.ctx.ips)
        if self.rpc_server:
            SpeedControlServer.register_rpc(self.speed_server,
                                            self.rpc_server,
                                            app_name=self.job_name)
        self.shutdown_callbacks.append(self.speed_server.shutdown)

    def init_functions(self):
        if self.ctx.is_local_mode:
            self._init_function_servers()
            self.counter_arg = self.counter_server
            self.budget_arg = self.budget_server
            self.speed_arg = self.speed_server
        else:
            self.counter_arg, self.budget_arg, self.speed_arg = \
                tuple([self.ctx.master_addr for _ in range(3)])

    def init(self):
        if self.inited:
            return

        self.lock_file = os.path.join(self.working_dir, 'lock')
        if os.path.exists(self.lock_file):
            raise JobRunning('The job has already started')
        open(self.lock_file, 'w').close()

        self.init_deduper()
        self.init_mq()
        self.init_functions()

        self.inited = True
        self.status = RUNNING

    def run(self, block=False):
        self.init()
        try:
            self.processes = run_containers(
                self.n_containers, self.n_instances, self.working_dir,
                self.job_def_path, self.job_name, self.ctx.env, self.mq,
                self.counter_arg, self.budget_arg, self.speed_arg,
                self.stopped, self.nonsuspend, self.idle_statuses,
                is_multi_process=self.is_multi_process,
                is_local=self.ctx.is_local_mode,
                master_ip=self.ctx.master_ip,
                offset=self.job_offset)
            if block:
                self.wait_for_stop()
        finally:
            if os.path.exists(self.lock_file):
                os.remove(self.lock_file)

    def wait_for_stop(self):
        [process.join() for process in self.processes]

    def stop_running(self):
        if 'main' not in multiprocessing.current_process().name.lower():
            return

        self.stopped.set()
        self.wait_for_stop()

    def clear_running(self):
        if 'main' not in multiprocessing.current_process().name.lower():
            return

        try:
            # output counters
            if self.ctx.is_local_mode:
                self.logger.debug('Counters during running:')
                self.logger.debug(
                    pprint.pformat(self.counter_server.output(), width=1))

            self.logger.debug('Processing shutting down')
            for cb in self.shutdown_callbacks:
                cb()
            if self.ctx.is_local_mode is True and hasattr(self, 'manager'):
                try:
                    self.manager.shutdown()
                except socket.error:
                    pass
            self.status = FINISHED
            self.logger.debug('Shutdown finished')
        finally:
            if os.path.exists(self.lock_file):
                os.remove(self.lock_file)

    def shutdown(self):
        if 'main' not in multiprocessing.current_process().name.lower():
            return

        try:
            self.stop_running()
        finally:
            self.clear_running()

    def get_status(self):
        if self.ctx.is_local_mode and self.status == RUNNING:
            if self.budget_server.get_status() == ALLFINISHED and \
                    self.settings.job.inc is False:
                return FINISHED
            if all(list(self.idle_statuses)):
                return IDLE
        return self.status

    def suspend(self):
        self.nonsuspend.clear()

    def resume(self):
        self.nonsuspend.set()

    def add_node(self, node):
        if hasattr(self, 'mq'):
            self.mq.add_node(node)

    def remove_node(self, node):
        if hasattr(self, 'mq'):
            self.mq.remove_node(node)
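End to end, a local run of this class reduces to construct, run, shutdown. A minimal sketch, assuming a context object like the one the framework normally supplies; the job path and name are placeholders, and a plain multiprocessing Manager only covers the single-process case (the framework's own manager registers the extra server types used when is_multi_process is True):

# hedged usage sketch: `ctx` comes from the surrounding framework
manager = multiprocessing.Manager()
job = Job(ctx, '/path/to/job_dir', 'my_job', manager=manager)
try:
    job.run(block=True)
finally:
    job.shutdown()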
class JobLoader(object):

    def __init__(self, job, rpc_server, mq=None,
                 logger=None, master=None, context=None):
        self.job = job
        self.rpc_server = rpc_server
        self.mq = mq
        self.master = master
        self.logger = logger

        # whether the loader has stopped
        self.stopped = False

        self.ctx = context or self.job.context
        self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
        self.size = self.ctx.job.size
        self.budget = 0

        # the unit currently being executed
        self.executing = None

        # register signals
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        rpc_server.register_function(self.stop, name='stop')
        rpc_server.register_function(self.add_node, name='add_node')
        rpc_server.register_function(self.remove_node, name='remove_node')
        rpc_server.register_function(self.run, name='run')

    def init_mq(self, nodes, local_node, loc, verify_exists_hook=None,
                copies=1):
        mq_store_dir = os.path.join(loc, 'store')
        mq_backup_dir = os.path.join(loc, 'backup')
        if not os.path.exists(mq_store_dir):
            os.mkdir(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.mkdir(mq_backup_dir)

        # MQ-related setup
        self.mq = MessageQueue(nodes, local_node, self.rpc_server,
                               copies=copies)
        self.mq.init_store(mq_store_dir, mq_backup_dir,
                           verify_exists_hook=verify_exists_hook)

    def stop(self):
        self.stopped = True
        if self.executing is not None:
            self.mq.put(self.executing)
        self.finish()

    def signal_handler(self, signum, frame):
        self.stop()

    def complete(self, obj):
        if self.logger is not None:
            self.logger.info('Finish %s' % obj)

        if self.ctx.job.size <= 0:
            return False

        self.executing = None
        if self.master is not None:
            return client_call(self.master, 'complete', obj)
        else:
            self.size -= 1
            # something to log
            if self.size <= 0:
                self.stopped = True
            return self.stopped

    def finish(self):
        self.mq.shutdown()
        self.stopped = True

    def _require_budget(self):
        if self.master is None or self.ctx.job.limits == 0:
            return

        if self.budget > 0:
            self.budget -= 1
            return

        while self.budget == 0 and not self.stopped:
            self.budget = client_call(self.master, 'require', BUDGET_REQUIRE)

    def _log(self, obj, err):
        if self.logger is not None:
            # log at error level: this reports a failed bundle fetch
            self.logger.error('Error when getting bundle: %s' % obj)
            self.logger.exception(err)
        if self.job.debug:
            raise err

    def _login(self, opener):
        if self.job.login_hook is not None:
            if 'login' not in self.ctx.job or \
                    not isinstance(self.ctx.job.login, list):
                raise ConfigurationError(
                    'If login_hook is set, config files must contain `login`')
            kw = random.choice(self.ctx.job.login)
            login_success = self.job.login_hook(opener, **kw)
            if not login_success:
                self.logger.info('login failed')
            return login_success

    def _execute(self, obj, opener=None):
        if opener is None:
            opener = self.job.opener_cls()
        if self.job.is_bundle:
            bundle = self.job.unit_cls(obj)
            urls = bundle.urls()
            try:
                while len(urls) > 0 and not self.stopped:
                    url = urls.pop(0)
                    parser_cls = self.job.url_patterns.get_parser(url)
                    if parser_cls is not None:
                        self._require_budget()
                        next_urls, bundles = parser_cls(
                            opener, url, bundle=bundle).parse()
                        next_urls = list(
                            self.job.url_patterns.matches(next_urls))
                        next_urls.extend(urls)
                        urls = next_urls
                        if bundles:
                            self.mq.put([str(bundle) for bundle in bundles])
            except LoginFailure:
                if not self._login(opener):
                    return
            except Exception, e:
                self._log(obj, e)
        else:
class JobLoader(object):

    def __init__(self, job, rpc_server, mq=None,
                 logger=None, master=None, context=None):
        self.job = job
        self.rpc_server = rpc_server
        self.mq = mq
        self.master = master
        self.logger = logger

        # whether the loader has stopped
        self.stopped = False

        self.ctx = context or self.job.context
        self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
        self.size = self.ctx.job.size
        self.budget = 0

        # the unit currently being executed
        self.executing = None

        # register signals
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        rpc_server.register_function(self.stop, name='stop')
        rpc_server.register_function(self.add_node, name='add_node')
        rpc_server.register_function(self.remove_node, name='remove_node')
        rpc_server.register_function(self.run, name='run')

    def init_mq(self, nodes, local_node, loc, verify_exists_hook=None,
                copies=1):
        mq_store_dir = os.path.join(loc, 'store')
        mq_backup_dir = os.path.join(loc, 'backup')
        if not os.path.exists(mq_store_dir):
            os.mkdir(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.mkdir(mq_backup_dir)

        # MQ-related setup
        self.mq = MessageQueue(nodes, local_node, self.rpc_server,
                               copies=copies)
        self.mq.init_store(mq_store_dir, mq_backup_dir,
                           verify_exists_hook=verify_exists_hook)

    def stop(self):
        self.stopped = True
        if self.executing is not None:
            self.mq.put(self.executing)
        self.finish()

    def signal_handler(self, signum, frame):
        self.stop()

    def complete(self, obj):
        if self.logger is not None:
            self.logger.info('Finish %s' % obj)

        if self.ctx.job.size <= 0:
            return False

        self.executing = None
        if self.master is not None:
            return client_call(self.master, 'complete', obj)
        else:
            self.size -= 1
            # something to log
            if self.size <= 0:
                self.stopped = True
            return self.stopped

    def finish(self):
        self.mq.shutdown()
        self.stopped = True

    def _require_budget(self):
        if self.master is None or self.ctx.job.limits == 0:
            return

        if self.budget > 0:
            self.budget -= 1
            return

        while self.budget == 0 and not self.stopped:
            self.budget = client_call(self.master, 'require', BUDGET_REQUIRE)

    def _log(self, obj, err):
        if self.logger is not None:
            self.logger.error('Error when getting bundle: %s' % obj)
            self.logger.exception(err)
        if self.job.debug:
            raise err

    def _login(self, opener):
        if self.job.login_hook is not None:
            if 'login' not in self.ctx.job or \
                    not isinstance(self.ctx.job.login, list):
                raise ConfigurationError(
                    'If login_hook is set, config files must contain `login`')
            kw = random.choice(self.ctx.job.login)
            login_success = self.job.login_hook(opener, **kw)
            if not login_success:
                self.logger.info('login failed')
            return login_success

    def _execute(self, obj, opener=None):
        if opener is None:
            opener = self.job.opener_cls()
        if self.job.is_bundle:
            bundle = self.job.unit_cls(obj)
            urls = bundle.urls()
            try:
                while len(urls) > 0 and not self.stopped:
                    url = urls.pop(0)
                    self.logger.info('get %s url: %s' % (bundle.label, url))
                    parser_cls = self.job.url_patterns.get_parser(url)
                    if parser_cls is not None:
                        self._require_budget()
                        next_urls, bundles = parser_cls(
                            opener, url, bundle=bundle).parse()
                        next_urls = list(
                            self.job.url_patterns.matches(next_urls))
                        next_urls.extend(urls)
                        urls = next_urls
                        if bundles:
                            self.mq.put([str(b) for b in bundles])
            except LoginFailure:
                if not self._login(opener):
                    return
            except Exception, e:
                self._log(obj, e)
        else:
class BasicWorkerJobLoader(JobLoader):

    def __init__(self, job, data_dir, context=None, logger=None,
                 local=None, nodes=None, copies=1, force=False):
        self.job = job
        ctx = context or self.job.context

        self.local = local
        if self.local is None:
            host, port = get_ip(), ctx.job.port
            self.local = '%s:%s' % (host, port)
        else:
            host, port = tuple(self.local.split(':', 1))
        self.nodes = nodes
        if self.nodes is None:
            self.nodes = [self.local]

        self.logger = logger
        self.info_logger = get_logger(
            name='cola_worker_info_%s' % self.job.real_name)

        super(BasicWorkerJobLoader, self).__init__(
            self.job, data_dir, self.local, context=ctx,
            copies=copies, force=force)

        # number of instances that run at the same time
        self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
        # objects currently being executed
        self.executings = []
        # number of consecutive exceptions thrown
        self.error_times = 0
        # budget
        self.budget = 0

        self.check()
        # init rpc server
        self.init_rpc_server()
        # init message queue
        self.init_mq()

        # register signals
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        self.rpc_server.register_function(self.stop, name='stop')
        self.rpc_server.register_function(self.add_node, name='add_node')
        self.rpc_server.register_function(self.remove_node, name='remove_node')
        self.rpc_server.register_function(self.run, name='run')

    def _init_bloom_filter(self):
        size = self.job.context.job.size
        base = 1 if not self.job.is_bundle else 1000
        bloom_filter_file = os.path.join(self.root, 'bloomfilter')
        if not os.path.exists(bloom_filter_file):
            if size > 0:
                bloom_filter_size = size * 10 * base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        else:
            if size > 0:
                bloom_filter_size = size * 2 * base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        return FileBloomFilter(bloom_filter_file, bloom_filter_size)

    def init_mq(self):
        mq_store_dir = os.path.join(self.root, 'store')
        mq_backup_dir = os.path.join(self.root, 'backup')
        if not os.path.exists(mq_store_dir):
            os.makedirs(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.makedirs(mq_backup_dir)

        self.mq = MessageQueue(self.nodes, self.local, self.rpc_server,
                               copies=self.copies)
        self.mq.init_store(mq_store_dir, mq_backup_dir,
                           verify_exists_hook=self._init_bloom_filter())

    def check(self):
        env_legal = self.check_env(force=self.force)
        if not env_legal:
            raise JobWorkerRunning('There is already a running job worker.')

    def finish(self):
        self.stopped = True
        self.mq.shutdown()
        try:
            for handler in self.logger.handlers:
                handler.close()
        finally:
            super(BasicWorkerJobLoader, self).finish()

    def complete(self, obj):
        if self.logger is not None:
            self.logger.info('Finish %s' % obj)
        if obj in self.executings:
            self.executings.remove(obj)
        if self.ctx.job.size <= 0:
            return True
        return False

    def error(self, obj):
        if obj in self.executings:
            self.executings.remove(obj)

    def stop(self):
        self.mq.put(self.executings, force=True)
        super(BasicWorkerJobLoader, self).stop()

    def signal_handler(self, signum, frame):
        self.stop()

    def _login(self, opener):
        if self.job.login_hook is not None:
            if 'login' not in self.ctx.job or \
                    not isinstance(self.ctx.job.login, list):
                raise ConfigurationError(
                    'If login_hook is set, config files must contain `login`')
            kw = random.choice(self.ctx.job.login)
            login_result = self.job.login_hook(opener, **kw)
            if isinstance(login_result, tuple) and len(login_result) == 2:
                self.logger.error('login failed, reason: %s' % login_result[1])
                return login_result[0]
            elif not login_result:
                self.logger.error('login failed')
                return login_result
        return True

    def _log_error(self, obj, err):
        if self.logger is not None:
            self.logger.error('Error when getting bundle: %s' % obj)
            self.logger.exception(err)
        if self.job.debug:
            raise err

    def _require_budget(self, count):
        raise NotImplementedError

    def apply(self):
        raise NotImplementedError

    def _execute_bundle(self, obj, opener=None):
        bundle = self.job.unit_cls(obj)
        urls = bundle.urls()

        url = None
        try:
            while len(urls) > 0 and not self.stopped:
                url = urls.pop(0)
                self.info_logger.info('get %s url: %s' % (bundle.label, url))
                parser_cls, options = self.job.url_patterns.get_parser(
                    url, options=True)
                if parser_cls is not None:
                    self._require_budget()
                    next_urls, bundles = parser_cls(
                        opener, url, bundle=bundle,
                        logger=self.logger, **options).parse()
                    next_urls = list(self.job.url_patterns.matches(next_urls))
                    next_urls.extend(urls)
                    urls = next_urls
                    if bundles:
                        self.mq.put(
                            [str(b) for b in bundles if b.force is False])
                        self.mq.put(
                            [str(b) for b in bundles if b.force is True],
                            force=True)
            if hasattr(opener, 'close'):
                opener.close()
            self.error_times = 0
        except LoginFailure, e:
            if not self._login(opener):
                self.error_times += 1
                self._log_error(obj, e)
                self.error(obj)
        except Exception, e:
            self.error_times += 1
            if self.logger is not None and url is not None:
                self.logger.error('Error when fetching url: %s' % url)
            self._log_error(obj, e)
            self.error(obj)
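The force flag above is what lets re-queued bundles bypass the dedup hook; stop() relies on the same behavior when it flushes self.executings. Driving it directly, assuming an already-initialized loader instance:

# hedged sketch: force=True skips the verify_exists/dedup check,
# so in-flight bundles are not silently dropped on re-queue
loader.mq.put([str(b) for b in loader.executings], force=True)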
class BasicWorkerJobLoader(JobLoader):

    def __init__(self, job, data_dir, context=None, logger=None,
                 local=None, nodes=None, copies=1, force=False):
        self.job = job
        ctx = context or self.job.context

        self.local = local
        if self.local is None:
            host, port = get_ip(), ctx.job.port
            self.local = "%s:%s" % (host, port)
        else:
            host, port = tuple(self.local.split(":", 1))
        self.nodes = nodes
        if self.nodes is None:
            self.nodes = [self.local]

        self.logger = logger
        self.info_logger = get_logger(
            name="cola_worker_info_%s" % self.job.real_name)

        super(BasicWorkerJobLoader, self).__init__(
            self.job, data_dir, self.local, context=ctx,
            copies=copies, force=force)

        # number of instances that run at the same time
        self.instances = max(min(self.ctx.job.instances, MAX_THREADS_SIZE), 1)
        # objects currently being executed
        self.executings = []
        # number of consecutive exceptions thrown
        self.error_times = 0
        # budget
        self.budget = 0

        self.check()
        # init rpc server
        self.init_rpc_server()
        # init message queue
        self.init_mq()

        # register signals
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        self.rpc_server.register_function(self.stop, name="stop")
        self.rpc_server.register_function(self.add_node, name="add_node")
        self.rpc_server.register_function(self.remove_node, name="remove_node")
        self.rpc_server.register_function(self.run, name="run")

    def _init_bloom_filter(self):
        size = self.job.context.job.size
        base = 1 if not self.job.is_bundle else 1000
        bloom_filter_file = os.path.join(self.root, "bloomfilter")
        if not os.path.exists(bloom_filter_file):
            if size > 0:
                bloom_filter_size = size * 10 * base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        else:
            if size > 0:
                bloom_filter_size = size * 2 * base
            else:
                bloom_filter_size = UNLIMIT_BLOOM_FILTER_CAPACITY
        return FileBloomFilter(bloom_filter_file, bloom_filter_size)

    def init_mq(self):
        mq_store_dir = os.path.join(self.root, "store")
        mq_backup_dir = os.path.join(self.root, "backup")
        if not os.path.exists(mq_store_dir):
            os.makedirs(mq_store_dir)
        if not os.path.exists(mq_backup_dir):
            os.makedirs(mq_backup_dir)

        self.mq = MessageQueue(self.nodes, self.local, self.rpc_server,
                               copies=self.copies)
        self.mq.init_store(mq_store_dir, mq_backup_dir,
                           verify_exists_hook=self._init_bloom_filter())

    def check(self):
        env_legal = self.check_env(force=self.force)
        if not env_legal:
            raise JobWorkerRunning("There is already a running job worker.")

    def finish(self):
        self.stopped = True
        self.mq.shutdown()
        try:
            for handler in self.logger.handlers:
                handler.close()
        finally:
            super(BasicWorkerJobLoader, self).finish()

    def complete(self, obj):
        if self.logger is not None:
            self.logger.info("Finish %s" % obj)
        if obj in self.executings:
            self.executings.remove(obj)
        if self.ctx.job.size <= 0:
            return True
        return False

    def error(self, obj):
        if obj in self.executings:
            self.executings.remove(obj)

    def stop(self):
        self.mq.put(self.executings, force=True)
        super(BasicWorkerJobLoader, self).stop()

    def signal_handler(self, signum, frame):
        self.stop()

    def _login(self, opener):
        if self.job.login_hook is not None:
            if "login" not in self.ctx.job or \
                    not isinstance(self.ctx.job.login, list):
                raise ConfigurationError(
                    "If login_hook is set, config files must contain `login`")
            kw = random.choice(self.ctx.job.login)
            login_success = self.job.login_hook(opener, **kw)
            if not login_success:
                self.logger.info("login failed")
            return login_success
        return True

    def _log_error(self, obj, err):
        if self.logger is not None:
            self.logger.error("Error when getting bundle: %s" % obj)
            self.logger.exception(err)
        if self.job.debug:
            raise err

    def _require_budget(self, count):
        raise NotImplementedError

    def apply(self):
        raise NotImplementedError

    def _execute_bundle(self, obj, opener=None):
        bundle = self.job.unit_cls(obj)
        urls = bundle.urls()

        try:
            while len(urls) > 0 and not self.stopped:
                url = urls.pop(0)
                self.info_logger.info("get %s url: %s" % (bundle.label, url))
                parser_cls = self.job.url_patterns.get_parser(url)
                if parser_cls is not None:
                    self._require_budget()
                    next_urls, bundles = parser_cls(opener, url,
                                                    bundle=bundle).parse()
                    next_urls = list(self.job.url_patterns.matches(next_urls))
                    next_urls.extend(urls)
                    urls = next_urls
                    if bundles:
                        self.mq.put([str(b) for b in bundles])
            self.error_times = 0
        except LoginFailure, e:
            if not self._login(opener):
                self.error_times += 1
                self._log_error(obj, e)
                self.error(obj)
        except Exception, e:
            self.error_times += 1
            self._log_error(obj, e)
            self.error(obj)