def get_job():
    return Job('wikipedia crawler', url_patterns, MechanizeOpener, starts,
               instances=user_config.job.instances, user_conf=user_config)
def setUp(self):
    url_patterns = UrlPatterns(
        Url(r'^http://zh.wikipedia.org/wiki/[^(:|/)]+$', 'wiki_item',
            FakeWikiParser))
    fake_user_conf = Config(StringIO(user_conf))

    self.dir = tempfile.mkdtemp()

    self.job = Job(
        'fake wiki crawler', url_patterns, MechanizeOpener,
        ['http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F', ],
        user_conf=fake_user_conf)

    # Single-node setup: the local node doubles as the only MQ node.
    local_node = 'localhost:%s' % self.job.context.job.port
    nodes = [local_node, ]

    self.rpc_server = ColaRPCServer(
        ('localhost', self.job.context.job.port))
    self.loader = JobLoader(self.job)
    self.loader.init_mq(self.rpc_server, nodes, local_node, self.dir)

    # Serve RPC requests in a daemon thread so the test can exit cleanly.
    thd = threading.Thread(target=self.rpc_server.serve_forever)
    thd.setDaemon(True)
    thd.start()
def prepare(self, job_name, unzip=True, overwrite=False, settings=None):
    self.logger.debug('entering worker prepare phase, job id: %s' % job_name)
    if unzip:
        self._unzip(job_name)

    src_job_name = job_name
    job_path = os.path.join(self.job_dir, job_name)
    if not os.path.exists(job_path):
        return False

    job_desc = import_job_desc(job_path)
    if settings is not None:
        job_desc.update_settings(settings)

    # The worker's position in the cluster determines its job offset;
    # clearing the working dir is only allowed in local mode.
    job_id = self.ctx.ips.index(self.ctx.ip)
    clear = job_desc.settings.job.clear \
        if self.ctx.is_local_mode else False
    job_name, working_dir = self.ctx._get_name_and_dir(
        self.working_dir, job_name, overwrite=overwrite, clear=clear)

    job = Job(self.ctx, job_path, job_name, job_desc=job_desc,
              working_dir=working_dir, rpc_server=self.rpc_server,
              manager=self.ctx.manager, job_offset=job_id)
    t = threading.Thread(target=job.run, args=(True, ))

    # Register the job and its (not yet started) thread under the original name.
    job_info = WorkerJobInfo(job_name, working_dir)
    job_info.job = job
    job_info.thread = t
    self.running_jobs[src_job_name] = job_info

    self.logger.debug('worker prepare phase finished, job id: %s' % job_name)
    return True
def get_job():
    return Job('sina weibo crawler', url_patterns, MechanizeOpener, starts,
               is_bundle=True, unit_cls=WeiboUserBundle, instances=instances,
               debug=False, user_conf=user_config, login_hook=login_hook)
def setUp(self):
    self.job = Job('test job', UrlPatterns(), BuiltinOpener, [])
    self.root = tempfile.mkdtemp()

    # Separate working directories for the master and the worker loader.
    master_root = os.path.join(self.root, 'master')
    worker_root = os.path.join(self.root, 'worker')
    os.makedirs(master_root)
    os.makedirs(worker_root)

    node = '%s:%s' % (get_ip(), self.job.context.job.port)
    nodes = [node]
    master = '%s:%s' % (get_ip(), self.job.context.job.master_port)

    self.master_loader = MasterJobLoader(self.job, master_root, nodes)
    self.worker_loader = WorkerJobLoader(self.job, worker_root, master)
def get_job():
    # Build one Url pattern per entry in the user configuration.
    urls = []
    for pattern in user_config.job.patterns:
        url_pattern = Url(pattern.regex, pattern.name, GenericParser,
                          store=pattern.store, extract=pattern.extract)
        urls.append(url_pattern)
    url_patterns = UrlPatterns(*urls)

    return Job(user_config.job.name, url_patterns, MechanizeOpener, starts,
               instances=user_config.job.instances, user_conf=user_config)
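# --- Illustrative only: a hedged sketch of the user configuration that the
# generic get_job() above expects. It reads user_config.job.name,
# user_config.job.instances and iterates user_config.job.patterns, using each
# pattern's regex/name/store/extract fields, so those keys come from the code;
# the exact YAML schema, the values and the Config import path are assumptions.
# It is parsed the same way the setUp() tests build their fake config.
from StringIO import StringIO          # Python 2, matching the surrounding code
from cola.core.config import Config    # import path assumed

example_user_conf = r'''
job:
  name: generic crawler                # becomes the Job name
  instances: 2                         # number of crawler instances
  patterns:
    - regex: ^http://example.com/item/\d+$   # hypothetical URL pattern
      name: example_item
      store: yes                       # store the fetched page
      extract: no                      # skip field extraction
'''

user_config = Config(StringIO(example_user_conf))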
def setUp(self):
    url_patterns = UrlPatterns(
        Url(r'^http://zh.wikipedia.org/wiki/[^(:|/)]+$', 'wiki_item',
            FakeWikiParser))
    fake_user_conf = Config(StringIO(user_conf))

    self.dir = tempfile.mkdtemp()

    self.job = Job(
        'fake wiki crawler', url_patterns, MechanizeOpener,
        ['http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F', ],
        user_conf=fake_user_conf)

    self.local_node = 'localhost:%s' % self.job.context.job.port
    self.nodes = [self.local_node, ]
def _run_local_job(self, job_path, overwrite=False, rpc_server=None,
                   settings=None):
    job_desc = import_job_desc(job_path)
    if settings is not None:
        job_desc.update_settings(settings)

    base_name = job_desc.uniq_name
    self.env['job_desc'][base_name] = job_desc

    # Each worker gets a working dir derived from its address.
    addr_dirname = self.addr.replace('.', '_').replace(':', '_')
    working_dir = os.path.join(self.working_dir, 'worker', addr_dirname)
    clear = job_desc.settings.job.clear
    job_name, working_dir = self._get_name_and_dir(
        working_dir, base_name, overwrite=overwrite, clear=clear)

    clock = Clock()
    job = Job(self, job_path, job_name, job_desc=job_desc,
              working_dir=working_dir, rpc_server=rpc_server,
              manager=self.manager)
    t = threading.Thread(target=job.run, args=(True, ))
    t.start()

    stopped = multiprocessing.Event()

    def stop(signum, frame):
        # Only the main process reacts to the signal, and only once.
        if 'main' not in multiprocessing.current_process().name.lower():
            return
        if stopped.is_set():
            return
        stopped.set()
        self.logger.debug("Catch interrupt signal, start to stop")
        job.shutdown()
        if rpc_server:
            rpc_server.shutdown()

    signal.signal(signal.SIGINT, stop)
    signal.signal(signal.SIGTERM, stop)

    # Poll the job: stop when it finishes, or when it has been idle for
    # more than MAX_IDLE_TIMES consecutive polls.
    idle_times = 0
    while t.is_alive():
        if job.get_status() == FINISHED:
            break
        if job.get_status() == IDLE:
            idle_times += 1
            if idle_times > MAX_IDLE_TIMES:
                break
        else:
            idle_times = 0
        try:
            t.join(5)
        except IOError:
            break

    need_shutdown = False
    if not job.stopped.is_set() and job.get_status() == FINISHED:
        self.logger.debug('All objects have been fetched, try to finish job')
        need_shutdown = True
    elif not stopped.is_set() and not t.is_alive():
        need_shutdown = True
    elif not job.stopped.is_set() and job.get_status() == IDLE:
        self.logger.debug('No bundle or url to perform, try to finish job')
        need_shutdown = True

    if need_shutdown is True:
        job.shutdown()
        if rpc_server:
            rpc_server.shutdown()

    self.logger.debug('Job id:%s finished, spend %.2f seconds for running' % (
        job_name, clock.clock()))
def get_job():
    return Job('weibo search crawler', url_patterns, get_opener, [],
               is_bundle=True, unit_cls=WeiboSearchBundle, instances=instances,
               debug=debug, user_conf=user_config, login_hook=login_hook)
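# --- Illustrative only: a hedged sketch of the shape a login_hook callable
# could take. The weibo jobs above pass login_hook to Job(...); the assumption
# here is that it is invoked with the shared opener plus account keyword
# arguments and should return True when login succeeds. The keyword names and
# the body are placeholders, not Cola's actual implementation.
def example_login_hook(opener, **kw):
    username = kw.get('username')
    passwd = kw.get('password')
    # A real hook would drive `opener` through the site's login flow here so
    # that later fetches reuse the authenticated session.
    # ... site-specific login with `opener`, `username`, `passwd` ...
    return True  # report success so crawling can proceed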
def _run_local_job(self, job_path, overwrite=False, rpc_server=None,
                   settings=None):
    job_desc = import_job_desc(job_path)
    if settings is not None:
        job_desc.update_settings(settings)

    base_name = job_desc.uniq_name
    self.env['job_desc'][base_name] = job_desc

    working_dir = os.path.join(self.working_dir, 'worker')
    clear = job_desc.settings.job.clear
    job_name, working_dir = self._get_name_and_dir(
        working_dir, base_name, overwrite=overwrite, clear=clear)

    clock = Clock()
    job = Job(self, job_path, job_name, job_desc=job_desc,
              working_dir=working_dir, rpc_server=rpc_server,
              manager=self.manager)
    t = threading.Thread(target=job.run, args=(True, ))
    t.start()

    stopped = multiprocessing.Event()

    def stop(signum, frame):
        # Only the main process reacts to the signal, and only once.
        if 'main' not in multiprocessing.current_process().name.lower():
            return
        if stopped.is_set():
            return
        stopped.set()
        self.logger.debug("Catch interrupt signal, start to stop")
        job.shutdown()
        if rpc_server:
            rpc_server.shutdown()

    signal.signal(signal.SIGINT, stop)
    signal.signal(signal.SIGTERM, stop)

    # Poll the job: stop when it finishes, or when it has been idle for
    # more than MAX_IDLE_TIMES consecutive polls.
    idle_times = 0
    while t.is_alive():
        if job.get_status() == FINISHED:
            break
        if job.get_status() == IDLE:
            idle_times += 1
            if idle_times > MAX_IDLE_TIMES:
                break
        else:
            idle_times = 0
        try:
            t.join(5)
        except IOError:
            break

    need_shutdown = False
    if not job.stopped.is_set() and job.get_status() == FINISHED:
        self.logger.debug('All objects have been fetched, try to finish job')
        need_shutdown = True
    elif not stopped.is_set() and not t.is_alive():
        need_shutdown = True
    elif not job.stopped.is_set() and job.get_status() == IDLE:
        self.logger.debug('No bundle or url to perform, try to finish job')
        need_shutdown = True

    if need_shutdown is True:
        job.shutdown()
        if rpc_server:
            rpc_server.shutdown()

    self.logger.debug('Job id:%s finished, spend %.2f seconds for running' % (
        job_name, clock.clock()))
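# --- Illustrative only: a minimal, self-contained sketch of the polling
# pattern used by _run_local_job() above -- run the job in a thread, poll its
# status, stop on FINISHED or after too many consecutive IDLE polls, then shut
# it down. ToyJob, the numeric status constants and the 1-second poll interval
# are hypothetical stand-ins, not Cola's classes.
import threading
import time

FINISHED, IDLE, RUNNING = 0, 1, 2
MAX_IDLE_TIMES = 3              # consecutive idle polls tolerated before stopping


class ToyJob(object):
    def __init__(self):
        self._status = RUNNING
        self.stopped = threading.Event()

    def run(self):
        # Pretend to crawl for two seconds, then sit idle until shut down.
        busy_until = time.time() + 2
        while not self.stopped.is_set():
            if time.time() > busy_until:
                self._status = IDLE
            time.sleep(0.5)

    def get_status(self):
        return self._status

    def shutdown(self):
        self.stopped.set()


job = ToyJob()
t = threading.Thread(target=job.run)
t.start()

idle_times = 0
while t.is_alive():
    if job.get_status() == FINISHED:
        break
    if job.get_status() == IDLE:
        idle_times += 1
        if idle_times > MAX_IDLE_TIMES:
            break
    else:
        idle_times = 0
    t.join(1)                   # poll roughly once per second

if not job.stopped.is_set():
    job.shutdown()              # nothing left to do: finish the job
t.join()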