Example #1
 def run_job(self, job_path, overwrite=False, init_rpc=False, settings=None):
     rpc_server = None
     if init_rpc:
         rpc_server = ThreadedColaRPCServer((self.ip, self.port))
         
     if self.is_local_mode:
         self._run_local_job(job_path, overwrite=overwrite, 
                             rpc_server=rpc_server, settings=settings)
     else:
         job_name = import_job_desc(job_path).uniq_name
         
         def create_zip(working_dir):  # working_dir is accepted but unused; self.working_dir is used instead
             zip_dir = os.path.join(self.working_dir, 'zip')
             filename = job_name + '.zip'
             zip_file = os.path.join(zip_dir, filename)
             
             ZipHandler.compress(zip_file, job_path, type_filters=('pyc', ))
             return job_name
         
         if hasattr(self, 'master'):
             create_zip(os.path.join(self.working_dir, 'master'))
             self.master.run_job(job_name, unzip=True)
         elif hasattr(self, 'worker'):
             create_zip(os.path.join(self.working_dir, 'worker'))
             self.worker.prepare(job_name, unzip=True)
             self.worker.run_job(job_name)
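
As a hedged sketch of what ZipHandler.compress(zip_file, job_path, type_filters=('pyc', )) amounts to here, a minimal stdlib equivalent; the helper name compress_job and the skip-by-extension semantics are assumptions, not cola's actual implementation:

    import os
    import zipfile

    def compress_job(zip_file, job_path, type_filters=('pyc', )):
        # Archive every file under job_path except filtered extensions
        # (here: compiled .pyc files), creating the zip dir if needed.
        zip_dir = os.path.dirname(zip_file)
        if not os.path.exists(zip_dir):
            os.makedirs(zip_dir)
        with zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED) as zf:
            for root, _, files in os.walk(job_path):
                for name in files:
                    if name.rsplit('.', 1)[-1] in type_filters:
                        continue
                    full = os.path.join(root, name)
                    zf.write(full, os.path.relpath(full, job_path))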
Example #2
 def prepare(self, job_name, unzip=True, overwrite=False, 
             settings=None):
     self.logger.debug('entering worker prepare phase, job id: %s' % job_name)
     if unzip:
         self._unzip(job_name)
     
     src_job_name = job_name
     job_path = os.path.join(self.job_dir, job_name)
     
     if not os.path.exists(job_path):
         return False
     
     job_desc = import_job_desc(job_path)
     if settings is not None:
         job_desc.update_settings(settings)
     
     job_id = self.ctx.ips.index(self.ctx.ip)
     clear = job_desc.settings.job.clear \
                 if self.ctx.is_local_mode else False
     job_name, working_dir = self.ctx._get_name_and_dir(
         self.working_dir, job_name, overwrite=overwrite, clear=clear)
     
     job = Job(self.ctx, job_path, job_name, job_desc=job_desc,
               working_dir=working_dir, rpc_server=self.rpc_server,
               manager=self.ctx.manager, job_offset=job_id)
     t = threading.Thread(target=job.run, args=(True, ))
     
     job_info = WorkerJobInfo(job_name, working_dir)
     job_info.job = job
     job_info.thread = t
     self.running_jobs[src_job_name] = job_info
     
     self.logger.debug('worker prepare phase finished, job id: %s' % job_name)
     return True
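
One detail worth flagging above: job_id = self.ctx.ips.index(self.ctx.ip) derives each worker's offset from its position in the cluster's IP list, so per-worker id ranges cannot collide. A worked toy of the same lookup (addresses invented):

    # Assumed cluster layout, for illustration only.
    ips = ['10.0.0.1', '10.0.0.2', '10.0.0.3']
    my_ip = '10.0.0.2'
    job_offset = ips.index(my_ip)  # -> 1, this worker's slot in the job
    print(job_offset)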
Example #3
    def run_job(self,
                job_path,
                overwrite=False,
                init_rpc=False,
                settings=None):
        rpc_server = None
        if init_rpc:
            rpc_server = ThreadedColaRPCServer(
                (self.ip, main_conf.worker.port))

        if self.is_local_mode:
            self._run_local_job(job_path,
                                overwrite=overwrite,
                                rpc_server=rpc_server)
        else:
            job_name = import_job_desc(job_path).uniq_name

            def create_zip(working_dir):  # working_dir is accepted but unused; self.working_dir is used instead
                zip_dir = os.path.join(self.working_dir, 'zip')
                filename = job_name + '.zip'
                zip_file = os.path.join(zip_dir, filename)

                ZipHandler.compress(zip_file, job_path, type_filters=('pyc', ))
                return job_name

            if hasattr(self, 'master'):
                create_zip(os.path.join(self.working_dir, 'master'))
                self.master.run_job(job_name, unzip=True)
            elif hasattr(self, 'worker'):
                create_zip(os.path.join(self.working_dir, 'worker'))
                self.worker.prepare(job_name, unzip=True)
                self.worker.run_job(job_name)
Example #4
 def list_runnable_jobs(self):
     job_dirs = filter(lambda s: os.path.isdir(os.path.join(self.job_dir, s)), 
                       os.listdir(self.job_dir))
     
     jobs = {}
     for job_dir in job_dirs:
         desc = import_job_desc(os.path.join(self.job_dir, job_dir))
         jobs[job_dir] = desc.name
     return jobs
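
The py2 filter(lambda ...) call above reads more directly as a list comprehension; a self-contained sketch of the same directory filter:

    import os

    def list_job_dirs(job_dir):
        # Keep only subdirectories of job_dir; each one is an
        # unpacked, runnable job.
        return [s for s in os.listdir(job_dir)
                if os.path.isdir(os.path.join(job_dir, s))]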
Example #5
    def run(self, args):
        master_addr = args.master
        ctx = Context(is_client=True, master_addr=master_addr)

        if args.list is True:
            jobs = ctx.list_jobs()
            self.logger.info('list jobs at master: %s' % ctx.master_addr)
            for job_id, info in jobs.iteritems():
                self.logger.info(
                    '====> job id: %s, job description: %s, status: %s' % \
                    (job_id, info['name'], info['status']))
            if len(jobs) == 0:
                self.logger.info('no jobs exist')
        elif args.kill is not None:
            job_id = self._get_matched_job_name(ctx, args.kill)
            if job_id is not None:
                ctx.kill_job(job_id)
                self.logger.info('killed job: %s' % job_id)
        elif args.upload is not None:
            path = os.path.abspath(args.upload)
            if not os.path.exists(path):
                self.logger.error('upload path does not exist')
                return

            job_id = None
            try:
                job_id = import_job_desc(path).uniq_name
            except Exception, e:
                self.logger.exception(e)
                self.logger.error('uploading job description failed')
                return

            new_upload_dir = os.path.join(tempfile.gettempdir(), job_id)
            if os.path.exists(new_upload_dir):
                shutil.rmtree(new_upload_dir)
            shutil.copytree(path, new_upload_dir)

            temp_filename = os.path.join(tempfile.gettempdir(),
                                         job_id + '.zip')
            ZipHandler.compress(temp_filename,
                                new_upload_dir,
                                type_filters=('pyc', ))
            try:
                FileTransportClient(ctx.master_addr, temp_filename).send_file()
            finally:
                os.remove(temp_filename)
                shutil.rmtree(new_upload_dir)
            self.logger.info('upload job <id: %s> finished' % job_id)

            if args.run == 'U':
                client_call(ctx.master_addr, 'run_job', job_id, True)
                self.logger.info('submit job <id: %s> to the cluster' % job_id)
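
For orientation, a minimal argparse layout consistent with the attributes this handler reads (args.master, args.list, args.kill, args.upload, args.run); the option spellings and the sample address are assumptions, not the CLI's verified flags:

    import argparse

    parser = argparse.ArgumentParser(prog='cola job')
    parser.add_argument('-m', '--master', help='master address, ip:port')
    parser.add_argument('-l', '--list', action='store_true',
                        help='list jobs registered at the master')
    parser.add_argument('-k', '--kill', metavar='JOB_ID')
    parser.add_argument('-u', '--upload', metavar='JOB_DIR')
    parser.add_argument('-r', '--run', nargs='?', const='U',
                        help="with 'U', run the job right after upload")
    args = parser.parse_args(['-m', '127.0.0.1:11103', '-l'])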
Example #6
    def __init__(self,
                 container_id,
                 working_dir,
                 job_path,
                 job_name,
                 env,
                 mq,
                 counter_server,
                 budget_server,
                 speed_server,
                 stopped,
                 nonsuspend,
                 idle_statuses,
                 n_tasks=1,
                 is_local=False,
                 master_ip=None,
                 logger=None,
                 task_start_id=0):
        self.container_id = container_id
        self.working_dir = working_dir
        self.mq = mq
        self.env = env
        self.job_name = job_name
        self.job_desc = env['job_desc'].get(job_name) or \
                        import_job_desc(job_path)

        self.counter_server = counter_server
        self.budget_server = budget_server
        self.speed_server = speed_server

        self.stopped = stopped
        self.nonsuspend = nonsuspend
        self.idle_statuses = idle_statuses
        self.n_tasks = n_tasks
        self.is_local = is_local
        self.master_ip = master_ip
        self.logger = logger

        self.task_start_id = task_start_id
        self.ip = self.env.get('ip', None) or get_ip()

        self.counter_clients = [None for _ in range(self.n_tasks)]
        self.budget_clients = [None for _ in range(self.n_tasks)]
        self.speed_clients = [None for _ in range(self.n_tasks)]

        self.task_threads = []

        self.inited = False
        self.lock = multiprocessing.Lock()
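
The expression env['job_desc'].get(job_name) or import_job_desc(job_path) is a cache-then-import fallback: reuse a description already loaded into the shared env, otherwise import it from disk. A generic sketch of the idiom (names invented), with its one caveat spelled out:

    def get_or_import(cache, key, load):
        # Prefer the cached object, fall back to loading it. Caveat:
        # the or operator treats any falsy cached value as a miss,
        # which is fine for job description objects.
        return cache.get(key) or load()

    print(get_or_import({}, 'wiki', lambda: 'imported job_desc'))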
Example #7
File: job.py Project: Andelfin/cola
    def run(self, args):
        master_addr = args.master
        ctx = Context(is_client=True, master_addr=master_addr)

        if args.list is True:
            jobs = ctx.list_jobs()
            self.logger.info('list jobs at master: %s' % ctx.master_addr)
            for job_id, info in jobs.iteritems():
                self.logger.info(
                    '====> job id: %s, job description: %s, status: %s' % \
                    (job_id, info['name'], info['status']))
            if len(jobs) == 0:
                self.logger.info('no jobs exist')
        elif args.kill is not None:
            job_id = self._get_matched_job_name(ctx, args.kill)
            if job_id is not None:
                ctx.kill_job(job_id)
                self.logger.info('killed job: %s' % job_id)
        elif args.upload is not None:
            path = os.path.abspath(args.upload)
            if not os.path.exists(path):
                self.logger.error('upload path does not exist')
                return

            job_id = None
            try:
                job_id = import_job_desc(path).uniq_name
            except Exception, e:
                self.logger.exception(e)
                self.logger.error('uploading job description failed')
                return

            new_upload_dir = os.path.join(tempfile.gettempdir(), job_id)
            if os.path.exists(new_upload_dir):
                shutil.rmtree(new_upload_dir)
            shutil.copytree(path, new_upload_dir)

            temp_filename = os.path.join(tempfile.gettempdir(), job_id+'.zip')
            ZipHandler.compress(temp_filename, new_upload_dir, type_filters=('pyc', ))
            try:
                FileTransportClient(ctx.master_addr, temp_filename).send_file()
            finally:
                os.remove(temp_filename)
                shutil.rmtree(new_upload_dir)
            self.logger.info('upload job <id: %s> finished' % job_id)
            
            if args.run == 'U':
                client_call(ctx.master_addr, 'run_job', job_id, True)
                self.logger.info('submit job <id: %s> to the cluster' % job_id)
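
The upload path above follows a copy, zip, send, clean pattern, with the finally block guaranteeing cleanup even when the transfer fails. A generic stdlib sketch of the same shape (send_file is a stand-in for cola's FileTransportClient):

    import os
    import shutil
    import tempfile

    def upload(path, job_id, send_file):
        # Stage a pristine copy, archive it, send, always clean up.
        staging = os.path.join(tempfile.gettempdir(), job_id)
        if os.path.exists(staging):
            shutil.rmtree(staging)
        shutil.copytree(path, staging)
        archive = shutil.make_archive(staging, 'zip', staging)
        try:
            send_file(archive)
        finally:
            os.remove(archive)
            shutil.rmtree(staging)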
Example #8
    def __init__(self,
                 ctx,
                 job_def_path,
                 job_name,
                 job_desc=None,
                 working_dir=None,
                 rpc_server=None,
                 manager=None,
                 job_offset=0):
        self.status = NOTSTARTED
        self.ctx = ctx
        self.shutdown_callbacks = []

        self.stopped = multiprocessing.Event()
        self.nonsuspend = multiprocessing.Event()
        self.nonsuspend.set()

        self.job_def_path = job_def_path
        self.job_name = job_name
        self.working_dir = working_dir or os.path.join(self.ctx.working_dir,
                                                       self.job_name)
        self.logger = get_logger(name='cola_job' + str(time.time()))
        self.job_desc = job_desc or import_job_desc(job_def_path)

        self.settings = self.job_desc.settings
        self.is_bundle = self.settings.job.mode == 'bundle'

        self.rpc_server = rpc_server

        self.n_instances = self.job_desc.settings.job.instances
        self.n_containers = min(get_cpu_count(), max(self.n_instances, 1))
        self.job_offset = job_offset
        self.is_multi_process = self.n_containers > 1
        self.processes = []

        self.idle_statuses = manager.list([False] * self.n_containers)

        self.manager = manager

        if not os.path.exists(self.working_dir):
            os.makedirs(self.working_dir)
        self.inited = False
        self._register_rpc()
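
The container math above clamps the configured instance count to the machine's CPU count; a worked sketch with an invented instance value:

    import multiprocessing

    n_instances = 6                                  # from job settings
    n_containers = min(multiprocessing.cpu_count(),  # e.g. 4 cores
                       max(n_instances, 1))
    is_multi_process = n_containers > 1
    # 6 instances on a 4-core box -> 4 containers, multi-process mode.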
Example #9
class Test(unittest.TestCase):


    def setUp(self):
        self.working_dir = tempfile.mkdtemp() 
        self.job_dir = os.path.join(self.working_dir, 'master', 'jobs')
        self.zip_dir = os.path.join(self.working_dir, 'master', 'zip')
        
        if not os.path.exists(self.zip_dir):
            os.makedirs(self.zip_dir)
        
        wiki_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 
            'app', 'wiki')
        try:
            shutil.copytree(wiki_path, os.path.join(self.job_dir, 'wiki'))
        except OSError, e:
            if e.errno == errno.ENOTDIR:
                shutil.copy(wiki_path, os.path.join(self.job_dir, 'wiki'))
            else:
                raise
        
        self.job_name = import_job_desc(wiki_path).uniq_name
        old_wiki_path = os.path.join(self.job_dir, 'wiki')
        new_wiki_path = os.path.join(self.job_dir, self.job_name)
        os.rename(old_wiki_path, new_wiki_path)
        
        ZipHandler.compress(os.path.join(self.zip_dir, self.job_name+'.zip'), 
                            new_wiki_path)
        
        config_file = os.path.join(new_wiki_path, 'wiki.yaml')
        try:
            # the fixture may not ship test.yaml; ignore a missing file
            os.remove(os.path.join(new_wiki_path, 'test.yaml'))
        except OSError:
            pass
        
        with open(config_file) as f:
            yaml_obj = yaml.load(f)
            yaml_obj['job']['size'] = 5
            yaml_obj['job']['instances'] = 1
            yaml_obj['job']['priorities'] = 1
        with open(config_file, 'w') as f:
            yaml.dump(yaml_obj, f)
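
A self-contained round-trip of the YAML edit this setUp performs, shrinking the job for tests; yaml.safe_load is the hedged modern spelling of the yaml.load call above, and the document text here is invented:

    import yaml

    text = 'job:\n  size: 100\n  instances: 4\n  priorities: 3\n'
    obj = yaml.safe_load(text)
    obj['job'].update(size=5, instances=1, priorities=1)
    print(yaml.dump(obj, default_flow_style=False))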
Example #10
    def run_job(self, job_name, unzip=False, 
                wait_for_workers=False):
        if wait_for_workers:
            while not self.stopped.is_set():
                if len(self.worker_tracker.workers) > 0:
                    break
                stopped = self.stopped.wait(3)
                if stopped:
                    return

        if unzip:
            self._unzip(job_name)
        
        job_path = os.path.join(self.job_dir, job_name)
        job_desc = import_job_desc(job_path)
        job_master = JobMaster(self.ctx, job_name, job_desc, 
                               self.worker_tracker.workers.keys())
        job_master.init()
        self.job_tracker.register_job(job_name, job_master)
        self._register_runned_job(job_name, job_desc)
        
        zip_file = os.path.join(self.zip_dir, job_name+'.zip')
        for worker in job_master.workers:
            FileTransportClient(worker, zip_file).send_file()
        
        self.logger.debug(
            'entering the master prepare stage, job id: %s' % job_name)
        self.logger.debug(
            'job available workers: %s' % job_master.workers)
        stage = Stage(job_master.workers, 'prepare', logger=self.logger)
        prepared_ok = stage.barrier(True, job_name)
        if not prepared_ok:
            self.logger.error("prepare for running failed")
            return
        
        self.logger.debug(
            'entering the master run_job stage, job id: %s' % job_name)
        stage = Stage(job_master.workers, 'run_job', logger=self.logger)
        run_ok = stage.barrier(True, job_name)
        if not run_ok:
            self.logger.error("run job failed, job id: %s" % job_name)
Example #11
    def run_job(self, job_name, unzip=False, wait_for_workers=False):
        if wait_for_workers:
            while not self.stopped.is_set():
                if len(self.worker_tracker.workers) > 0:
                    break
                stopped = self.stopped.wait(3)
                if stopped:
                    return

        if unzip:
            self._unzip(job_name)

        job_path = os.path.join(self.job_dir, job_name)
        job_desc = import_job_desc(job_path)
        job_master = JobMaster(self.ctx, job_name, job_desc,
                               self.worker_tracker.workers.keys())
        job_master.init()
        self.job_tracker.register_job(job_name, job_master)
        self._register_runned_job(job_name, job_desc)

        zip_file = os.path.join(self.zip_dir, job_name + '.zip')
        for worker in job_master.workers:
            FileTransportClient(worker, zip_file).send_file()

        self.logger.debug('entering the master prepare stage, job id: %s' %
                          job_name)
        self.logger.debug('job available workers: %s' % job_master.workers)
        stage = Stage(job_master.workers, 'prepare')
        prepared_ok = stage.barrier(True, job_name)
        if not prepared_ok:
            self.logger.error("prepare for running failed")
            return

        self.logger.debug('entering the master run_job stage, job id: %s' %
                          job_name)
        stage = Stage(job_master.workers, 'run_job')
        run_ok = stage.barrier(True, job_name)
        if not run_ok:
            self.logger.error("run job failed, job id: %s" % job_name)
Example #12
 def __init__(self, container_id, working_dir, 
              job_path, job_name, env, mq,
              counter_server, budget_server, speed_server,
              stopped, nonsuspend, idle_statuses, n_tasks=1, 
              is_local=False, master_ip=None, logger=None,
              task_start_id=0):
     self.container_id = container_id
     self.working_dir = working_dir
     self.mq = mq
     self.env = env
     self.job_name = job_name
     self.job_desc = env['job_desc'].get(job_name) or \
                     import_job_desc(job_path)
     
     self.counter_server = counter_server
     self.budget_server = budget_server
     self.speed_server = speed_server
     
     self.stopped = stopped
     self.nonsuspend = nonsuspend
     self.idle_statuses = idle_statuses
     self.n_tasks = n_tasks
     self.is_local = is_local
     self.master_ip = master_ip
     self.logger = logger
     
     self.task_start_id = task_start_id
     self.ip = self.env.get('ip', None) or get_ip()
     
     self.counter_clients = [None for _ in range(self.n_tasks)]
     self.budget_clients = [None for _ in range(self.n_tasks)]
     self.speed_clients = [None for _ in range(self.n_tasks)]
     
     self.task_threads = []
     
     self.inited = False
     self.lock = multiprocessing.Lock()
Example #13
 def __init__(self, ctx, job_def_path, job_name, 
              job_desc=None, working_dir=None, rpc_server=None,
              manager=None, job_offset=0):
     self.status = NOTSTARTED
     self.ctx = ctx
     self.shutdown_callbacks = []
     
     self.stopped = multiprocessing.Event()
     self.nonsuspend = multiprocessing.Event()
     self.nonsuspend.set()
     
     self.job_def_path = job_def_path
     self.job_name = job_name
     self.working_dir = working_dir or os.path.join(self.ctx.working_dir, 
                                                    self.job_name)
     self.logger = get_logger(name='cola_job'+str(time.time()))
     self.job_desc = job_desc or import_job_desc(job_def_path)
         
     self.settings = self.job_desc.settings
     self.is_bundle = self.settings.job.mode == 'bundle'
             
     self.rpc_server = rpc_server
     
     self.n_instances = self.job_desc.settings.job.instances
     self.n_containers = min(get_cpu_count(), max(self.n_instances, 1))
     self.job_offset = job_offset
     self.is_multi_process = self.n_containers > 1
     self.processes = []
     
     self.idle_statuses = manager.list([False] * self.n_containers)
         
     self.manager = manager
     
     if not os.path.exists(self.working_dir):
         os.makedirs(self.working_dir)
     self.inited = False
     self._register_rpc()
Example #14
    def _run_local_job(self, job_path, overwrite=False, rpc_server=None, settings=None):
        job_desc = import_job_desc(job_path)
        if settings is not None: job_desc.update_settings(settings)
        base_name = job_desc.uniq_name
        self.env['job_desc'][base_name] = job_desc

        addr_dirname = self.addr.replace('.', '_').replace(':', '_')
        working_dir = os.path.join(self.working_dir, 'worker', addr_dirname)
        clear = job_desc.settings.job.clear
        job_name, working_dir = self._get_name_and_dir(
            working_dir, base_name, overwrite=overwrite, clear=clear)
                    
        clock = Clock()
        job = Job(self, job_path, job_name, job_desc=job_desc,
                  working_dir=working_dir, rpc_server=rpc_server,
                  manager=self.manager)
        t = threading.Thread(target=job.run, args=(True, ))
        t.start()
        
        stopped = multiprocessing.Event()
        def stop(signum, frame):
            if 'main' not in multiprocessing.current_process().name.lower():
                return
            if stopped.is_set():
                return
            else:
                stopped.set()
                
            self.logger.debug("Catch interrupt signal, start to stop")
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()
            
        signal.signal(signal.SIGINT, stop)
        signal.signal(signal.SIGTERM, stop)
        
        idle_times = 0
        while t.is_alive():
            if job.get_status() == FINISHED:
                break
            if job.get_status() == IDLE:
                idle_times += 1
                if idle_times > MAX_IDLE_TIMES:
                    break
            else:
                idle_times = 0
            
            try:
                t.join(5)
            except IOError:
                break
            
        need_shutdown = False
        if not job.stopped.is_set() and job.get_status() == FINISHED:
            self.logger.debug('All objects have been fetched, try to finish job')
            need_shutdown = True
        elif not stopped.is_set() and not t.is_alive():
            need_shutdown = True
        elif not job.stopped.is_set() and job.get_status() == IDLE:
            self.logger.debug('No bundle or url to perform, try to finish job')
            need_shutdown = True
            
        if need_shutdown is True:
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()

        self.logger.debug('Job id:%s finished, spend %.2f seconds for running' % (
            job_name, clock.clock()))
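
A stripped-down sketch of the supervision loop above: join the job thread in 5-second slices, stop on FINISHED, and give up after too many consecutive IDLE polls. The status tokens and the MAX_IDLE_TIMES value are stand-ins for cola's constants:

    MAX_IDLE_TIMES = 50  # assumed threshold
    FINISHED, IDLE = 'finished', 'idle'

    def supervise(t, get_status):
        # t: the job thread; get_status: callable returning a token.
        idle_times = 0
        while t.is_alive():
            status = get_status()
            if status == FINISHED:
                break
            if status == IDLE:
                idle_times += 1
                if idle_times > MAX_IDLE_TIMES:
                    break
            else:
                idle_times = 0
            t.join(5)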
Example #15
    def _run_local_job(self,
                       job_path,
                       overwrite=False,
                       rpc_server=None,
                       settings=None):
        job_desc = import_job_desc(job_path)
        if settings is not None: job_desc.update_settings(settings)
        base_name = job_desc.uniq_name
        self.env['job_desc'][base_name] = job_desc

        working_dir = os.path.join(self.working_dir, 'worker')
        clear = job_desc.settings.job.clear
        job_name, working_dir = self._get_name_and_dir(working_dir,
                                                       base_name,
                                                       overwrite=overwrite,
                                                       clear=clear)

        clock = Clock()
        job = Job(self,
                  job_path,
                  job_name,
                  job_desc=job_desc,
                  working_dir=working_dir,
                  rpc_server=rpc_server,
                  manager=self.manager)
        t = threading.Thread(target=job.run, args=(True, ))
        t.start()

        stopped = multiprocessing.Event()

        def stop(signum, frame):
            if 'main' not in multiprocessing.current_process().name.lower():
                return
            if stopped.is_set():
                return
            else:
                stopped.set()

            self.logger.debug("Catch interrupt signal, start to stop")
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()

        signal.signal(signal.SIGINT, stop)
        signal.signal(signal.SIGTERM, stop)

        idle_times = 0
        while t.is_alive():
            if job.get_status() == FINISHED:
                break
            if job.get_status() == IDLE:
                idle_times += 1
                if idle_times > MAX_IDLE_TIMES:
                    break
            else:
                idle_times = 0

            try:
                t.join(5)
            except IOError:
                break

        need_shutdown = False
        if not job.stopped.is_set() and job.get_status() == FINISHED:
            self.logger.debug(
                'All objects have been fetched, try to finish job')
            need_shutdown = True
        elif not stopped.is_set() and not t.is_alive():
            need_shutdown = True
        elif not job.stopped.is_set() and job.get_status() == IDLE:
            self.logger.debug('No bundle or url to perform, try to finish job')
            need_shutdown = True

        if need_shutdown is True:
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()

        self.logger.debug(
            'Job id:%s finished, spend %.2f seconds for running' %
            (job_name, clock.clock()))