Esempio n. 1
0
def load_job(path, nodes, context=None):
    """Load a job definition and run it as the job master.

    Creates the job's working directory, takes a filesystem lock so only
    one master runs per job, runs the JobLoader over an RPC server, and
    notifies the master watcher when the job finishes.

    :param path: filesystem path of the job definition
    :param nodes: worker node addresses participating in the job
    :param context: optional context forwarded to the JobLoader
    :raises ValueError: if ``path`` does not exist
    :raises JobMasterRunning: if another master already holds the lock
    """
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')

    job = import_job(path)

    # Debug runs get their own holder directory so they do not clash
    # with the normal run's state.
    job_name = job.name.replace(' ', '_')
    if job.debug:
        job_name += '_debug'
    holder = os.path.join(root_dir(), 'data', 'master', 'jobs', job_name)
    if not os.path.exists(holder):
        os.makedirs(holder)

    # A plain lock file guards against two masters for the same job.
    lock_f = os.path.join(holder, 'lock')
    if os.path.exists(lock_f):
        raise JobMasterRunning('There has been a running job master')
    open(lock_f, 'w').close()

    rpc_server = create_rpc_server(job)
    try:
        loader = JobLoader(job, nodes, rpc_server, context=context)
        loader.run()
        # nofify master watcher finishing
        master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
        client_call(master_watcher, 'finish_job', job.real_name)
    finally:
        # Always release the lock and stop the RPC server, even on failure.
        os.remove(lock_f)
        rpc_server.shutdown()
Esempio n. 2
0
 def _check_workers(self):
     """Heartbeat monitor loop.

     Until stopped, scans every tracked worker: a worker that has not
     updated within HEARTBEAT_CHECK_INTERVAL degrades RUNNING -> HANGUP
     -> STOPPED (and is black-listed and detached from running jobs);
     a worker with enough consecutive heartbeats is restored to RUNNING,
     un-black-listed, and (re)attached to every running job.
     """
     while not self.stopped.is_set():
         for worker, info in self.worker_tracker.workers.iteritems():
             # if loose connection
             if int(time.time()) - info.last_update \
                 > HEARTBEAT_CHECK_INTERVAL:
                 
                 # Missed the window: reset the consecutive counter and
                 # degrade the worker one state.
                 info.continous_register = 0
                 if info.status == RUNNING:
                     info.status = HANGUP
                 elif info.status == HANGUP:
                     info.status = STOPPED
                     self.black_list.append(worker)
                     
                     # Detach the dead worker from all running jobs.
                     for job in self.job_tracker.running_jobs:
                         self.job_tracker.remove_worker(job, worker)
                     
             # if continously connect for more than 10 min
             elif info.continous_register >= CONTINOUS_HEARTBEAT:
                 if info.status != RUNNING:
                     info.status = RUNNING
                 if worker in self.black_list:
                     self.black_list.remove(worker)
                     
                 # Make sure the recovered worker participates in every
                 # running job, starting it if it has none.
                 for job in self.job_tracker.running_jobs:
                     if not client_call(worker, 'has_job'):
                         client_call(worker, 'prepare', job)
                         client_call(worker, 'run_job', job)
                     self.job_tracker.add_worker(job, worker)
             
         self.stopped.wait(HEARTBEAT_CHECK_INTERVAL)
Esempio n. 3
0
 def stop(self):
     """Send a best-effort 'stop' RPC to every node, then mark stopped."""
     for addr in self.nodes:
         try:
             client_call(addr, 'stop')
         except socket.error:
             # Node already unreachable -- nothing left to stop there.
             pass
     self.stopped = True
Esempio n. 4
0
 def add_node(self, node):
     """Register a new node: announce it to every existing node, record
     it locally, and finally start it.

     :param node: address of the node to add

     Bug fix: the original reused ``node`` as the loop variable, which
     clobbered the parameter -- the wrong node got appended to
     ``self.nodes`` and was sent the 'run' call.
     """
     for existing in self.nodes:
         self.logger.debug('add node %s to nodes' % node)
         client_call(existing, 'add_node', node, ignore=True)
     self.nodes.append(node)
     self.logger.debug('run new node %s ' % node)
     client_call(node, 'run', ignore=True)
Esempio n. 5
0
def load_job(path, nodes, context=None):
    """Load a job definition and run it as the job master.

    Takes a per-job filesystem lock, runs the JobLoader over an RPC
    server, and notifies the master watcher when the job finishes.

    :param path: filesystem path of the job definition
    :param nodes: worker node addresses participating in the job
    :param context: optional context forwarded to the JobLoader
    :raises ValueError: if ``path`` does not exist
    :raises JobMasterRunning: if another master already holds the lock
    """
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')

    job = import_job(path)

    # Debug runs use a separate holder directory.
    job_name = job.name.replace(' ', '_')
    if job.debug:
        job_name += '_debug'
    holder = os.path.join(root_dir(), 'data', 'master', 'jobs', job_name)
    if not os.path.exists(holder):
        os.makedirs(holder)

    # Lock file prevents a second master for the same job.
    lock_f = os.path.join(holder, 'lock')
    if os.path.exists(lock_f):
        raise JobMasterRunning('There has been a running job master')
    open(lock_f, 'w').close()

    rpc_server = create_rpc_server(job)
    try:
        loader = JobLoader(job, nodes, rpc_server, context=context)
        loader.run()
        # nofify master watcher finishing
        master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
        client_call(master_watcher, 'finish_job', job.real_name)
    finally:
        # Always release the lock and stop the RPC server.
        os.remove(lock_f)
        rpc_server.shutdown()
Esempio n. 6
0
File: loader.py Progetto: Ganer/cola
 def stop(self):
     """Best-effort 'stop' RPC to every node, then run local cleanup."""
     for addr in self.nodes:
         try:
             client_call(addr, 'stop')
         except socket.error:
             # Unreachable node: skip it.
             pass
     self.finish()
Esempio n. 7
0
    def _check_workers(self):
        """Heartbeat monitor loop.

        Until stopped, a worker missing its heartbeat window degrades
        RUNNING -> HANGUP -> STOPPED (black-listed, detached from jobs);
        a worker with enough consecutive heartbeats is restored to
        RUNNING and re-attached to every running job.
        """
        while not self.stopped.is_set():
            for worker, info in self.worker_tracker.workers.iteritems():
                # if loose connection
                if int(time.time()) - info.last_update \
                    > HEARTBEAT_CHECK_INTERVAL:

                    # Reset consecutive counter and degrade one state.
                    info.continous_register = 0
                    if info.status == RUNNING:
                        info.status = HANGUP
                    elif info.status == HANGUP:
                        info.status = STOPPED
                        self.black_list.append(worker)

                        # Detach the dead worker from all running jobs.
                        for job in self.job_tracker.running_jobs:
                            self.job_tracker.remove_worker(job, worker)

                # if continously connect for more than 10 min
                elif info.continous_register >= CONTINOUS_HEARTBEAT:
                    if info.status != RUNNING:
                        info.status = RUNNING
                    if worker in self.black_list:
                        self.black_list.remove(worker)

                    # Ensure the recovered worker runs every job.
                    for job in self.job_tracker.running_jobs:
                        if not client_call(worker, 'has_job'):
                            client_call(worker, 'prepare', job)
                            client_call(worker, 'run_job', job)
                        self.job_tracker.add_worker(job, worker)

            self.stopped.wait(HEARTBEAT_CHECK_INTERVAL)
Esempio n. 8
0
File: node.py Progetto: ll2088/cola
 def _remote_or_local_batch_put(self, addr, objs):
     """Batch-put ``objs`` on the local queue when ``addr`` is this
     node, otherwise via a pickled RPC call. Empty input is a no-op."""
     if self._check_empty(objs):
         return
     is_local = (addr == self.addr_)
     if not is_local:
         client_call(addr, self.prefix+'batch_put', pickle.dumps(objs))
     else:
         self.mq_node.batch_put(objs)
Esempio n. 9
0
 def clear_job(self, job_name):
     """Delete the job's local directory and ask every watcher to clear
     the job as well."""
     sanitized = job_name.replace(' ', '_')
     shutil.rmtree(os.path.join(self.job_dir, sanitized))
     
     for watcher in self.nodes_watchers:
         client_call(watcher, 'clear_job')
Esempio n. 10
0
 def finish(self):
     """Tear the job down: release locks, finish the loaders, stop the
     logger server, notify the submitting client, and mark stopped."""
     all_pages = self.pages()
     
     self.release_lock(self.ready_lock)
     self.release_lock(self.finish_lock)
     
     LimitionJobLoader.finish(self)
     JobLoader.finish(self)
     self.stop_logger_server()
     
     try:
         for handler in self.logger.handlers:
             handler.close()
     except:
         # Best-effort close of log handlers; failures are ignored.
         pass
         
     if self.client is not None:
         # Re-address the client on the configured client port and tell
         # it to stop (ignore errors).
         rpc_client = '%s:%s' % (
             self.client.split(':')[0], 
             main_conf.client.port
         )
         client_call(rpc_client, 'stop', ignore=True)
         
     self.logger.info('All nodes finishes visiting pages size: %s' % all_pages)
     self.stopped = True
Esempio n. 11
0
 def stop(self):
     """Stop every running job, then all node watchers, then finish."""
     # stop all jobs
     for name in list(self.running_jobs.keys()):
         self.stop_job(name)
         
     for watcher in self.nodes_watchers:
         client_call(watcher, 'stop')
     self.finish()
Esempio n. 12
0
 def add_worker(self, worker):
     """Register ``worker``, broadcasting the addition to the existing
     workers first. Already-known workers are ignored."""
     if worker in self.workers:
         return
     
     # rpc call the other workers to add this worker
     for peer in self.workers:
         client_call(peer, 'add_node', worker)
     self.workers.append(worker)
Esempio n. 13
0
File: node.py Progetto: ll2088/cola
 def _remote_or_local_put(self, addr, objs, force=False, priority=0):
     """Put ``objs`` on the local queue when ``addr`` is this node,
     otherwise via a pickled RPC call. Empty input is a no-op."""
     if self._check_empty(objs):
         return
     is_local = (addr == self.addr_)
     if is_local:
         self.mq_node.put(objs, force=force, priority=priority)
     else:
         client_call(addr, self.prefix+'put', pickle.dumps(objs),
                     force, priority)
Esempio n. 14
0
    def stop(self):
        """Stop every running job, then all watchers (errors ignored),
        then finish this master."""
        # stop all jobs
        for name in list(self.running_jobs.keys()):
            self.stop_job(name)

        for watcher in self.nodes_watchers:
            client_call(watcher, 'stop', ignore=True)
        self.finish()
Esempio n. 15
0
    def remove_worker(self, worker):
        """Drop ``worker`` and broadcast the removal to the workers that
        remain. Unknown workers are ignored."""
        if worker not in self.workers:
            return

        # Remove locally first so the broadcast only hits the survivors.
        self.workers.remove(worker)
        for peer in self.workers:
            client_call(peer, 'remove_node', worker)
Esempio n. 16
0
    def add_worker(self, worker):
        """Register ``worker``, broadcasting the addition to the
        existing workers first. Already-known workers are ignored."""
        if worker in self.workers:
            return

        # rpc call the other workers to add this worker
        for peer in self.workers:
            client_call(peer, 'add_node', worker)
        self.workers.append(worker)
Esempio n. 17
0
 def remove_worker(self, worker):
     """Drop ``worker`` and broadcast the removal to the workers that
     remain. Unknown workers are ignored."""
     if worker not in self.workers:
         return
     
     # Remove locally first so the broadcast only hits the survivors.
     self.workers.remove(worker)
     for peer in self.workers:
         client_call(peer, 'remove_node', worker)
Esempio n. 18
0
File: node.py Progetto: ll2088/cola
 def _remote_or_local_put_backup(self, addr, backup_addr, objs, 
                                 force=False):
     """Store backup ``objs`` for ``backup_addr`` locally when ``addr``
     is this node, otherwise via a pickled RPC call."""
     if self._check_empty(objs):
         return
     is_local = (addr == self.addr_)
     if not is_local:
         client_call(addr, self.prefix+'put_backup', backup_addr,
                     pickle.dumps(objs), force)
     else:
         self.mq_node.put_backup(backup_addr, objs, force=force)
Esempio n. 19
0
    def put(self, objs):
        """Distribute ``objs`` across node addresses and push primary
        and backup copies over RPC (pickled)."""
        addrs_objs, addrs_backup_objs = \
            self.distributors.distribute(objs)

        # Primary copies, one batch_put per destination address.
        for addr, objs in addrs_objs.iteritems():
            client_call(addr, self.prefix + 'batch_put', pickle.dumps(objs))
        # Backup copies: presumably b_addr is the node being backed up
        # on addr -- TODO confirm against the distributor.
        for addr, m in addrs_backup_objs.iteritems():
            for b_addr, objs in m.iteritems():
                client_call(addr, self.prefix + 'put_backup', b_addr,
                            pickle.dumps(objs))
Esempio n. 20
0
 def stop(self):
     """Stop all node watchers, then best-effort stop every running
     job's master, then mark this object stopped."""
     for watcher in self.nodes_watchers:
         client_call(watcher, 'stop')
     # stop all jobs
     for job_info in self.running_jobs.values():
         try:
             client_call(job_info.job_master, 'stop')
         except socket.error:
             # Job master already unreachable; ignore.
             pass
     self.stopped = True
Esempio n. 21
0
 def put(self, objs):
     """Distribute ``objs`` across node addresses and push primary and
     backup copies over RPC (pickled)."""
     addrs_objs, addrs_backup_objs = \
         self.distributors.distribute(objs)
     
     # Primary copies, one batch_put per destination address.
     for addr, objs in addrs_objs.iteritems():
         client_call(addr, self.prefix+'batch_put', pickle.dumps(objs))
     # Backup copies: presumably b_addr is the node being backed up on
     # addr -- TODO confirm against the distributor.
     for addr, m in addrs_backup_objs.iteritems():
         for b_addr, objs in m.iteritems():
             client_call(addr, self.prefix+'put_backup', b_addr, 
                         pickle.dumps(objs))
Esempio n. 22
0
 def action(self, name):
     """Dispatch an interactive console command against the master.

     Supported commands: 'stop all', 'list jobs', 'list workers',
     'list job dirs', 'run remote job <dir>', 'run local job <path>'.
     (Python 2 code: uses print statements.)
     """
     if name == 'stop all':
         print 'Trying to stop master and all workers.'
         try:
             client_call(self.master, 'stop')
         except socket.error:
             print 'Cannot connect to cola master.'
         else:
             print 'Cola cluster has been shutdown.'
     elif name == 'list jobs':
         print 'Running jobs: '
         for job in client_call(self.master, 'list_jobs'):
             print job
     elif name == 'list workers':
         print 'Cola workers: '
         for worker in client_call(self.master, 'list_workers'):
             print worker
     elif name == 'list job dirs':
         print 'Runnable job dirs: '
         for dir_ in client_call(self.master, 'list_job_dirs'):
             print dir_
     elif name.startswith('run remote job '):
         print 'Remote job will run in background.'
         
         # Remaining text is the job dir already present on the master.
         job_dir = name[len('run remote job '):]
         if job_dir not in client_call(self.master, 'list_job_dirs'):
             print 'Remote job dir not exists!'
         else:
             client_call(self.master, 'start_job', job_dir, False)
     elif name.startswith('run local job '):
         print 'Job has been committed and will run in background.'
         
         # Remaining text is a local path; strip surrounding quotes.
         start = len('run local job ')
         path = name[start:].strip().strip('"').strip("'")
         if not os.path.exists(path):
             print 'Job path not exists!'
         else:
             try:
                 job = import_job(path)
             except (ImportError, AttributeError):
                 print 'Job path is illegal!'
                 return
                 
             # Zip the job (skipping .pyc), ship it to the master, then
             # ask the master to start it. Temp dir is always removed.
             dir_ = tempfile.mkdtemp()
             try:
                 zip_filename = os.path.split(path)[1].replace(' ', '_') + '.zip'
                 zip_file = os.path.join(dir_, zip_filename)
                 
                 ZipHandler.compress(zip_file, path, type_filters=("pyc", ))
                 FileTransportClient(self.master, zip_file).send_file()
                 
                 client_call(self.master, 'start_job', zip_filename)
             finally:
                 shutil.rmtree(dir_)
0
 def sync(self):
     """Merge the local inc/acc counters into the server and reset them.

     ``self.server`` is either an address string (merged via RPC) or an
     in-process server object (merged by direct call).
     """
     with self.lock:
         if isinstance(self.server, basestring):
             client_call(self.server, self.prefix + 'inc_merge',
                         self.inc_counter.container)
             client_call(self.server, self.prefix + 'acc_merge',
                         self.acc_counter.container)
         else:
             self.server.inc_merge(self.inc_counter.container)
             self.server.acc_merge(self.acc_counter.container)
         # Start the next accumulation window from zero after merging.
         self.inc_counter.reset()
         self.acc_counter.reset()
Esempio n. 24
0
 def sync(self):
     """Merge the local inc/acc counters into the server and reset them.

     ``self.server`` is either an address string (merged via RPC) or an
     in-process server object (merged by direct call).
     """
     with self.lock:
         if isinstance(self.server, basestring):
             client_call(self.server, self.prefix+'inc_merge', 
                         self.inc_counter.container)
             client_call(self.server, self.prefix+'acc_merge', 
                         self.acc_counter.container)
         else:
             self.server.inc_merge(self.inc_counter.container)
             self.server.acc_merge(self.acc_counter.container)
         # Start the next accumulation window from zero after merging.
         self.inc_counter.reset()
         self.acc_counter.reset()
Esempio n. 25
0
 def run(self):
     """Start the job once all workers are ready and wait for it to end,
     then notify the master watcher."""
     # Blocks until released elsewhere -- presumably when registration
     # completes; TODO confirm against the lock's release site.
     self.ready_lock.acquire()
     
     if not self.stopped and len(self.not_registered) == 0:
         # Seed the queue with the job's start URLs and run every node.
         self.mq_client.put(self.job.starts)
         for node in self.nodes:
             client_call(node, 'run')
         
     # Blocks until released elsewhere -- presumably on job completion.
     self.finish_lock.acquire()
     
     master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
     client_call(master_watcher, 'finish_job', self.job.real_name, ignore=True)
Esempio n. 26
0
File: loader.py Progetto: xren/cola
 def run(self):
     """Start the job once all workers are ready and wait for it to end,
     then notify the master watcher."""
     # Blocks until released elsewhere -- presumably when registration
     # completes; TODO confirm against the lock's release site.
     self.ready_lock.acquire()
     
     if not self.stopped and len(self.not_registered) == 0:
         # Seed the queue with the job's start URLs and run every node.
         self.mq_client.put(self.job.starts)
         for node in self.nodes:
             client_call(node, 'run')
         
     # Blocks until released elsewhere -- presumably on job completion.
     self.finish_lock.acquire()
     
     master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
     client_call(master_watcher, 'finish_job', self.job.real_name, ignore=True)
Esempio n. 27
0
    def run(self, args):
        """CLI entry point: list, kill, or upload (and optionally run)
        jobs on the master given by ``args.master``.

        (Python 2 code: ``except Exception, e`` syntax.)
        """
        master_addr = args.master
        ctx = Context(is_client=True, master_addr=master_addr)

        if args.list is True:
            jobs = ctx.list_jobs()
            self.logger.info('list jobs at master: %s' % ctx.master_addr)
            for job_id, info in jobs.iteritems():
                self.logger.info(
                    '====> job id: %s, job description: %s, status: %s' % \
                    (job_id, info['name'], info['status']))
            if len(jobs) == 0:
                self.logger.info('no jobs exist')
        elif args.kill is not None:
            job_id = self._get_matched_job_name(ctx, args.kill)
            if job_id is not None:
                ctx.kill_job(job_id)
                self.logger.info('killed job: %s' % job_id)
        elif args.upload is not None:
            path = os.path.abspath(args.upload)
            if not os.path.exists(path):
                self.logger.error('upload path does not exist')
                return

            job_id = None
            try:
                job_id = import_job_desc(path).uniq_name
            except Exception, e:
                self.logger.exception(e)
                self.logger.error('uploading job description failed')
                return

            # Stage the job under a temp dir named after the job id.
            new_upload_dir = os.path.join(tempfile.gettempdir(), job_id)
            if os.path.exists(new_upload_dir):
                shutil.rmtree(new_upload_dir)
            shutil.copytree(path, new_upload_dir)

            # Zip (skipping .pyc), send to the master, always clean up.
            temp_filename = os.path.join(tempfile.gettempdir(),
                                         job_id + '.zip')
            ZipHandler.compress(temp_filename,
                                new_upload_dir,
                                type_filters=('pyc', ))
            try:
                FileTransportClient(ctx.master_addr, temp_filename).send_file()
            finally:
                os.remove(temp_filename)
                shutil.rmtree(new_upload_dir)
            self.logger.info('upload job <id: %s> finished' % job_id)

            # 'U' means upload-and-run.
            if args.run == 'U':
                client_call(ctx.master_addr, 'run_job', job_id, True)
                self.logger.info('submit job <id: %s> to the cluster' % job_id)
Esempio n. 28
0
 def stop_job(self, job_real_name):
     """Stop a running job's master and kill it on every watcher.

     :param job_real_name: the job's real (unique) name
     :return: False if the job is not running, True otherwise
     """
     if job_real_name not in self.running_jobs:
         return False
     job_info = self.running_jobs[job_real_name]
     
     try:
         client_call(job_info.job_master, 'stop')
     finally:
         # Even if stopping the master raises, still kill the job on
         # every watcher and locally.
         for watcher in self.nodes_watchers.keys():
             client_call(watcher, 'kill', job_real_name)
         self.kill(job_real_name)
     
     return True
Esempio n. 29
0
    def stop_job(self, job_real_name):
        """Stop a running job's master and kill it on every watcher.

        RPC errors are ignored (``ignore=True``) so cleanup proceeds.

        :param job_real_name: the job's real (unique) name
        :return: False if the job is not running, True otherwise
        """
        if job_real_name not in self.running_jobs:
            return False
        job_info = self.running_jobs[job_real_name]

        try:
            client_call(job_info.job_master, 'stop', ignore=True)
        finally:
            # Even if stopping the master raises, still kill the job on
            # every watcher and locally.
            for watcher in self.nodes_watchers.keys():
                client_call(watcher, 'kill', job_real_name, ignore=True)
            self.kill(job_real_name)

        return True
Esempio n. 30
0
def load_job(path, master=None):
    """Load a job definition and run it on this worker.

    With no master the job runs standalone; with a master the worker
    registers itself as ready and waits for the master to flip the
    loader out of the stopped state before running.

    :param path: filesystem path of the job definition
    :param master: master address, or None for standalone mode
    :raises ValueError: if ``path`` does not exist
    """
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')

    job = import_job(path)

    holder = os.path.join(root_dir(), 'data', 'worker', 'jobs', job.real_name)
    mq_holder = os.path.join(holder, 'mq')
    if not os.path.exists(mq_holder):
        os.makedirs(mq_holder)

    # Logger
    logger = get_logger(os.path.join(holder, 'job.log'))

    # In cluster mode the node list comes from the master.
    local_node = '%s:%s' % (get_ip(), job.context.job.port)
    nodes = [local_node]
    if master is not None:
        nodes = client_call(master, 'get_nodes')

    # Bloom filter hook
    bloom_filter_file = os.path.join(holder, 'bloomfilter')
    bloom_filter_hook = create_bloom_filter_hook(bloom_filter_file, job)

    rpc_server = create_rpc_server(job)
    loader = JobLoader(job, rpc_server, logger=logger, master=master)
    # Two message-queue copies when clustered, one when standalone.
    loader.init_mq(nodes,
                   local_node,
                   mq_holder,
                   verify_exists_hook=bloom_filter_hook,
                   copies=2 if master else 1)

    if master is None:
        try:
            loader.mq.put(job.starts)
            loader.run()
        finally:
            rpc_server.shutdown()
    else:
        try:
            client_call(master, 'ready', local_node)

            def _start():
                # NOTE(review): busy-polls loader.stopped; presumably the
                # master flips it before the run begins -- confirm.
                while not loader.stopped:
                    time.sleep(TIME_SLEEP)
                loader.run()

            thread = threading.Thread(target=_start)
            thread.start()
            thread.join()
        finally:
            rpc_server.shutdown()
Esempio n. 31
0
    def start_job(self, zip_filename, uncompress=True, client=None):
        """Start a job cluster-wide from an uploaded zip.

        Optionally distributes and uncompresses the zip, spawns the job
        master loader as a subprocess, and asks every watcher to start
        the same job.

        :param zip_filename: name of the uploaded zip in ``self.zip_dir``
        :param uncompress: whether the zip still needs distribution and
            extraction (False means the job dir already exists)
        :param client: optional client address forwarded to the loader
        """
        if uncompress:
            zip_file = os.path.join(self.zip_dir, zip_filename)

            # transfer zip file to workers
            for watcher in self.nodes_watchers:
                # Skip this host -- it already has the zip.
                if watcher.split(':')[0] == self.ip_address:
                    continue
                file_trans_client = FileTransportClient(watcher, zip_file)
                file_trans_client.send_file()

            job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
        else:
            job_dir = os.path.join(self.job_dir,
                                   zip_filename.rsplit('.', 1)[0])

        job = import_job(job_dir)

        worker_port = job.context.job.port
        port = job.context.job.master_port
        nodes = [watcher.split(':')[0] for watcher in self.nodes_watchers]

        if len(nodes) > 0:
            info = MasterJobInfo(port, nodes, worker_port)
            self.running_jobs[job.real_name] = info

            # Launch loader.py (next to this file) as the job master.
            dirname = os.path.dirname(os.path.abspath(__file__))
            f = os.path.join(dirname, 'loader.py')
            workers = ['%s:%s' % (node, worker_port) for node in nodes]

            cmds = [
                'python', f, '-j', job_dir, '-i', self.ip_address, '-n',
                ' '.join(workers)
            ]
            if self.data_path is not None:
                cmds.extend(['-d', self.data_path])
            if self.force:
                cmds.append('-f')
            if client is not None:
                cmds.extend(['-c', client])
            popen = subprocess.Popen(cmds)
            info.popen = popen

            # call workers to start job
            for worker_watcher in self.nodes_watchers:
                client_call(worker_watcher,
                            'start_job',
                            zip_filename,
                            uncompress,
                            ignore=True)
Esempio n. 32
0
def load_job(path, master=None):
    """Load a job definition and run it on this worker.

    With no master the job runs standalone; with a master the worker
    registers itself as ready and waits for the master to flip the
    loader out of the stopped state before running.

    :param path: filesystem path of the job definition
    :param master: master address, or None for standalone mode
    :raises ValueError: if ``path`` does not exist
    """
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')
        
    job = import_job(path)
    
    holder = os.path.join(
        root_dir(), 'data', 'worker', 'jobs', job.real_name)
    mq_holder = os.path.join(holder, 'mq')
    if not os.path.exists(mq_holder):
        os.makedirs(mq_holder)
    
    # Logger
    logger = get_logger(os.path.join(holder, 'job.log'))
    
    # In cluster mode the node list comes from the master.
    local_node = '%s:%s' % (get_ip(), job.context.job.port)
    nodes = [local_node]
    if master is not None:
        nodes = client_call(master, 'get_nodes')
    
    # Bloom filter hook
    bloom_filter_file = os.path.join(holder, 'bloomfilter')
    bloom_filter_hook = create_bloom_filter_hook(bloom_filter_file, job)
    
    rpc_server = create_rpc_server(job)
    loader = JobLoader(job, rpc_server, logger=logger, master=master)
    # Two message-queue copies when clustered, one when standalone.
    loader.init_mq(nodes, local_node, mq_holder, 
                   verify_exists_hook=bloom_filter_hook,
                   copies=2 if master else 1)
    
    if master is None:
        try:
            loader.mq.put(job.starts)
            loader.run()
        finally:
            rpc_server.shutdown()
    else:
        try:
            client_call(master, 'ready', local_node)
            
            def _start():
                # NOTE(review): busy-polls loader.stopped; presumably the
                # master flips it before the run begins -- confirm.
                while not loader.stopped: 
                    time.sleep(TIME_SLEEP)
                loader.run()
            thread = threading.Thread(target=_start)
            thread.start()
            thread.join()
        finally:
            rpc_server.shutdown()
Esempio n. 33
0
File: job.py Progetto: Andelfin/cola
    def run(self, args):
        """CLI entry point: list, kill, or upload (and optionally run)
        jobs on the master given by ``args.master``.

        (Python 2 code: ``except Exception, e`` syntax.)
        """
        master_addr = args.master
        ctx = Context(is_client=True, master_addr=master_addr)

        if args.list is True:
            jobs = ctx.list_jobs()
            self.logger.info('list jobs at master: %s' % ctx.master_addr)
            for job_id, info in jobs.iteritems():
                self.logger.info(
                    '====> job id: %s, job description: %s, status: %s' % \
                    (job_id, info['name'], info['status']))
            if len(jobs) == 0:
                self.logger.info('no jobs exist')
        elif args.kill is not None:
            job_id = self._get_matched_job_name(ctx, args.kill)
            if job_id is not None:
                ctx.kill_job(job_id)
                self.logger.info('killed job: %s' % job_id)
        elif args.upload is not None:
            path = os.path.abspath(args.upload)
            if not os.path.exists(path):
                self.logger.error('upload path does not exist')
                return

            job_id = None
            try:
                job_id = import_job_desc(path).uniq_name
            except Exception, e:
                self.logger.exception(e)
                self.logger.error('uploading job description failed')
                return

            # Stage the job under a temp dir named after the job id.
            new_upload_dir = os.path.join(tempfile.gettempdir(), job_id)
            if os.path.exists(new_upload_dir):
                shutil.rmtree(new_upload_dir)
            shutil.copytree(path, new_upload_dir)

            # Zip (skipping .pyc), send to the master, always clean up.
            temp_filename = os.path.join(tempfile.gettempdir(), job_id+'.zip')
            ZipHandler.compress(temp_filename, new_upload_dir, type_filters=('pyc', ))
            try:
                FileTransportClient(ctx.master_addr, temp_filename).send_file()
            finally:
                os.remove(temp_filename)
                shutil.rmtree(new_upload_dir)
            self.logger.info('upload job <id: %s> finished' % job_id)
            
            # 'U' means upload-and-run.
            if args.run == 'U':
                client_call(ctx.master_addr, 'run_job', job_id, True)
                self.logger.info('submit job <id: %s> to the cluster' % job_id)
Esempio n. 34
0
 def get(self, size=1, priority=0):
     """Fetch up to ``size`` objects, polling nodes in random order.

     :param size: number of objects wanted (clamped to at least 1)
     :param priority: queue priority forwarded to each node
     :return: a single object (or None) when ``size == 1``, else a list
     """
     size = max(size, 1)
     
     # Randomize the polling order so load spreads across nodes.
     addrs = list(self.addrs)
     shuffle(addrs)
     
     results = []
     for addr in addrs:
         # Only ask for what is still missing.
         left = size - len(results)
         if left <= 0:
             break
         
         objs = pickle.loads(client_call(addr, self.prefix+'get', 
                                         left, priority))
         if objs is None:
             continue
         if not isinstance(objs, list):
             objs = [objs, ]
         results.extend(objs)
     
     if size == 1:
         if len(results) == 0:
             return
         return results[0]
     return results
Esempio n. 35
0
    def get(self, size=1, priority=0):
        """Fetch up to ``size`` objects, polling nodes in random order.

        :param size: number of objects wanted (clamped to at least 1)
        :param priority: queue priority forwarded to each node
        :return: a single object (or None) when ``size == 1``, else a list
        """
        size = max(size, 1)

        # Randomize the polling order so load spreads across nodes.
        addrs = list(self.addrs)
        shuffle(addrs)

        results = []
        for addr in addrs:
            # Only ask for what is still missing.
            left = size - len(results)
            if left <= 0:
                break

            objs = pickle.loads(
                client_call(addr, self.prefix + 'get', left, priority))
            if objs is None:
                continue
            if not isinstance(objs, list):
                objs = [
                    objs,
                ]
            results.extend(objs)

        if size == 1:
            if len(results) == 0:
                return
            return results[0]
        return results
Esempio n. 36
0
 def require(self, size=1):
     """Request up to ``size`` units of budget from the server, either
     over RPC (address string) or by direct call (server object)."""
     if not isinstance(self.server, basestring):
         return self.server.require(self.addr, self.instance_id,
                                    size=size)
     return client_call(self.server, self.prefix+'require', self.addr,
                        self.instance_id, size)
Esempio n. 37
0
File: loader.py Progetto: 52nlp/cola
 def pages(self):
     """Sum the page counts reported by all nodes; nodes that return
     None (call ignored/failed) contribute nothing."""
     total = 0
     for node in self.nodes:
         count = client_call(node, 'pages', ignore=True)
         if count is not None:
             total += int(count)
     return total
Esempio n. 38
0
 def _call(i, worker):
     """Invoke ``self.remote_func`` on ``worker``; on error, log it (if
     a logger exists) and record False.

     NOTE(review): ``i`` is unused and ``result`` is local -- unless the
     enclosing scope collects it, the value is discarded. Confirm.
     """
     try:
         result = client_call(worker, self.remote_func, *args)
     except Exception, e:
         if self.logger:
             self.logger.error(e)
         result = False
Esempio n. 39
0
File: stop.py Progetto: 0pengl/cola
def _client_call(*args):
    """Forward to ``client_call``, swallowing failures.

    Socket errors are logged; any other exception is silently ignored
    (deliberate best-effort behaviour for the stop path). Returns None
    when the call fails.
    """
    try:
        return client_call(*args)
    except socket.error:
        logger.error('Cannot connect to single running worker.')
    except:
        pass
Esempio n. 40
0
 def _call(i, worker):
     """Invoke ``self.remote_func`` on ``worker``; on error, log it (if
     a logger exists) and record False.

     NOTE(review): ``i`` is unused and ``result`` is local -- unless the
     enclosing scope collects it, the value is discarded. Confirm.
     """
     try:
         result = client_call(worker, self.remote_func, *args)
     except Exception, e:
         if self.logger:
             self.logger.error(e)
         result = False
Esempio n. 41
0
 def list_jobs(self):
     """Return {job_id: {'name': ..., 'status': 'running'|'stopped'}}.

     Reads directly from the master object when this process is the
     master, otherwise over RPC.
     """
     jobs = {}
     if self.is_master and self.master is not None:
         runnable_jobs = self.master.list_runnable_jobs()
         running_jobs = self.master.job_tracker.running_jobs
     else:
         runnable_jobs = client_call(self.master_addr, 'runnable_jobs')
         running_jobs = client_call(self.master_addr, 'running_jobs')
     for job_id, job_name in runnable_jobs.iteritems():
         jobs[job_id] = {'name': job_name}
         if job_id in running_jobs:
             jobs[job_id]['status'] = 'running'
         else:
             jobs[job_id]['status'] = 'stopped'
     
     return jobs
Esempio n. 42
0
    def list_jobs(self):
        """Return {job_id: {'name': ..., 'status': 'running'|'stopped'}}.

        Reads directly from the master object when this process is the
        master, otherwise over RPC.
        """
        jobs = {}
        if self.is_master and self.master is not None:
            runnable_jobs = self.master.list_runnable_jobs()
            running_jobs = self.master.job_tracker.running_jobs
        else:
            runnable_jobs = client_call(self.master_addr, 'runnable_jobs')
            running_jobs = client_call(self.master_addr, 'running_jobs')
        for job_id, job_name in runnable_jobs.iteritems():
            jobs[job_id] = {'name': job_name}
            if job_id in running_jobs:
                jobs[job_id]['status'] = 'running'
            else:
                jobs[job_id]['status'] = 'stopped'

        return jobs
Esempio n. 43
0
def _client_call(*args):
    """Forward to ``client_call``, swallowing failures.

    Socket errors are logged; any other exception is silently ignored
    (deliberate best-effort behaviour for the stop path). Returns None
    when the call fails.
    """
    try:
        return client_call(*args)
    except socket.error:
        logger.error('Cannot connect to single running worker.')
    except:
        pass
Esempio n. 44
0
 def pages(self):
     """Sum the page counts reported by all nodes; nodes returning None
     (call ignored/failed) contribute nothing."""
     all_pages = 0
     for node in self.nodes:
         self.logger.debug('get pages from node %s' % node)
         pages = client_call(node, 'pages', ignore=True)
         if pages is not None:
             all_pages += int(pages)
     return all_pages
Esempio n. 45
0
 def _report():
     """Heartbeat loop: register with the master and refresh the known
     worker address/ip lists from its reply until stopped."""
     while not self.stopped.is_set():
         workers = client_call(self.master, 'register_heartbeat', 
                               self.ctx.worker_addr)
         # The master's reply is the current worker list; normalize it.
         self.ctx.addrs = [self.ctx.fix_addr(worker) for worker in workers]
         self.ctx.ips = [self.ctx.fix_ip(worker) for worker in workers]
                         
         self.stopped.wait(HEARTBEAT_INTERVAL)
Esempio n. 46
0
 def run(self):
     """Seed the queue, start all nodes, and block until stopped."""
     # wait until all the workers initialized
     # NOTE(review): busy-wait; burns CPU until self.is_ready flips.
     while not self.is_ready: pass
     
     if self.limit_speed:
         self._in_minute_clear()
         
     self.mq_client.put(self.job.starts)
     for node in self.nodes:
         client_call(node, 'run')
     
     def _run():
         # Poll-sleep until some other code sets self.stopped.
         while not self.stopped:
             time.sleep(TIME_SLEEP)
     main_thread = threading.Thread(target=_run)
     main_thread.start()
     main_thread.join()
Esempio n. 47
0
    def run(self):
        """Seed the queue, start all nodes, and block until stopped."""
        # wait until all the workers initialized
        # NOTE(review): busy-wait; burns CPU until self.is_ready flips.
        while not self.is_ready:
            pass

        if self.limit_speed:
            self._in_minute_clear()

        self.mq_client.put(self.job.starts)
        for node in self.nodes:
            client_call(node, 'run')

        def _run():
            # Poll-sleep until some other code sets self.stopped.
            while not self.stopped:
                time.sleep(TIME_SLEEP)

        main_thread = threading.Thread(target=_run)
        main_thread.start()
        main_thread.join()
Esempio n. 48
0
 def get_job_counter(self, job_id):
     """Return the counter output for ``job_id``: directly from the
     local counter server when this process is the master, otherwise via
     the job's prefixed 'get_global' RPC function."""
     if self.is_master and self.master is not None:
         return self.master.counter_server.output()
     else:
         # Imported lazily to build the per-job RPC function name.
         from cola.functions.counter import FUNC_PREFIX
         from cola.core.utils import get_rpc_prefix
         
         func_name = '%s%s' % (get_rpc_prefix(job_id, FUNC_PREFIX), 'get_global')
         
         return client_call(self.master_addr, func_name)
Esempio n. 49
0
 def get_job_counter(self, job_id):
     """Return the counter output for ``job_id``: directly from the
     local counter server when this process is the master, otherwise via
     the job's prefixed 'get_global' RPC function."""
     if self.is_master and self.master is not None:
         return self.master.counter_server.output()
     else:
         # Imported lazily to build the per-job RPC function name.
         from cola.functions.counter import FUNC_PREFIX
         from cola.core.utils import get_rpc_prefix
         
         func_name = '%s%s' % (get_rpc_prefix(job_id, FUNC_PREFIX), 'get_global')
         
         return client_call(self.master_addr, func_name)
Esempio n. 50
0
    def _require_budget(self):
        """Consume one unit of crawl budget, requesting a new batch from
        the master when the local budget is exhausted.

        No-op when there is no master or the job has no limits.
        """
        if self.master is None or self.ctx.job.limits == 0:
            return

        if self.budget > 0:
            self.budget -= 1
            return

        # NOTE(review): if the master keeps returning 0 this loop issues
        # back-to-back RPC calls with no pause -- confirm intended.
        while self.budget == 0 and not self.stopped:
            self.budget = client_call(self.master, 'require', BUDGET_REQUIRE)
Esempio n. 51
0
    def finish(self):
        """Tear the job down: release locks, finish the loaders, stop
        the logger server, notify the submitting client, mark stopped."""
        self.release_lock(self.ready_lock)
        self.release_lock(self.finish_lock)

        LimitionJobLoader.finish(self)
        JobLoader.finish(self)
        self.stop_logger_server()

        try:
            for handler in self.logger.handlers:
                handler.close()
        except:
            # Best-effort close of log handlers; failures are ignored.
            pass

        if self.client is not None:
            # Re-address the client on the configured client port and
            # tell it to stop (ignore errors).
            rpc_client = '%s:%s' % (self.client.split(':')[0],
                                    main_conf.client.port)
            client_call(rpc_client, 'stop', ignore=True)

        self.stopped = True
Esempio n. 52
0
def load_job(job_path, data_path=None, master=None, force=False):
    """Load a job definition and run it with the appropriate loader.

    Standalone (no master) uses StandaloneWorkerJobLoader; otherwise the
    worker registers with the master and runs under WorkerJobLoader.

    :param job_path: filesystem path of the job definition
    :param data_path: data root; defaults to <root_dir>/data
    :param master: master address, or None for standalone mode
    :param force: forwarded to the loader
    :raises ValueError: if ``job_path`` does not exist
    """
    if not os.path.exists(job_path):
        raise ValueError('Job definition does not exist.')

    job = import_job(job_path)

    if data_path is None:
        data_path = os.path.join(root_dir(), 'data')
    root = os.path.join(data_path, 'worker', 'jobs', job.real_name)
    if not os.path.exists(root):
        os.makedirs(root)

    if master is None:
        with StandaloneWorkerJobLoader(job, root, force=force) as job_loader:
            job_loader.run()
    else:
        nodes = client_call(master, 'get_nodes')
        local = '%s:%s' % (get_ip(), job.context.job.port)
        # NOTE(review): 'ready' is called twice (here and below) --
        # looks like a duplicated registration; confirm before removing.
        client_call(master, 'ready', local)
        with WorkerJobLoader(job, root, master, local=local, nodes=nodes, force=force) \
            as job_loader:
            client_call(master, 'ready', local)
            job_loader.ready_for_run()
Esempio n. 53
0
 def start_job(self, zip_filename, uncompress=True):
     """Start a job cluster-wide from an uploaded zip.

     Optionally distributes and uncompresses the zip, spawns the job
     master loader as a subprocess, and asks every watcher to start the
     same job.

     :param zip_filename: name of the uploaded zip in ``self.zip_dir``
     :param uncompress: whether the zip still needs distribution and
         extraction (False means the job dir already exists)
     """
     if uncompress:
         zip_file = os.path.join(self.zip_dir, zip_filename)
         
         # transfer zip file to workers
         for watcher in self.nodes_watchers:
             # Skip this host -- it already has the zip.
             if watcher.split(':')[0] == self.ip_address:
                 continue
             file_trans_client = FileTransportClient(watcher, zip_file)
             file_trans_client.send_file()
         
         job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
     else:
         job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])
         
     job = import_job(job_dir)
     
     worker_port = job.context.job.port
     port = job.context.job.master_port
     nodes = [watcher.split(':')[0] for watcher in self.nodes_watchers]
     
     if len(nodes) > 0:
         info = MasterJobInfo(port, nodes, worker_port)
         self.running_jobs[job.real_name] = info
         
         # Launch loader.py (next to this file) as the job master.
         dirname = os.path.dirname(os.path.abspath(__file__))
         f = os.path.join(dirname, 'loader.py')
         workers = ['%s:%s'%(node, worker_port) for node in nodes]
         subprocess.Popen('python "%(py)s" "%(job_dir)s" %(nodes)s' % {
             'py': f,
             'job_dir': job_dir,
             'nodes': ' '.join(workers)
         })
         
         # call workers to start job
         for worker_watcher in self.nodes_watchers:
             client_call(worker_watcher, 'start_job', zip_filename, uncompress)
Esempio n. 54
0
 def _remote_or_local_get(self, addr, size=1, priority=0):
     """Get up to ``size`` objects from ``addr``'s queue -- locally when
     ``addr`` is this node, otherwise via a pickled RPC call -- falling
     back to the per-address cache when the queue comes back empty.

     :param addr: address of the node to read from
     :param size: maximum number of objects wanted
     :param priority: queue priority to read from
     :return: a single object/None (``size == 1``) or a list
     """
     if addr == self.addr_:
         objs = self.mq_node.get(size=size, priority=priority)
     else:
         objs = pickle.loads(client_call(addr, self.prefix+'get', 
                                         size, priority))
     
     addr_caches = self.caches.get(addr, [])
     if size == 1 and objs is None and len(addr_caches) > 0:
         return addr_caches.pop(0)
     # ``not objs`` also covers a None result; the original
     # ``len(objs) == 0`` raised TypeError when size > 1 and the get
     # returned None.
     elif size > 1 and not objs and len(addr_caches) > 0:
         # NOTE(review): cached items are returned without being removed
         # here, unlike the size == 1 branch -- confirm this is intended.
         return addr_caches[:size]
     
     return objs
     return objs