Example #1
class Client():
    """DrQueue client actions"""
    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()

        # enable tracking
        self.lbview.track = True

        # list of all available query keys
        self.all_task_query_keys = ['msg_id', 'header', 'content', 'buffers', 'submitted', 'client_uuid', 'engine_uuid', 'started', 'completed', 'resubmitted', 'result_header', 'result_content', 'result_buffers', 'queue', 'pyin', 'pyout', 'pyerr', 'stdout', 'stderr']


    def job_run(self, job):
        """Create and queue tasks from job object"""

        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])

        # save job in database
        job_id = DrQueueJob.store_db(job)

        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)

        # set owner of job
        self.ip_client.session.username = job['owner']

        # set number of retries for each task
        self.lbview.retries = job['retries']

        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] != None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids

        # check frame numbers
        if not (job['startframe'] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= job['startframe']):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than startframe.")
        if job['endframe'] > job['startframe']:
            if not (job['endframe'] - job['startframe'] >= job['blocksize']):
                raise ValueError("Invalid value for blocksize. Has to be equal to or lower than endframe-startframe.")
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError("Invalid value for blocksize. Has to be 1 if endframe equals startframe.")

        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
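        # e.g. startframe=1, endframe=10, blocksize=5 gives task_frames == [1, 6];
        # each task then covers frames x .. x + blocksize - 1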
        ar = None
        for x in task_frames:

            # prepare script input
            env_dict = {
            'DRQUEUE_FRAME' : x,
            'DRQUEUE_BLOCKSIZE' : job['blocksize'],
            'DRQUEUE_ENDFRAME' : job['endframe'],
            'DRQUEUE_SCENEFILE' : job['scenefile']
            }

            # log filename
            if job['created_with'] == "DrQueueOnRails":
                # take job directory name
                env_dict['DRQUEUE_LOGFILE'] = job['scenefile'].split("/")[-2] + "-" + str(x) + "_" + str(x + job['blocksize'] -1) + ".log"
            else:
                # take job name
                env_dict['DRQUEUE_LOGFILE'] = job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] -1) + ".log"

            # optional elements
            if 'renderdir' in job:
                env_dict['DRQUEUE_RENDERDIR'] = job['renderdir']
            if 'projectdir' in job:
                env_dict['DRQUEUE_PROJECTDIR'] = job['projectdir']
            if 'configdir' in job:
                env_dict['DRQUEUE_CONFIGDIR'] = job['configdir']
            if 'imagefile' in job:
                env_dict['DRQUEUE_IMAGEFILE'] = job['imagefile']
            if 'precommand' in job:
                env_dict['DRQUEUE_PRECOMMAND'] = job['precommand']
            if 'renderer' in job:
                env_dict['DRQUEUE_RENDERER'] = job['renderer']
            if 'fileformat' in job:
                env_dict['DRQUEUE_FILEFORMAT'] = job['fileformat']
            if 'postcommand' in job:
                env_dict['DRQUEUE_POSTCOMMAND'] = job['postcommand']
            if 'viewcommand' in job:
                env_dict['DRQUEUE_VIEWCOMMAND'] = job['viewcommand']
            if 'worldfile' in job:
                env_dict['DRQUEUE_WORLDFILE'] = job['worldfile']
            if 'terrainfile' in job:
                env_dict['DRQUEUE_TERRAINFILE'] = job['terrainfile']
            if 'composition' in job:
                env_dict['DRQUEUE_COMPOSITION'] = job['composition']
            if 'camera' in job:
                env_dict['DRQUEUE_CAMERA'] = job['camera']
            if 'resx' in job:
                env_dict['DRQUEUE_RESX'] = job['resx']
            if 'resy' in job:
                env_dict['DRQUEUE_RESY'] = job['resy']
            if 'renderpass' in job:
                env_dict['DRQUEUE_RENDERPASS'] = job['renderpass']
            if 'rendertype' in job:
                env_dict['DRQUEUE_RENDERTYPE'] = job['rendertype']
            if 'fileextension' in job:
                env_dict['DRQUEUE_FILEEXTENSION'] = job['fileextension']
            if 'stepframe' in job:
                env_dict['DRQUEUE_STEPFRAME'] = job['stepframe']
            if 'custom_bucket' in job:
                env_dict['DRQUEUE_CUSTOM_BUCKET'] = job['custom_bucket']
            if 'bucketsize' in job:
                env_dict['DRQUEUE_BUCKETSIZE'] = job['bucketsize']
            if 'custom_lod' in job:
                env_dict['DRQUEUE_CUSTOM_LOD'] = job['custom_lod']
            if 'lod' in job:
                env_dict['DRQUEUE_LOD'] = job['lod']
            if 'custom_varyaa' in job:
                env_dict['DRQUEUE_CUSTOM_VARYAA'] = job['custom_varyaa']
            if 'varyaa' in job:
                env_dict['DRQUEUE_VARYAA'] = job['varyaa']
            if 'raytrace' in job:
                env_dict['DRQUEUE_RAYTRACE'] = job['raytrace']
            if 'antialias' in job:
                env_dict['DRQUEUE_ANTIALIAS'] = job['antialias']
            if 'custom_bdepth' in job:
                env_dict['DRQUEUE_CUSTOM_BDEPTH'] = job['custom_bdepth']
            if 'bdepth' in job:
                env_dict['DRQUEUE_BDEPTH'] = job['bdepth']
            if 'custom_zdepth' in job:
                env_dict['DRQUEUE_CUSTOM_ZDEPTH'] = job['custom_zdepth']
            if 'zdepth' in job:
                env_dict['DRQUEUE_ZDEPTH'] = job['zdepth']
            if 'custom_cracks' in job:
                env_dict['DRQUEUE_CUSTOM_CRACKS'] = job['custom_cracks']
            if 'cracks' in job:
                env_dict['DRQUEUE_CRACKS'] = job['cracks']
            if 'custom_quality' in job:
                env_dict['DRQUEUE_CUSTOM_QUALITY'] = job['custom_quality']
            if 'quality' in job:
                env_dict['DRQUEUE_QUALITY'] = job['quality']
            if 'custom_qfiner' in job:
                env_dict['DRQUEUE_CUSTOM_QFINER'] = job['custom_qfiner']
            if 'qfiner' in job:
                env_dict['DRQUEUE_QFINER'] = job['qfiner']
            if 'custom_smultiplier' in job:
                env_dict['DRQUEUE_CUSTOM_SMULTIPLIER'] = job['custom_smultiplier']
            if 'smultiplier' in job:
                env_dict['DRQUEUE_SMULTIPLIER'] = job['smultiplier']
            if 'custom_mpcache' in job:
                env_dict['DRQUEUE_CUSTOM_MPCACHE'] = job['custom_mpcache']
            if 'mpcache' in job:
                env_dict['DRQUEUE_MPCACHE'] = job['mpcache']
            if 'custom_smpolygon' in job:
                env_dict['DRQUEUE_CUSTOM_SMPOLYGON'] = job['custom_smpolygon']
            if 'smpolygon' in job:
                env_dict['DRQUEUE_SMPOLYGON'] = job['smpolygon']
            if 'custom_wh' in job:
                env_dict['DRQUEUE_CUSTOM_WH'] = job['custom_wh']
            if 'custom_type' in job:
                env_dict['DRQUEUE_CUSTOM_TYPE'] = job['custom_type']
            if 'ctype' in job:
                env_dict['DRQUEUE_CTYPE'] = job['ctype']
            if 'skipframes' in job:
                env_dict['DRQUEUE_SKIPFRAMES'] = job['skipframes']

            # set dependencies
            dep_dict = {}
            dep_dict['job_id'] = job_id
            if ('os' in job['limits']) and (job['limits']['os'] != None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] != None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()

        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True


    def identify_computer(self, engine_id, cache_time, timeout=15):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db_by_engine_id(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine['created_at'] + cache_time):
            print("DEBUG: Engine %i was found in DB and info is up-to-date." % engine_id)
            return engine
        # store new info
        else:
            if engine != None:
                print("DEBUG: Engine %i was found in DB, but info needs to be updated." % engine_id)
            else:
                print("DEBUG: Engine %i was not found in DB." % engine_id)
            # run command only on specific computer
            try:
                dview = self.ip_client[engine_id]
            except IndexError:
                print("DEBUG: Engine with id %i unknown." % engine_id)
                # delete old entry from database
                DrQueueComputer.delete_from_db_by_engine_id(engine_id)
                print("DEBUG: Engine with id %i deleted from database." % engine_id)
                new_engine = None
            else:
                # run command in async mode
                dview.block = False
                command = "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer()"
                ar = dview.execute(command)
                try:
                    # try to get results & wait until timeout
                    ar.get(timeout)
                except Exception:
                    if engine != None:
                        print("DEBUG: Update request for engine %i timed out. Using old information from DB." % engine_id)
                        new_engine = engine
                    else:
                        print("DEBUG: Information request for engine %i timed out." % engine_id)
                        new_engine = None
                else:
                    # get computer dict from engine namespace
                    new_engine = dview['engine']
                    # set to known engine_id
                    new_engine['engine_id'] = engine_id
                    # set creation time
                    new_engine['created_at'] = int(time.time())
                    # store entry in database
                    DrQueueComputer.store_db(new_engine)
            return new_engine


    def computer_set_pools(self, computer, pool_list):
        """add computer to list of pools"""
        # convert to string
        pool_str = ','.join(pool_list)
        # update environment variable on engine
        dview = self.ip_client[computer['engine_id']]
        dview.block = True
        command = "import os\nos.environ[\"DRQUEUE_POOL\"] = \"" + pool_str + "\""
        dview.execute(command)
        # update database entry
        computer['pools'] = pool_list
        DrQueueComputer.store_db(computer)
        print("DEBUG: Engine " + str(computer['engine_id']) + " added to pools " + pool_str + ".")
        return computer


    def computer_get_pools(self, computer):
        """Return all pool names where computer is member."""
        return computer['pools']


    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar


    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()


    def query_job_by_id(self, job_id):
        """Query job by given id"""
        return DrQueueJob.query_db(job_id)


    def query_job_by_name(self, job_name):
        """Query job by given name"""
        return DrQueueJob.query_job_by_name(job_name)


    def query_job_tasks_left(self, job_id):
        """Query left frames of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] == None:
                left += 1
        return left


    def query_job_finish_time(self, job_id):
        """Query oldest finish time of all tasks."""
        job = self.query_job_by_id(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job ) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if older finish time exists
            if (task['completed'] != None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time


    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        if ('buffers' in task) and task['buffers'] != []:
            frame_nr = int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])
        else:
            frame_nr = 1
        return frame_nr


    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job. Sort by frame number."""
        task_list =  self.ip_client.db_query({'header.session' : str(job_id)}, keys=self.all_task_query_keys)
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list


    def query_interrupted_task_list(self, job_id):
        """Query a list of interrupted tasks of certain job. Sort by frame number."""
        job = self.query_job_by_id(job_id)
        task_list =  self.ip_client.db_query({'header.session' : str(job_id)}, keys=self.all_task_query_keys)
        interrupted_task_list = []

        for task in task_list:
            frame_nr = self.get_frame_nr(task)
            print("frame_nr: " + str(frame_nr))
            # log filename
            if job['renderer'] == "blender":
                filesearch = job['scenefile'] + str("%04d" % frame_nr) + ".???"
                found = glob.glob(filesearch)
                # file was found
                if len(found) > 0:
                    outputfile = found[0]
                    print("outputfile: "+ str(outputfile))
                    filesize = os.path.getsize(outputfile)
                    print(filesize)
                    # file exists, but is empty
                    if filesize == 0:
                        interrupted_task_list.append(task)
                # file was not found
                else:
                    outputfile = None
                    print("outputfile: "+ str(outputfile))
                    if (task['completed'] == None) and (task['started'] == None):
                        interrupted_task_list.append(task)
            else:
                raise ValueError("Only Blender renderer supported so far.")

        return interrupted_task_list


    def query_task(self, task_id):
        """Query a single task."""
        task = self.ip_client.db_query({'msg_id' : task_id }, keys=self.all_task_query_keys)[0]
        return task


    def query_computer_list(self):
        """Query a list of all computers."""
        return self.ip_client.ids


    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""

        # disable job
        self.job_disable(job_id)

        tasks = self.query_task_list(job_id)
        tasks_to_stop = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_header" in task) and (task["result_header"] != None) and (task["result_header"]["status"] == "ok"):
                print("  finished at " + str(task["completed"]))
            else:
                # get task stats of all computers
                stats = self.ip_client.queue_status('all', True)
                # check if task is already running on an engine
                found_on_engine = False
                for key,status in list(stats.items()):
                    if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                        # skip tasks which are already running on an engine
                        print("  not finished yet but already queued to engine. will leave it there.")
                        found_on_engine = True
                        break

                # if a task isn't already queued/running on an engine, it should be safe to abort it
                if found_on_engine == False:
                    print("  not finished yet. will abort.")
                    tasks_to_stop.append(task['msg_id'])

        if len(tasks_to_stop) > 0:
            try:
                self.ip_client.abort(tasks_to_stop)
            except Exception as e:
                print("ERROR: " + str(e))

        return True


    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""

        # disable job
        self.job_disable(job_id)

        tasks = self.query_task_list(job_id)
        running_engines = []
        tasks_to_stop = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key,status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            tasks_to_stop.append(task['msg_id'])
        # stop all matching tasks at once
        try:
            self.ip_client.abort(tasks_to_stop)
        except Exception as e:
            print("ERROR: " + str(e))

        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True


    def job_disable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = False
        DrQueueJob.update_db(job)
        return True


    def job_enable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = True
        DrQueueJob.update_db(job)
        return True


    def job_delete(self, job_id):
        """Delete job and all of it's tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_computer_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task['msg_id'])
            self.ip_client.purge_results(task['msg_id'])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True


    def job_continue(self, job_id):
        """Continue stopped job and all of it's tasks"""
        job = self.query_job_by_id(job_id)

        # enable job
        self.job_enable(job_id)

        tasks = self.query_task_list(job_id)
        tasks_to_resubmit = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_header" in task) and (task["result_header"] != None) and (task["result_header"]["status"] == "ok"):
                print("  finished at " + str(task["completed"]))
            else:
                print("  not finished yet. will resubmit.")
                tasks_to_resubmit.append(task["msg_id"])

        if len(tasks_to_resubmit) > 0:

            # resubmit all matching msg_ids at once
            try:
                async_results = self.ip_client.resubmit(tasks_to_resubmit)
            except Exception as e:
                print("ERROR: " + str(e))

            # IPython seems to give out new msg_ids instead of re-using the old ones
            for msg_id in async_results.msg_ids:
                print("got new msg_id: " + msg_id)

            # delete old tasks which now have a resubmitted clone
            try:
                self.ip_client.purge_results(tasks_to_resubmit)
            except Exception as e:
                print("ERROR: " + str(e))

        return True


    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job_by_id(job_id)

        # enable job
        job['enabled'] = True
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)

        tasks = self.query_task_list(job_id)
        tasks_to_resubmit = []
        # get all msg_ids of job
        for task in tasks:
            tasks_to_resubmit.append(task["msg_id"])

        # resubmit all msg_ids at once
        try:
            async_results = self.ip_client.resubmit(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))

        # IPython seems to give out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)

        # delete old tasks which now have a resubmitted clone
        try:
            self.ip_client.purge_results(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))

        # kickstart all computers
        running_engines = []
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key,status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)

        return True


    def task_rerun(self, task_id):
        """Run task another time"""
        task = self.query_task(task_id)

        #print(task)

        # enable job
        #job['enabled'] = True
        # set resubmit time
        #job['requeue_time'] = datetime.datetime.now()
        #DrQueueJob.update_db(job)

        # resubmit msg_id of task
        try:
            async_results = self.ip_client.resubmit(task["msg_id"])
        except Exception as e:
            print("ERROR: " + str(e))

        # IPython seems to give out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)

        # delete old tasks which now have a resubmitted clone
        try:
            self.ip_client.purge_results(task["msg_id"])
        except Exception as e:
            print("ERROR: " + str(e))

        # kickstart all computers
        running_engines = []
        stats = self.ip_client.queue_status('all', True)
        # check if task is already running on an engine
        for key,status in list(stats.items()):
            if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)

        return True


    def job_rerun_interrupted_tasks(self, job_id):
        """Run interrupted tasks of job another time"""
        job = self.query_job_by_id(job_id)

        # enable job
        job['enabled'] = True
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)

        tasks = self.query_interrupted_task_list(job_id)

        if len(tasks) == 0:
            return True

        tasks_to_resubmit = []
        # get all msg_ids of job
        for task in tasks:
            tasks_to_resubmit.append(task["msg_id"])

        # resubmit all msg_ids at once
        try:
            async_results = self.ip_client.resubmit(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))

        # IPython seems to give out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)

        # delete old tasks which now have a resubmitted clone
        try:
            self.ip_client.purge_results(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))

        # kickstart all computers
        running_engines = []
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key,status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)

        return True


    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task['completed'] == None:
                status_pending += 1
            else:
                if 'result_content' in list(task.keys()):
                    result_content = task['result_content']
                    # look for done tasks
                    if ('status' in list(result_content.keys())) and (result_content['status'] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task has unknown status, job status is unknown
        if status_unknown > 0:
            status = "unknown"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status


    def job_estimated_finish_time(self, job_id):
        """Calculate estimated finish time of job."""
        tasks = self.query_task_list(job_id)
        spent_times = []
        # get spent time for each finished task
        for task in tasks:
            if task['completed'] != None:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        timediff = task['completed'] - task['started']
                        spent_times.append(timediff)
        if len(spent_times) > 0:
            # calculate sum of spent time
            sum_times = datetime.timedelta(0)
            for spent in spent_times:
                sum_times += spent
            # calculate mean time for a single task
            sum_times_secs = sum_times.days * 86400 + sum_times.seconds
            meantime_secs = sum_times_secs / len(spent_times)
            meantime = datetime.timedelta(0, meantime_secs)
            # calculate estimated time left
            tasks_left = len(tasks) - len(spent_times)
            time_left = tasks_left * meantime
            # query job object
            job = self.query_job_by_id(job_id)
            # look if all tasks are already done
            if self.query_job_tasks_left(job_id) == 0:
                finish_time = self.query_job_finish_time(job_id)
            else:
                # calculate estimated finish time, use requeue time if available
                if ('requeue_time' in job ) and (job['requeue_time'] != False):
                    finish_time = job['requeue_time'] + time_left
                else:
                    finish_time = job['submit_time'] + time_left
        else:
            meantime = "unknown"
            time_left = "unknown"
            finish_time = "unknown"
        return meantime, time_left, finish_time


    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db_by_engine_id(engine_id)
        # we stop the engine
        try:
            self.ip_client.shutdown(engine_id, False, False, True)
        except Exception:
            return False
        return True
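
The Client class above assumes several imports that are not part of the listing. A minimal sketch of what it likely needs (the module paths for the DrQueue helpers are inferred from the command strings used elsewhere in the class, so treat them as assumptions):

import os
import glob
import time
import pickle
import datetime

from IPython.parallel import Client as IPClient
from IPython.parallel import dependent

import DrQueue
from DrQueue import Job as DrQueueJob
from DrQueue import Computer as DrQueueComputer
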
Example #2
            yield chunk
            del chunk[:]
        chunk.append(line)
    yield chunk


for f in namefiles:
    print f
    total_chunks = 0
    full_path = namefile_path + "/" + f
    full_output_path = full_path + ".namestd"
    output_conn = open(full_output_path, "wt")
    output_writer = csv.DictWriter(output_conn, fieldnames=fieldnames)
    with open(full_path, "rt") as namefile:
        reader = csv.DictReader(namefile, fieldnames=fieldnames)
        for process_chunk in gen_chunks(reader, chunksize=block_size):
            t0 = time.time()
            out = clean_wrapper.map(process_chunk)
            output_writer.writerows(out)
            t1 = time.time()
            total_chunks += 1
            del out[:]
            print total_chunks, total_chunks * block_size, (t1 - t0) / block_size
            if total_chunks % 10 == 0 and total_chunks > 0:
                ## Clean out cached objects on the clients
                rc.purge_results(targets=rc.ids)
                dview.results.clear()
                rc.results.clear()
                gc.collect()
    output_conn.close()
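
Example #2 opens midway through a chunking generator whose first lines were cut off. A plausible reconstruction of that generator, consistent with the lines shown above (an assumption, not the original source):

def gen_chunks(reader, chunksize=100):
    """Yield lists of up to chunksize rows read from a csv.DictReader."""
    chunk = []
    for index, line in enumerate(reader):
        if index % chunksize == 0 and index > 0:
            yield chunk
            # clear the list in place; the caller has already consumed this chunk
            del chunk[:]
        chunk.append(line)
    yield chunk
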
Example #3
class Client():
    """DrQueue client actions"""
    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()

        # enable tracking
        self.lbview.track = True


    def job_run(self, job):
        """Create and queue tasks from job object"""

        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])

        # save job in database
        job_id = DrQueueJob.store_db(job)

        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)

        # set owner of job
        self.ip_client.session.username = job['owner']

        # set number of retries for each task
        self.lbview.retries = job['retries']

        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] != None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids

        # check frame numbers
        if not (job['startframe'] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= job['startframe']):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than startframe.")
        if job['endframe'] > job['startframe']:
            if not (job['endframe'] - job['startframe'] >= job['blocksize']):
                raise ValueError("Invalid value for blocksize. Has to be equal to or lower than endframe-startframe.")
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError("Invalid value for blocksize. Has to be 1 if endframe equals startframe.")

        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
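        # e.g. startframe=1, endframe=10, blocksize=5 gives task_frames == [1, 6];
        # each task then covers frames x .. x + blocksize - 1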
        ar = None
        for x in task_frames:
            # prepare script input
            env_dict = {
            'DRQUEUE_FRAME' : x,
            'DRQUEUE_BLOCKSIZE' : job['blocksize'],
            'DRQUEUE_ENDFRAME' : job['endframe'],
            'DRQUEUE_SCENEFILE' : job['scenefile'],
            'DRQUEUE_LOGFILE' : job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] -1) + ".log"
            }

            # optional elements
            if 'renderdir' in job:
                env_dict['DRQUEUE_RENDERDIR'] = job['renderdir']
            if 'projectdir' in job:
                env_dict['DRQUEUE_PROJECTDIR'] = job['projectdir']
            if 'configdir' in job:
                env_dict['DRQUEUE_CONFIGDIR'] = job['configdir']
            if 'imagefile' in job:
                env_dict['DRQUEUE_IMAGEFILE'] = job['imagefile']
            if 'precommand' in job:
                env_dict['DRQUEUE_PRECOMMAND'] = job['precommand']
            if 'renderer' in job:
                env_dict['DRQUEUE_RENDERER'] = job['renderer']
            if 'fileformat' in job:
                env_dict['DRQUEUE_FILEFORMAT'] = job['fileformat']
            if 'postcommand' in job:
                env_dict['DRQUEUE_POSTCOMMAND'] = job['postcommand']
            if 'viewcommand' in job:
                env_dict['DRQUEUE_VIEWCOMMAND'] = job['viewcommand']
            if 'worldfile' in job:
                env_dict['DRQUEUE_WORLDFILE'] = job['worldfile']
            if 'terrainfile' in job:
                env_dict['DRQUEUE_TERRAINFILE'] = job['terrainfile']
            if 'composition' in job:
                env_dict['DRQUEUE_COMPOSITION'] = job['composition']
            if 'camera' in job:
                env_dict['DRQUEUE_CAMERA'] = job['camera']
            if 'resx' in job:
                env_dict['DRQUEUE_RESX'] = job['resx']
            if 'resy' in job:
                env_dict['DRQUEUE_RESY'] = job['resy']
            if 'renderpass' in job:
                env_dict['DRQUEUE_RENDERPASS'] = job['renderpass']
            if 'rendertype' in job:
                env_dict['DRQUEUE_RENDERTYPE'] = job['rendertype']
            if 'fileextension' in job:
                env_dict['DRQUEUE_FILEEXTENSION'] = job['fileextension']
            if 'stepframe' in job:
                env_dict['DRQUEUE_STEPFRAME'] = job['stepframe']
            if 'custom_bucket' in job:
                env_dict['DRQUEUE_CUSTOM_BUCKET'] = job['custom_bucket']
            if 'bucketsize' in job:
                env_dict['DRQUEUE_BUCKETSIZE'] = job['bucketsize']
            if 'custom_lod' in job:
                env_dict['DRQUEUE_CUSTOM_LOD'] = job['custom_lod']
            if 'lod' in job:
                env_dict['DRQUEUE_LOD'] = job['lod']
            if 'custom_varyaa' in job:
                env_dict['DRQUEUE_CUSTOM_VARYAA'] = job['custom_varyaa']
            if 'varyaa' in job:
                env_dict['DRQUEUE_VARYAA'] = job['varyaa']
            if 'raytrace' in job:
                env_dict['DRQUEUE_RAYTRACE'] = job['raytrace']
            if 'antialias' in job:
                env_dict['DRQUEUE_ANTIALIAS'] = job['antialias']
            if 'custom_bdepth' in job:
                env_dict['DRQUEUE_CUSTOM_BDEPTH'] = job['custom_bdepth']
            if 'bdepth' in job:
                env_dict['DRQUEUE_BDEPTH'] = job['bdepth']
            if 'custom_zdepth' in job:
                env_dict['DRQUEUE_CUSTOM_ZDEPTH'] = job['custom_zdepth']
            if 'zdepth' in job:
                env_dict['DRQUEUE_ZDEPTH'] = job['zdepth']
            if 'custom_cracks' in job:
                env_dict['DRQUEUE_CUSTOM_CRACKS'] = job['custom_cracks']
            if 'cracks' in job:
                env_dict['DRQUEUE_CRACKS'] = job['cracks']
            if 'custom_quality' in job:
                env_dict['DRQUEUE_CUSTOM_QUALITY'] = job['custom_quality']
            if 'quality' in job:
                env_dict['DRQUEUE_QUALITY'] = job['quality']
            if 'custom_qfiner' in job:
                env_dict['DRQUEUE_CUSTOM_QFINER'] = job['custom_qfiner']
            if 'qfiner' in job:
                env_dict['DRQUEUE_QFINER'] = job['qfiner']
            if 'custom_smultiplier' in job:
                env_dict['DRQUEUE_CUSTOM_SMULTIPLIER'] = job['custom_smultiplier']
            if 'smultiplier' in job:
                env_dict['DRQUEUE_SMULTIPLIER'] = job['smultiplier']
            if 'custom_mpcache' in job:
                env_dict['DRQUEUE_CUSTOM_MPCACHE'] = job['custom_mpcache']
            if 'mpcache' in job:
                env_dict['DRQUEUE_MPCACHE'] = job['mpcache']
            if 'custom_smpolygon' in job:
                env_dict['DRQUEUE_CUSTOM_SMPOLYGON'] = job['custom_smpolygon']
            if 'smpolygon' in job:
                env_dict['DRQUEUE_SMPOLYGON'] = job['smpolygon']
            if 'custom_wh' in job:
                env_dict['DRQUEUE_CUSTOM_WH'] = job['custom_wh']
            if 'custom_type' in job:
                env_dict['DRQUEUE_CUSTOM_TYPE'] = job['custom_type']
            if 'ctype' in job:
                env_dict['DRQUEUE_CTYPE'] = job['ctype']
            if 'skipframes' in job:
                env_dict['DRQUEUE_SKIPFRAMES'] = job['skipframes']

            # set dependencies
            dep_dict = {}
            if ('os' in job['limits']) and (job['limits']['os'] != None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] != None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()

        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True


    def identify_computer(self, engine_id, cache_time):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine['date'] + cache_time):
            print("DEBUG: Engine %i was found in DB" % engine_id)
        # store new info
        else:
            print("DEBUG: Engine %i was not found in DB" % engine_id)
            # run command only on specific computer
            dview = self.ip_client[engine_id]
            dview.block = True
            dview.execute("import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer(" + str(engine_id) + ")")
            engine = dview['engine']
            engine['date'] = int(time.time())
            DrQueueComputer.store_db(engine)
        return engine


    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar


    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()


    def query_running_job_list(self):
        """Query a list of all running jobs"""
        jobs = DrQueueJob.query_job_list()
        running_jobs = []
        for job in jobs:
            if self.query_job_tasks_left(job['_id']) > 0:
                running_jobs.append(job)
        return running_jobs


    def query_jobname(self, task_id):
        """Query jobname from task id"""
        data = self.ip_client.db_query({"msg_id" : task_id})
        job_id = data[0]['header']['session']
        job = DrQueueJob.query_db(job_id)
        return job.name


    def query_job(self, job_id):
        """Query job from id"""
        return DrQueueJob.query_db(job_id)


    def query_job_by_name(self, job_name):
        """Query job from name"""
        return DrQueueJob.query_job_by_name(job_name)


    def query_job_tasks_left(self, job_id):
        """Query left frames of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] == None:
                left += 1
        return left


    def query_job_finish_time(self, job_id):
        """Query oldest finish time of all tasks."""
        job = self.query_job(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job ) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if older finish time exists
            if (task['completed'] != None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time


    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        return int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])


    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job"""
        task_list =  self.ip_client.db_query({'header.session' : str(job_id)})
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list


    def query_task(self, task_id):
        """Query a single task"""
        task = self.ip_client.db_query({'msg_id' : task_id })[0]
        return task


    def query_engine_list(self):
        """Query a list of all engines"""
        return self.ip_client.ids


    def query_engines_of_pool(self, pool_name):
        """Return available engines of certain pool."""
        pool_computers = self.ip_client.ids
        if pool_name != None:
            computers = DrQueueComputerPool.query_pool_members(pool_name)
            if computers == None:
                raise ValueError("Pool \"%s\" does not exist!" % pool_name)
            # filter engines without mutating the list while iterating over it
            pool_computers = [comp for comp in pool_computers if comp in computers]
            if pool_computers == []:
                raise ValueError("No computer of pool %s is available!" % pool_name)
            print("DEBUG: matching pool: " + pool_name)
            print(pool_computers)
        return pool_computers


    def query_engines_of_os(self, os_name):
        """Return only engines running certain OS."""
        # run job only on matching os
        matching_os = self.ip_client.ids
        if os_name != None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if not os_name in engine['os']:
                    matching_os.remove(engine_id)
            print("DEBUG: matching os: " + os_name)
            print(matching_os)
        return matching_os


    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching minram
        matching_minram = self.ip_client.ids
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine['memory'] < minram:
                    matching_minram.remove(engine_id)
            print("DEBUG: matching minram: " + str(minram))
            print(matching_minram)
        return matching_minram


    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least mincores CPU cores."""
        # run job only on matching mincores
        matching_mincores = self.ip_client.ids
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine['ncorescpu'] * engine['ncpus'] < mincores:
                    matching_mincores.remove(engine_id)
            print("DEBUG: matching mincores: " + str(mincores))
            print(matching_mincores)
        return matching_mincores


    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Match all limits for job."""
        tmp_list = []
        # build list with all list members
        tmp_list.extend(os_list)
        tmp_list.extend(minram_list)
        tmp_list.extend(mincores_list)
        tmp_list.extend(pool_list)
        # make entries unique
        tmp_list = set(tmp_list)
        tmp_list = list(tmp_list)
        matching_limits = []
        for entry in tmp_list:
            # look if entry is in all lists
            if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print("DEBUG: %i isn't matching limits" % entry)
        print("DEBUG: matching limits:")
        print(matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print(message)
            raise Exception(message)
        elif len(matching_limits) > 0:
            # only run on matching engines
            self.lbview = self.ip_client.load_balanced_view(matching_limits)
        else:
            self.lbview = self.ip_client.load_balanced_view()


    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task['msg_id'])
        return True


    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key,status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            self.ip_client.abort(task['msg_id'])
        # restart all engines which still run a task
        running_engines = set(running_engines)
        return True


    def job_delete(self, job_id):
        """Delete job and all of it's tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task['msg_id'])
            self.ip_client.purge_results(task['msg_id'])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True


    def task_continue(self, task_id):
        """Continue aborted or failed task"""
        task = self.query_task(task_id)
        # check if action is needed
        if (task['completed'] != None) and ((task['result_header']['status'] == "error") or (task['result_header']['status'] == "aborted")):
            self.task_requeue(task_id)
        return True


    def task_requeue(self, task_id):
        """Requeue task"""
        self.ip_client.resubmit(task_id)
        print("requeuing %s" % task_id)
        return True


    def job_continue(self, job_id):
        """Continue stopped job and all of it's tasks"""
        job = self.query_job(job_id)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task['msg_id'])
        return True


    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job(job_id)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task['msg_id'])
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)
        return True


    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task['completed'] == None:
                status_pending += 1
            else:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    # look for done tasks
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status


    def job_estimated_finish_time(self, job_id):
        """Calculate estimated finish time of job."""
        tasks = self.query_task_list(job_id)
        spent_times = []
        # get spent time for each finished task
        for task in tasks:
            if task['completed'] != None:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        timediff = task['completed'] - task['started']
                        spent_times.append(timediff)
        if len(spent_times) > 0:
            # calculate sum of spent time
            sum_times = datetime.timedelta(0)
            for spent in spent_times:
                sum_times += spent
            # calculate mean time for a single task
            meantime = sum_times / len(spent_times)
            # calculate estimated time left
            tasks_left = len(tasks) - len(spent_times)
            time_left = tasks_left * meantime
            # query job object
            job = self.query_job(job_id)
            # look if all tasks are already done
            if self.query_job_tasks_left(job_id) == 0:
                finish_time = self.query_job_finish_time(job_id)
            else:
                # calculate estimated finish time, use requeue time if available
                if ('requeue_time' in job ) and (job['requeue_time'] != False):
                    finish_time = job['requeue_time'] + time_left
                else:
                    finish_time = job['submit_time'] + time_left
        else:
            meantime = "unknown"
            time_left = "unknown"
            finish_time = "unknown"
        return meantime, time_left, finish_time


    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shutdown computer
        self.ip_client.shutdown(engine_id)
        return True


    def engine_restart(self, engine_id):
        """Restart a specific engine"""
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
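
A minimal usage sketch for this variant of the class, tying the query_engines_* helpers to match_all_limits before queueing a job (the helper function itself and the job dict layout are assumptions):

def submit_with_limits(client, job):
    """Narrow the load-balanced view to engines matching the job limits, then queue the job."""
    limits = job['limits']
    os_list = client.query_engines_of_os(limits.get('os'))
    minram_list = client.query_engines_with_minram(limits.get('minram', 0))
    mincores_list = client.query_engines_with_mincores(limits.get('mincores', 0))
    pool_list = client.query_engines_of_pool(limits.get('pool_name'))
    # raises if no engine satisfies all limits, otherwise narrows client.lbview
    client.match_all_limits(os_list, minram_list, mincores_list, pool_list)
    return client.job_run(job)
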
Example #4
File: jobs.py Project: revoltek/mpip
    def run(self):
        if self.session.get_client() == None:
            self.mylog.error("Not connected to a cluster.")
            return False

        # workaround for an IPython bug which makes everything slow:
        # create a new client, use it and delete it
        c = Client(profile='ssh')

        jcmd = self.session.opts.get_opt('jcmd')
        if jcmd == 'purge':
            num = 0
            query = c.db_query({'completed':{'$ne' : None }},['msg_id'])
            for q in query:
                result = c.get_result(q['msg_id']).get()
                # filter on SB, node, task
                if self._check_result(result):
                    num += 1
                    c.purge_results(q['msg_id'])
            mylogger.userinfo(self.mylog, str(num)+" cluster's hub results deleted.")

        elif jcmd == 'list':
            num = 0
            # query the hub DB for all the finished tasks and get IDs
            query = c.db_query({'completed':{'$ne' : None }},['msg_id','completed','started'])
            # search for interesting results and print them
            for q in query:
                result = c.get_result(q['msg_id']).get()

                # filter on SB, node, task
                if self._check_result(result):
                    # skip results without error if wanted
                    if self.session.opts.get_opt('onlyerr') and result['err'] == '': continue
                    num += 1
                    header = {'Task' : result['task'], 'Node' : result['node'],\
                          'SB' : result['SB'], \
                          'Completed' : q['completed'].replace(microsecond=0), \
                          'Started' : q['started'].replace(microsecond=0), \
                          'Exec time': q['completed'].replace(microsecond=0)-q['started'].replace(microsecond=0)}
                    data = {'Std Output': result['out'], 'Std Error': result['err'], \
                          'Command':result['command']}
                    print_jobs(header, data, self.session.opts.get_opt('lines'))
            mylogger.userinfo(self.mylog, str(num)+" processes listed.")

        elif jcmd == 'running':
            num_r = 0
            num_q = 0
            # TODO: it should be "Started" not "submitted", unfortunately ipython does not set it
            query = c.db_query({'completed': None},['buffers','engine_uuid','submitted'])
            for q in query:
                # unpack the buffer of the sent jobs to obtain the arguments
                null, com, args = unpack_apply_message(q['buffers'])
                # filter on SB, node, task
                if self._check_result({'node':args['node'],'SB':args['SB'],'task':args['task']}):

                    if q['engine_uuid'] == None:
                        if self.session.opts.get_opt('queue') == False: continue
                        q['msg_id'] = q['msg_id']+" (queue)"
                        num_q += 1
                    else:
                        num_r += 1

                    header = {'Msg_id' : q['msg_id'], 'Task' : args['task'], 'Node' : args['node'], 'SB' : args['SB'], \
                       'Started' : q['submitted'].replace(microsecond=0), \
                       'Extime': datetime.datetime.now().replace(microsecond=0) - q['submitted'].replace(microsecond=0)}

                    data = {'Command': com[0]}
                    print_jobs(header, data, self.session.opts.get_opt('lines'))


            mylogger.userinfo(self.mylog, "Processes running: "+str(num_r)+". In queue: "+str(num_q)+".")
        
        elif jcmd == 'kill':
            print "TBI"

        #TODO: add a resubmit option to resubmit all tasks that failed http://ipython.org/ipython-doc/stable/parallel/parallel_task.html

        del c
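
The _check_result() helper called above is not part of this snippet; a hedged sketch of what such a filter might look like, assuming the session options expose optional 'sb', 'node' and 'task' keys (all of these names are assumptions):

    def _check_result(self, result):
        # Hypothetical filter: keep a result only if it matches the requested
        # SB/node/task options; an unset option matches everything.
        for opt_name, key in (('sb', 'SB'), ('node', 'node'), ('task', 'task')):
            wanted = self.session.opts.get_opt(opt_name)
            if wanted is not None and result.get(key) != wanted:
                return False
        return True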
Example #5
0
class Load_balanced_view(object):
    """class that implements the initialisation of a ipython parallel
    load_ballance_view performing some check. It also execute allows to execute
    some python command on all engines, submit pieces of code, and print a
    progress log. A cleanup function provided
    """
    def __init__(self, client=None, profile='default'):
        """
        Start a load_balanced_view from IPython.parallel. If a client is not
        given, checks are run to see whether ipcluster exists and engines are
        running. If they are not, the computation is switched to serial
        execution. Otherwise the client and the load balanced view are
        initialised.

        Parameters
        ----------
        *client*: an IPython parallel client
            if *None*, a new client is created
        *profile*: IPython profile. Used only if *client* is *None*
        """

        self.do_parallel = True  #everything ok
        try:  #try to import Client
            if client != None:
                self.c = client
            else:
                self.c = Client(profile=profile)
            self.engines_id = self.c.ids  #get the id of the engines
            self.dview = self.c[:]
            self.lbview = self.c.load_balanced_view()  #load view
        except ImportError:  #if the import fails
            print("""Ipython.parallel.Client cannot be imported.\
 Make sure to have Ipython version > 0.11 installed.""")
            self.do_parallel = self._continue_serial()
        except error.NoEnginesRegistered:  #if no engines are registered
            print("""The IPython cluster has not been started; start it\
 before executing the code, e.g. 'ipcluster start --n=4'.""")
            self.do_parallel = self._continue_serial()

    def _continue_serial(self):
        """asks if the user wants to continue in serial mode or quit"""
        import io_custom as sio
        message = "Do you want to continue in serial mode"
        if (sio.yes_or_not(message, 'y')):
            return False  #disable the parallel computation
        else:
            exit()

    def is_parallel_enabled(self):
        """Returns *True* if the initialization went fine, othewise *False*
        output
        ------
        *parallel*: bool
            *True* if the paraller environment has been set up without
            problems, *False* otherwise
        """
        return self.do_parallel

    def exec_on_engine(self, code, block=True):
        """
        Execute the given code on all engines

        Parameters
        ----------
        code: string or list of strings
            command(s) to execute on all the nodes. Intended for short tasks,
            like importing modules
        block: bool
            whether or not to wait until done to return. default: True
        """
        #Six: Python 2 and 3 Compatibility Library
        from six import string_types  #appropriate string type
        if isinstance(code, string_types):  # if it's a string
            code = [
                code,
            ]  # convert to list
        # execute the required commands
        # (better to do in block mode, avoids errors if command is slow)
        for te in code:
            try:
                self.dview.execute(te, block=block)
            except error.CompositeError as e:  # if an error occurs, print a single one, not one per engine
                e.raise_exception()

    def push(self, variables):
        """
        wrapper around dview.push(dict)
        push a dictionary of variables to the ipython engines
        Parameters
        ----------
        variables: dictionary
            dictionary of variables
        """
        self.dview.push(variables)

    def apply(self, f, *args, **kwargs):
        """
        wrapper around 'lview.apply(self, f, *args, **kwargs)'

        Docstring:
            calls f(*args, **kwargs) on remote engines, returning the result.

        This method sets all apply flags via this View's attributes.

        if self.block is False:
            returns AsyncResult
        else:
            returns actual result of f(*args, **kwargs)
        """
        return self.lbview.apply(f, *args, **kwargs)

    def get_queue_status(self):
        """
        get the status of the queue
        """
        return self.lbview.queue_status()

    def advancement_jobs(self, jobs, update=30, init_status=None):
        """Print the advancement of the jobs in the queue.  
        This functions returns when all jobs are finished

        Parameters
        ----------
        jobs: list of AsyncResult objects
            list of jobs submitted to the task scheduler
        update: float or int
            update the status every 'update' seconds. If negative, only the initial and
            final status are written
        init_status: dict
            dictionary returned from load_balanced_view.queue_status(). If given,
            the number of jobs completed per engine is printed at the end
        """

        import numpy as np
        tot_jobs = len(jobs)
        print("Starting {0} jobs using {1} engines".format(
            tot_jobs, len(self.engines_id)))  #start message
        if (update > 0):  #if: advancement status
            import io_custom as sio
            while not self.wait(jobs=jobs, timeout=update):
                status = self.get_queue_status()
                #get the number of running jobs
                totrunning = np.sum(
                    [status[i]['tasks'] for i in self.engines_id])
                tot_torun = status['unassigned']
                already_run = tot_jobs - (totrunning + tot_torun)
                percentage_run = already_run / float(tot_jobs)
                #print the status message
                message = """{0:.1%} done. {1} finished {2} running, {3} pending.""".format(
                    percentage_run, already_run, totrunning, tot_torun)
                sio.printer(message)
            #end while not lbview.wait( ... )
            sio.printer("Finished")
        else:  #else if: advancement status
            self.wait(jobs=jobs)  #wait until it finishes
            print("Finished")
        #end if: advancement status

        #if details about the jobs per processor are wanted
        print("")
        if (init_status is not None):
            final_status = self.get_queue_status()  #get the final status
            print("{0:<5}: # processes".format("id"))
            for i in self.engines_id:
                print("{0:<5}: {1}".format(
                    i, final_status[i]['completed'] -
                    init_status[i]['completed']))
        # end def advancement_jobs( ... )

    def wait(self, jobs=None, timeout=-1):
        """wrapper around lview.wait(self, jobs=None, timeout=-1)
        waits on one or more `jobs`, for up to `timeout` seconds.

        Parameters
        ----------

        jobs : int, str, or list of ints and/or strs, or one or more AsyncResult objects
            ints are indices to self.history
            strs are msg_ids
            default: wait on all outstanding messages
        timeout : float
            a time in seconds, after which to give up.
            default is -1, which means no timeout

        Returns
        -------

        True : when all msg_ids are done
        False : timeout reached, some msg_ids still outstanding
        """
        return self.lbview.wait(jobs=jobs, timeout=timeout)

    def clear_cache(self):
        """
        clear the cache of the parallel computation to avoid memory overload.
        from: http://mail.scipy.org/pipermail/ipython-user/2012-December/011874.html
        check if something like this will be implemented eventually
        """
        self.c.purge_results('all')  #clears controller
        self.c.results.clear()
        self.c.metadata.clear()
        self.dview.results.clear()
        self.lbview.results.clear()
        assert not self.c.outstanding, "don't clear history when tasks are outstanding"
        self.c.history = []
        self.dview.history = []
        self.lbview.history = []
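
A short usage sketch for the wrapper class above; the task function and the number of submitted jobs are illustrative only:

# Usage sketch for Load_balanced_view (illustrative task and job count)
def square(x):
    return x * x

lbv = Load_balanced_view()                     # or Load_balanced_view(profile='ssh')
if lbv.is_parallel_enabled():
    lbv.exec_on_engine("import numpy as np")   # e.g. make numpy available on all engines
    init_status = lbv.get_queue_status()
    jobs = [lbv.apply(square, i) for i in range(100)]
    lbv.advancement_jobs(jobs, update=10, init_status=init_status)
    results = [job.get() for job in jobs]
    lbv.clear_cache()                          # free controller and client memory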
Example #6
0
            del chunk[:]
        chunk.append(line)
    yield chunk


for f in namefiles:
    print f
    total_chunks = 0
    full_path = namefile_path + '/' + f
    full_output_path = full_path + '.namestd'
    output_conn = open(full_output_path, 'wt')
    output_writer = csv.DictWriter(output_conn, fieldnames=fieldnames)
    with open(full_path, 'rt') as namefile:
        reader = csv.DictReader(namefile, fieldnames=fieldnames)
        for process_chunk in gen_chunks(reader, chunksize=block_size):
            t0 = time.time()
            out = clean_wrapper.map(process_chunk)
            output_writer.writerows(out)
            t1 = time.time()
            total_chunks += 1
            del out[:]
            print total_chunks, total_chunks * block_size, (t1 -
                                                            t0) / block_size
            if total_chunks % 10 == 0 and total_chunks > 0:
                ## Clean out cached objects on the clients
                rc.purge_results(targets=rc.ids)
                dview.results.clear()
                rc.results.clear()
                gc.collect()
    output_conn.close()
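
Only the tail of gen_chunks() survives above; a reconstruction of the whole generator under the usual chunking pattern (the signature is inferred from the call site, so treat it as an assumption):

def gen_chunks(reader, chunksize=1000):
    # Reconstructed sketch: yield lists of up to `chunksize` rows from the csv reader.
    chunk = []
    for line in reader:
        if len(chunk) == chunksize:
            yield chunk
            del chunk[:]
        chunk.append(line)
    yield chunk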
Example #7
0
class Client:
    """DrQueue client actions"""

    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()

        # enable tracking
        self.lbview.track = True

    def job_run(self, job):
        """Create and queue tasks from job object"""

        # check job name
        if job["name"] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job["name"])
            return False

        # run job only on matching os
        os_list = self.query_engines_of_os(job["limits"]["os"])

        # run job only on matching minram
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])

        # run job only on matching mincores
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])

        # check pool members
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])

        # check limits
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)

        # save job in database
        job_id = DrQueueJob.store_db(job)

        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)

        # set owner of job
        self.ip_client.session.username = job["owner"]

        # set number of retries for each task
        self.lbview.retries = job["retries"]

        # depend on another job (its tasks)
        if ("depend" in job["limits"]) and (job["limits"]["depend"] != None):
            depend_job = self.query_job_by_name(job["limits"]["depend"])
            depend_tasks = self.query_task_list(depend_job["_id"])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task["msg_id"])
            self.lbview.after = task_ids

        # check frame numbers
        if not (job["startframe"] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal or greater than 1.")
            return False
        if not (job["endframe"] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal or greater than 1.")
            return False
        if not (job["endframe"] >= job["startframe"]):
            raise ValueError("Invalid value for endframe. Has be to equal or greater than startframe.")
            return False
        if job["endframe"] > job["startframe"]:
            if not (job["endframe"] - job["startframe"] >= job["blocksize"]):
                raise ValueError("Invalid value for blocksize. Has to be equal or lower than endframe-startframe.")
                return False
        if job["endframe"] == job["startframe"]:
            if job["blocksize"] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal 1 if endframe equals startframe.")
                return False

        task_frames = range(job["startframe"], job["endframe"] + 1, job["blocksize"])
        for x in task_frames:
            # prepare script input
            env_dict = {
                "DRQUEUE_FRAME": x,
                "DRQUEUE_BLOCKSIZE": job["blocksize"],
                "DRQUEUE_ENDFRAME": job["endframe"],
                "DRQUEUE_SCENEFILE": job["scenefile"],
                "DRQUEUE_LOGFILE": job["name"] + "-" + str(x) + "_" + str(x + job["blocksize"] - 1) + ".log",
            }

            # optional elements
            if "renderdir" in job:
                env_dict["DRQUEUE_RENDERDIR"] = job["renderdir"]
            if "projectdir" in job:
                env_dict["DRQUEUE_PROJECTDIR"] = job["projectdir"]
            if "configdir" in job:
                env_dict["DRQUEUE_CONFIGDIR"] = job["configdir"]
            if "imagefile" in job:
                env_dict["DRQUEUE_IMAGEFILE"] = job["imagefile"]
            if "precommand" in job:
                env_dict["DRQUEUE_PRECOMMAND"] = job["precommand"]
            if "renderer" in job:
                env_dict["DRQUEUE_RENDERER"] = job["renderer"]
            if "fileformat" in job:
                env_dict["DRQUEUE_FILEFORMAT"] = job["fileformat"]
            if "postcommand" in job:
                env_dict["DRQUEUE_POSTCOMMAND"] = job["postcommand"]
            if "viewcommand" in job:
                env_dict["DRQUEUE_VIEWCOMMAND"] = job["viewcommand"]
            if "worldfile" in job:
                env_dict["DRQUEUE_WORLDFILE"] = job["worldfile"]
            if "terrainfile" in job:
                env_dict["DRQUEUE_TERRAINFILE"] = job["terrainfile"]
            if "composition" in job:
                env_dict["DRQUEUE_COMPOSITION"] = job["composition"]
            if "camera" in job:
                env_dict["DRQUEUE_CAMERA"] = job["camera"]
            if "resx" in job:
                env_dict["DRQUEUE_RESX"] = job["resx"]
            if "resy" in job:
                env_dict["DRQUEUE_RESY"] = job["resy"]
            if "renderpass" in job:
                env_dict["DRQUEUE_RENDERPASS"] = job["renderpass"]
            if "rendertype" in job:
                env_dict["DRQUEUE_RENDERTYPE"] = job["rendertype"]
            if "fileextension" in job:
                env_dict["DRQUEUE_FILEEXTENSION"] = job["fileextension"]
            if "stepframe" in job:
                env_dict["DRQUEUE_STEPFRAME"] = job["stepframe"]
            if "custom_bucket" in job:
                env_dict["DRQUEUE_CUSTOM_BUCKET"] = job["custom_bucket"]
            if "bucketsize" in job:
                env_dict["DRQUEUE_BUCKETSIZE"] = job["bucketsize"]
            if "custom_lod" in job:
                env_dict["DRQUEUE_CUSTOM_LOD"] = job["custom_lod"]
            if "lod" in job:
                env_dict["DRQUEUE_LOD"] = job["lod"]
            if "custom_varyaa" in job:
                env_dict["DRQUEUE_CUSTOM_VARYAA"] = job["custom_varyaa"]
            if "varyaa" in job:
                env_dict["DRQUEUE_VARYAA"] = job["varyaa"]
            if "raytrace" in job:
                env_dict["DRQUEUE_RAYTRACE"] = job["raytrace"]
            if "antialias" in job:
                env_dict["DRQUEUE_ANTIALIAS"] = job["antialias"]
            if "custom_bdepth" in job:
                env_dict["DRQUEUE_CUSTOM_BDEPTH"] = job["custom_bdepth"]
            if "bdepth" in job:
                env_dict["DRQUEUE_BDEPTH"] = job["bdepth"]
            if "custom_zdepth" in job:
                env_dict["DRQUEUE_CUSTOM_ZDEPTH"] = job["custom_zdepth"]
            if "zdepth" in job:
                env_dict["DRQUEUE_ZDEPTH"] = job["zdepth"]
            if "custom_cracks" in job:
                env_dict["DRQUEUE_CUSTOM_CRACKS"] = job["custom_cracks"]
            if "cracks" in job:
                env_dict["DRQUEUE_CRACKS"] = job["cracks"]
            if "custom_quality" in job:
                env_dict["DRQUEUE_CUSTOM_QUALITY"] = job["custom_quality"]
            if "quality" in job:
                env_dict["DRQUEUE_QUALITY"] = job["quality"]
            if "custom_qfiner" in job:
                env_dict["DRQUEUE_CUSTOM_QFINER"] = job["custom_qfiner"]
            if "qfiner" in job:
                env_dict["DRQUEUE_QFINER"] = job["qfiner"]
            if "custom_smultiplier" in job:
                env_dict["DRQUEUE_CUSTOM_SMULTIPLIER"] = job["custom_smultiplier"]
            if "smultiplier" in job:
                env_dict["DRQUEUE_SMULTIPLIER"] = job["smultiplier"]
            if "custom_mpcache" in job:
                env_dict["DRQUEUE_CUSTOM_MPCACHE"] = job["custom_mpcache"]
            if "mpcache" in job:
                env_dict["DRQUEUE_MPCACHE"] = job["mpcache"]
            if "custom_smpolygon" in job:
                env_dict["DRQUEUE_CUSTOM_SMPOLYGON"] = job["custom_smpolygon"]
            if "smpolygon" in job:
                env_dict["DRQUEUE_SMPOLYGON"] = job["smpolygon"]
            if "custom_wh" in job:
                env_dict["DRQUEUE_CUSTOM_WH"] = job["custom_wh"]
            if "custom_type" in job:
                env_dict["DRQUEUE_CUSTOM_TYPE"] = job["custom_type"]
            if "ctype" in job:
                env_dict["DRQUEUE_CTYPE"] = job["ctype"]
            if "skipframes" in job:
                env_dict["DRQUEUE_SKIPFRAMES"] = job["skipframes"]

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job["renderer"])
            ar = self.lbview.apply(DrQueue.run_script_with_env, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine["date"] + cache_time):
            print ("DEBUG: Engine %i was found in DB" % engine_id)
        # store new info
        else:
            print ("DEBUG: Engine %i was not found in DB" % engine_id)
            # run command only on specific computer
            dview = self.ip_client[engine_id]
            dview.block = True
            dview.execute(
                "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer("
                + str(engine_id)
                + ")"
            )
            engine = dview["engine"]
            engine["date"] = int(time.time())
            DrQueueComputer.store_db(engine)
        return engine

    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()

    def query_running_job_list(self):
        """Query a list of all running jobs"""
        jobs = DrQueueJob.query_job_list()
        running_jobs = []
        for job in jobs:
            if self.query_job_tasks_left(job["_id"]) > 0:
                running_jobs.append(job)
        return running_jobs

    def query_jobname(self, task_id):
        """Query jobname from task id"""
        data = self.ip_client.db_query({"msg_id": task_id})
        job_id = data[0]["header"]["session"]
        job = DrQueueJob.query_db(job_id)
        return job.name

    def query_job(self, job_id):
        """Query job from id"""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job from name"""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Query left frames of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task["completed"] == None:
                left += 1
        return left

    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job"""
        return self.ip_client.db_query({"header.session": str(job_id)})

    def query_task(self, task_id):
        """Query a single task"""
        task = self.ip_client.db_query({"msg_id": task_id})[0]
        return task

    def query_engine_list(self):
        """Query a list of all engines"""
        return self.ip_client.ids

    def query_engines_of_pool(self, pool_name):
        """Return available engines of certain pool."""
        pool_computers = self.ip_client.ids
        if pool_name != None:
            computers = DrQueueComputerPool.query_pool_members(pool_name)
            if computers == None:
                raise ValueError('Pool "%s" does not exist!' % pool_name)
                return False
            # iterate over a copy, since entries are removed from pool_computers
            for comp in list(pool_computers):
                if comp not in computers:
                    pool_computers.remove(comp)
            if pool_computers == []:
                raise ValueError("No computer of pool %s is available!" % pool_name)
                return False
            print ("DEBUG: matching pool: " + pool_name)
            print (pool_computers)
        return pool_computers

    def query_engines_of_os(self, os_name):
        """Return only engines running certain OS."""
        # run job only on matching os
        matching_os = self.ip_client.ids
        if os_name != None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if not os_name in engine["os"]:
                    matching_os.remove(engine_id)
            print ("DEBUG: matching os: " + os_name)
            print (matching_os)
        return matching_os

    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching minram
        matching_minram = self.ip_client.ids
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["memory"] < minram:
                    matching_minram.remove(engine_id)
            print ("DEBUG: matching minram: " + str(minram))
            print (matching_minram)
        return matching_minram

    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching mincores
        matching_mincores = self.ip_client.ids
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["ncorescpu"] * engine["ncpus"] < mincores:
                    matching_mincores.remove(engine_id)
            print ("DEBUG: matching mincores: " + str(mincores))
            print (matching_mincores)
        return matching_mincores

    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Match all limits for job."""
        tmp_list = []
        # build list with all list members
        tmp_list.extend(os_list)
        tmp_list.extend(minram_list)
        tmp_list.extend(mincores_list)
        tmp_list.extend(pool_list)
        # make entries unique
        tmp_list = set(tmp_list)
        tmp_list = list(tmp_list)
        matching_limits = []
        for entry in tmp_list:
            # look if entry is in all lists
            if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print ("DEBUG: %i isn't matching limits" % entry)
        print ("DEBUG: matching limits:")
        print (matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print (message)
            raise Exception(message)
        elif len(matching_limits) > 0:
            # only run on matching engines
            self.lbview = self.ip_client.load_balanced_view(matching_limits)
        else:
            self.lbview = self.ip_client.load_balanced_view()

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task["msg_id"])
        return True

    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status("all", True)
            # check if tasks is already running on an engine
            for key, status in stats.items():
                if ("tasks" in status) and (task["msg_id"] in status["tasks"]):
                    print "found"
                    running_engines.append(key)
            self.ip_client.abort(task["msg_id"])
        # restart all engines which still run a task
        running_engines = set(running_engines)
        print(list(running_engines))
        # for engine_id in running_engines:
        #    self.ip_client(engine_id)
        return True

    def job_delete(self, job_id):
        """Delete job and all of it's tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task["msg_id"])
            self.ip_client.purge_results(task["msg_id"])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True

    def task_continue(self, task_id):
        """Continue aborted or failed task"""
        task = self.query_task(task_id)
        # check if action is needed
        if (task["completed"] != None) and (
            (task["result_header"]["status"] == "error") or (task["result_header"]["status"] == "aborted")
        ):
            self.task_requeue(task_id)
        return True

    def task_requeue(self, task_id):
        """Requeue task"""
        self.ip_client.resubmit(task_id)
        print "requeuing %s" % task_id
        return True

    def job_continue(self, job_id):
        """Continue stopped job and all of it's tasks"""
        job = self.query_job(job_id)
        # run job only on matching os
        os_list = self.query_engines_of_os(job["limits"]["os"])
        # run job only on matching minram
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        # run job only on matching mincores
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        # check pool members
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        # check limits
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task["msg_id"])
        return True

    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job(job_id)
        # run job only on matching os
        os_list = self.query_engines_of_os(job["limits"]["os"])
        # run job only on matching minram
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        # run job only on matching mincores
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        # check pool members
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        # check limits
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task["msg_id"])
        return True

    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task["completed"] == None:
                status_pending += 1
            else:
                if "result_header" in task.keys():
                    result_header = task["result_header"]
                    # look for done tasks
                    if ("status" in result_header.keys()) and (result_header["status"] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ("status" in result_header.keys()) and (result_header["status"] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ("status" in result_header.keys()) and (result_header["status"] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ("status" in result_header.keys()) and (result_header["status"] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status

    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shutdown computer
        self.ip_client.shutdown(engine_id)
        return True

    def engine_restart(self, engine_id):
        """Restart a specific engine"""
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
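
A sketch of the job dictionary that job_run() above expects, built from the keys the method reads; all concrete values are placeholders:

# Hypothetical job dictionary; the keys mirror those accessed in job_run() above.
job = {
    "name": "shot010_beauty",
    "owner": "artist01",
    "retries": 2,
    "startframe": 1,
    "endframe": 100,
    "blocksize": 10,
    "scenefile": "/projects/shot010/scene.blend",
    "renderer": "blender",
    "limits": {"os": None, "minram": 0, "mincores": 0, "pool": None, "depend": None},
}

client = Client()
client.job_run(job)     # queues one task per block of frames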
Example #8
0
    "--profile",
    dest="client_profile",
    default="unissh",
    action="store_const",
    help="the profile to use for ipython.parallel",
)
options, args = opt_parser.parse_args()

# START: create remote evaluators and a few (or one) special one for #
# generating new points
logger.info("init")
from IPython.parallel import Client, require

c = Client(profile=options.client_profile)
c.clear()  # clears remote engines
c.purge_results("all")  # all results are memorized in the hub

if len(c.ids) < 2:
    raise Exception("I need at least 2 clients.")
nbGens = min(1, len(c.ids) - 1)
generators = c.load_balanced_view(c.ids[:nbGens])
evaluators = c.load_balanced_view(c.ids[nbGens:])

# MAX number of tasks in total
MAX = 5000
# length of test data, sent over the wire
DIMSIZE = 10
# when adding machines, this is the number of additional tasks
# beyond the number of free machines
new_extra = DIMSIZE
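
A hedged sketch of how the generators and evaluators views above might be used; the generator and evaluator functions are placeholders, not the original script's:

# Sketch only: submit work to the two load-balanced views created above.
@require("numpy")
def evaluate(point):
    # placeholder objective function
    import numpy as np
    return float(np.sum(np.asarray(point) ** 2))

def generate(n, dim):
    # placeholder point generator
    import random
    return [[random.random() for _ in range(dim)] for _ in range(n)]

points = generators.apply_async(generate, new_extra, DIMSIZE).get()
async_results = [evaluators.apply_async(evaluate, p) for p in points]
values = [ar.get() for ar in async_results]
logger.info("evaluated %d points", len(values))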
Example #9
0
    def run(self):
        if self.session.get_client() == None:
            self.mylog.error("Not connected to a cluster.")
            return False

        # workaround for Ipython bug which makes everything slow,
        # create a new client, use it and delete it
        c = Client(profile='ssh')

        jcmd = self.session.opts.get_opt('jcmd')
        if jcmd == 'purge':
            num = 0
            query = c.db_query({'completed': {'$ne': None}}, ['msg_id'])
            for q in query:
                result = c.get_result(q['msg_id']).get()
                # filter on SB, node, task
                if self._check_result(result):
                    num += 1
                    c.purge_results(q['msg_id'])
            mylogger.userinfo(self.mylog,
                              str(num) + " cluster's hub results deleted.")

        elif jcmd == 'list':
            num = 0
            # query the hub DB for all the finished tasks and get IDs
            query = c.db_query({'completed': {
                '$ne': None
            }}, ['msg_id', 'completed', 'started'])
            # search for interesting results and print them
            for q in query:
                result = c.get_result(q['msg_id']).get()

                # filter on SB, node, task
                if self._check_result(result):
                    # skip results without error if wanted
                    if self.session.opts.get_opt(
                            'onlyerr') and result['err'] == '':
                        continue
                    num += 1
                    header = {'Task' : result['task'], 'Node' : result['node'],\
                          'SB' : result['SB'], \
                          'Completed' : q['completed'].replace(microsecond=0), \
                          'Started' : q['started'].replace(microsecond=0), \
                          'Exec time': q['completed'].replace(microsecond=0)-q['started'].replace(microsecond=0)}
                    data = {'Std Output': result['out'], 'Std Error': result['err'], \
                          'Command':result['command']}
                    print_jobs(header, data,
                               self.session.opts.get_opt('lines'))
            mylogger.userinfo(self.mylog, str(num) + " processes listed.")

        elif jcmd == 'running':
            num_r = 0
            num_q = 0
            # TODO: it should be "Started" not "submitted", unfortunately ipython does not set it
            query = c.db_query({'completed': None},
                               ['buffers', 'engine_uuid', 'submitted'])
            for q in query:
                # unpack the buffer of the sent jobs to obtain the arguments
                null, com, args = unpack_apply_message(q['buffers'])
                # filter on SB, node, task
                if self._check_result({
                        'node': args['node'],
                        'SB': args['SB'],
                        'task': args['task']
                }):

                    if q['engine_uuid'] == None:
                        if self.session.opts.get_opt('queue') == False:
                            continue
                        q['msg_id'] = q['msg_id'] + " (queue)"
                        num_q += 1
                    else:
                        num_r += 1

                    header = {'Msg_id' : q['msg_id'], 'Task' : args['task'], 'Node' : args['node'], 'SB' : args['SB'], \
                       'Started' : q['submitted'].replace(microsecond=0), \
                       'Extime': datetime.datetime.now().replace(microsecond=0) - q['submitted'].replace(microsecond=0)}

                    data = {'Command': com[0]}
                    print_jobs(header, data,
                               self.session.opts.get_opt('lines'))

            mylogger.userinfo(
                self.mylog, "Processes running: " + str(num_r) +
                ". In queue: " + str(num_q) + ".")

        elif jcmd == 'kill':
            print "TBI"

        #TODO: add a resubmit option to resubmit all tasks that failed http://ipython.org/ipython-doc/stable/parallel/parallel_task.html

        del c
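
One way the resubmit option from the TODO above might look, sketched against the IPython.parallel client used here (detecting failure by re-raising through get() is an assumption):

# Sketch for the TODO above: resubmit every finished task whose remote call raised.
def resubmit_failed(c):
    query = c.db_query({'completed': {'$ne': None}}, ['msg_id'])
    num = 0
    for q in query:
        ar = c.get_result(q['msg_id'])
        try:
            ar.get()             # re-raises the remote exception if the task failed
        except Exception:
            c.resubmit(q['msg_id'])
            num += 1
    return num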