Example #1
class Client():
    """DrQueue client actions"""
    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()

        # enable tracking
        self.lbview.track = True

        # list of all available query keys
        self.all_task_query_keys = ['msg_id', 'header', 'content', 'buffers', 'submitted', 'client_uuid', 'engine_uuid', 'started', 'completed', 'resubmitted', 'result_header', 'result_content', 'result_buffers', 'queue', 'pyin', 'pyout', 'pyerr', 'stdout', 'stderr']


    def job_run(self, job):
        """Create and queue tasks from job object"""

        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])

        # save job in database
        job_id = DrQueueJob.store_db(job)

        # job_id from db is used as the session name
        self.ip_client.session.session = str(job_id)

        # set owner of job
        self.ip_client.session.username = job['owner']

        # set number of retries for each task
        self.lbview.retries = job['retries']

        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] != None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids

        # check frame numbers
        if not (job['startframe'] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= job['startframe']):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than startframe.")
        if job['endframe'] > job['startframe']:
            if not (job['endframe'] - job['startframe'] >= job['blocksize']):
                raise ValueError("Invalid value for blocksize. Has to be less than or equal to endframe-startframe.")
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal to 1 if endframe equals startframe.")

        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
        ar = None
        for x in task_frames:

            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile']
            }

            # log filename
            if job['created_with'] == "DrQueueOnRails":
                # take job directory name
                env_dict['DRQUEUE_LOGFILE'] = job['scenefile'].split("/")[-2] + "-" + str(x) + "_" + str(x + job['blocksize'] -1) + ".log"
            else:
                # take job name
                env_dict['DRQUEUE_LOGFILE'] = job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] -1) + ".log"

            # optional elements
            if 'renderdir' in job:
                env_dict['DRQUEUE_RENDERDIR'] = job['renderdir']
            if 'projectdir' in job:
                env_dict['DRQUEUE_PROJECTDIR'] = job['projectdir']
            if 'configdir' in job:
                env_dict['DRQUEUE_CONFIGDIR'] = job['configdir']
            if 'imagefile' in job:
                env_dict['DRQUEUE_IMAGEFILE'] = job['imagefile']
            if 'precommand' in job:
                env_dict['DRQUEUE_PRECOMMAND'] = job['precommand']
            if 'renderer' in job:
                env_dict['DRQUEUE_RENDERER'] = job['renderer']
            if 'fileformat' in job:
                env_dict['DRQUEUE_FILEFORMAT'] = job['fileformat']
            if 'postcommand' in job:
                env_dict['DRQUEUE_POSTCOMMAND'] = job['postcommand']
            if 'viewcommand' in job:
                env_dict['DRQUEUE_VIEWCOMMAND'] = job['viewcommand']
            if 'worldfile' in job:
                env_dict['DRQUEUE_WORLDFILE'] = job['worldfile']
            if 'terrainfile' in job:
                env_dict['DRQUEUE_TERRAINFILE'] = job['terrainfile']
            if 'composition' in job:
                env_dict['DRQUEUE_COMPOSITION'] = job['composition']
            if 'camera' in job:
                env_dict['DRQUEUE_CAMERA'] = job['camera']
            if 'resx' in job:
                env_dict['DRQUEUE_RESX'] = job['resx']
            if 'resy' in job:
                env_dict['DRQUEUE_RESY'] = job['resy']
            if 'renderpass' in job:
                env_dict['DRQUEUE_RENDERPASS'] = job['renderpass']
            if 'rendertype' in job:
                env_dict['DRQUEUE_RENDERTYPE'] = job['rendertype']
            if 'fileextension' in job:
                env_dict['DRQUEUE_FILEEXTENSION'] = job['fileextension']
            if 'stepframe' in job:
                env_dict['DRQUEUE_STEPFRAME'] = job['stepframe']
            if 'custom_bucket' in job:
                env_dict['DRQUEUE_CUSTOM_BUCKET'] = job['custom_bucket']
            if 'bucketsize' in job:
                env_dict['DRQUEUE_BUCKETSIZE'] = job['bucketsize']
            if 'custom_lod' in job:
                env_dict['DRQUEUE_CUSTOM_LOD'] = job['custom_lod']
            if 'lod' in job:
                env_dict['DRQUEUE_LOD'] = job['lod']
            if 'custom_varyaa' in job:
                env_dict['DRQUEUE_CUSTOM_VARYAA'] = job['custom_varyaa']
            if 'varyaa' in job:
                env_dict['DRQUEUE_VARYAA'] = job['varyaa']
            if 'raytrace' in job:
                env_dict['DRQUEUE_RAYTRACE'] = job['raytrace']
            if 'antialias' in job:
                env_dict['DRQUEUE_ANTIALIAS'] = job['antialias']
            if 'custom_bdepth' in job:
                env_dict['DRQUEUE_CUSTOM_BDEPTH'] = job['custom_bdepth']
            if 'bdepth' in job:
                env_dict['DRQUEUE_BDEPTH'] = job['bdepth']
            if 'custom_zdepth' in job:
                env_dict['DRQUEUE_CUSTOM_ZDEPTH'] = job['custom_zdepth']
            if 'zdepth' in job:
                env_dict['DRQUEUE_ZDEPTH'] = job['zdepth']
            if 'custom_cracks' in job:
                env_dict['DRQUEUE_CUSTOM_CRACKS'] = job['custom_cracks']
            if 'cracks' in job:
                env_dict['DRQUEUE_CRACKS'] = job['cracks']
            if 'custom_quality' in job:
                env_dict['DRQUEUE_CUSTOM_QUALITY'] = job['custom_quality']
            if 'quality' in job:
                env_dict['DRQUEUE_QUALITY'] = job['quality']
            if 'custom_qfiner' in job:
                env_dict['DRQUEUE_CUSTOM_QFINER'] = job['custom_qfiner']
            if 'qfiner' in job:
                env_dict['DRQUEUE_QFINER'] = job['qfiner']
            if 'custom_smultiplier' in job:
                env_dict['DRQUEUE_CUSTOM_SMULTIPLIER'] = job['custom_smultiplier']
            if 'smultiplier' in job:
                env_dict['DRQUEUE_SMULTIPLIER'] = job['smultiplier']
            if 'custom_mpcache' in job:
                env_dict['DRQUEUE_CUSTOM_MPCACHE'] = job['custom_mpcache']
            if 'mpcache' in job:
                env_dict['DRQUEUE_MPCACHE'] = job['mpcache']
            if 'custom_smpolygon' in job:
                env_dict['DRQUEUE_CUSTOM_SMPOLYGON'] = job['custom_smpolygon']
            if 'smpolygon' in job:
                env_dict['DRQUEUE_SMPOLYGON'] = job['smpolygon']
            if 'custom_wh' in job:
                env_dict['DRQUEUE_CUSTOM_WH'] = job['custom_wh']
            if 'custom_type' in job:
                env_dict['DRQUEUE_CUSTOM_TYPE'] = job['custom_type']
            if 'ctype' in job:
                env_dict['DRQUEUE_CTYPE'] = job['ctype']
            if 'skipframes' in job:
                env_dict['DRQUEUE_SKIPFRAMES'] = job['skipframes']

            # set dependencies
            dep_dict = {}
            dep_dict['job_id'] = job_id
            if ('os' in job['limits']) and (job['limits']['os'] != None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] != None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()

        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True


    def identify_computer(self, engine_id, cache_time, timeout=15):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db_by_engine_id(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine['created_at'] + cache_time):
            print("DEBUG: Engine %i was found in DB and info is up-to-date." % engine_id)
            return engine
        # store new info
        else:
            if engine != None:
                print("DEBUG: Engine %i was found in DB, but info needs to be updated." % engine_id)
            else:
                print("DEBUG: Engine %i was not found in DB." % engine_id)
            # run command only on specific computer
            try:
                dview = self.ip_client[engine_id]
            except IndexError:
                print("DEBUG: Engine with id %i unknown." % engine_id)
                # delete old entry from database
                DrQueueComputer.delete_from_db_by_engine_id(engine_id)
                print("DEBUG: Engine with id %i deleted from database." % engine_id)
                new_engine = None
            else:
                # run command in async mode
                dview.block = False
                command = "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer()"
                ar = dview.execute(command)
                try:
                    # try to get results & wait until timeout
                    ar.get(timeout)
                except Exception:
                    if engine != None:
                        print("DEBUG: Update request for engine %i timed out. Using old information from DB." % engine_id)
                        new_engine = engine
                    else:
                        print("DEBUG: Information request for engine %i timed out." % engine_id)
                        new_engine = None
                else:
                    # get computer dict from engine namespace
                    new_engine = dview['engine']
                    # set to known engine_id
                    new_engine['engine_id'] = engine_id
                    # set creation time
                    new_engine['created_at'] = int(time.time())
                    # store entry in database
                    DrQueueComputer.store_db(new_engine)
            return new_engine


    def computer_set_pools(self, computer, pool_list):
        """add computer to list of pools"""
        # convert to string
        pool_str = ','.join(pool_list)
        # update environment variable on engine
        dview = self.ip_client[computer['engine_id']]
        dview.block = True
        command = "import os\nos.environ[\"DRQUEUE_POOL\"] = \"" + pool_str + "\""
        dview.execute(command)
        # update database entry
        computer['pools'] = pool_list
        DrQueueComputer.store_db(computer)
        print("DEBUG: Engine " + str(computer['engine_id']) + " added to pools " + pool_str + ".")
        return computer


    def computer_get_pools(self, computer):
        """Return all pool names where computer is member."""
        return computer['pools']


    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar


    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()


    def query_job_by_id(self, job_id):
        """Query job by given id"""
        return DrQueueJob.query_db(job_id)


    def query_job_by_name(self, job_name):
        """Query job by given name"""
        return DrQueueJob.query_job_by_name(job_name)


    def query_job_tasks_left(self, job_id):
        """Query left frames of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] == None:
                left += 1
        return left


    def query_job_finish_time(self, job_id):
        """Query oldest finish time of all tasks."""
        job = self.query_job_by_id(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job ) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look if a later finish time exists
            if (task['completed'] != None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time


    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        if ('buffers' in task) and task['buffers'] != []:
            frame_nr = int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])
        else:
            frame_nr = 1
        return frame_nr


    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job. Sort by frame number."""
        task_list =  self.ip_client.db_query({'header.session' : str(job_id)}, keys=self.all_task_query_keys)
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list


    def query_interrupted_task_list(self, job_id):
        """Query a list of interrupted tasks of certain job. Sort by frame number."""
        job = self.query_job_by_id(job_id)
        task_list =  self.ip_client.db_query({'header.session' : str(job_id)}, keys=self.all_task_query_keys)
        interrupted_task_list = []

        for task in task_list:
            frame_nr = self.get_frame_nr(task)
            print("frame_nr: " + str(frame_nr))
            # log filename
            if job['renderer'] == "blender":
                filesearch = job['scenefile'] + str("%04d" % frame_nr) + ".???"
                found = glob.glob(filesearch)
                # file was found
                if len(found) > 0:
                    outputfile = found[0]
                    print("outputfile: "+ str(outputfile))
                    filesize = os.path.getsize(outputfile)
                    print(filesize)
                    # file exists, but is empty
                    if filesize == 0:
                        interrupted_task_list.append(task)
                # file was not found
                else:
                    outputfile = None
                    print("outputfile: "+ str(outputfile))
                    if (task['completed'] == None) and (task['started'] == None):
                        interrupted_task_list.append(task)
            else:
                raise ValueError("Only Blender renderer supported so far.")

        return interrupted_task_list


    def query_task(self, task_id):
        """Query a single task."""
        task = self.ip_client.db_query({'msg_id' : task_id }, keys=self.all_task_query_keys)[0]
        return task


    def query_computer_list(self):
        """Query a list of all computers."""
        return self.ip_client.ids


    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""

        # disable job
        self.job_disable(job_id)

        tasks = self.query_task_list(job_id)
        tasks_to_stop = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_header" in task) and (task["result_header"] != None) and (task["result_header"]["status"] == "ok"):
                print("  finished at " + str(task["completed"]))
            else:
                # get task stats of all computers
                stats = self.ip_client.queue_status('all', True)
                # check if task is already running on an engine
                found_on_engine = False
                for key,status in list(stats.items()):
                    if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                        # skip tasks which are already running on an engine
                        print("  not finished yet but already queued to engine. will leave it there.")
                        found_on_engine = True
                        break

                # if a task isn't already queued/running on an engine, it should be safe to abort it
                if found_on_engine == False:
                    print("  not finished yet. will abort.")
                    tasks_to_stop.append(task['msg_id'])

        if len(tasks_to_stop) > 0:
            try:
                self.ip_client.abort(tasks_to_stop)
            except Exception as e:
                print("ERROR: " + str(e))

        return True


    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""

        # disable job
        self.job_disable(job_id)

        tasks = self.query_task_list(job_id)
        running_engines = []
        tasks_to_stop = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key,status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            tasks_to_stop.append(task['msg_id'])
        # stop all matching tasks at once
        try:
            self.ip_client.abort(tasks_to_stop)
        except Exception as e:
            print("ERROR: " + str(e))

        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)
        return True


    def job_disable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = False
        DrQueueJob.update_db(job)
        return True


    def job_enable(self, job_id):
        """Disable job in database."""
        job = self.query_job_by_id(job_id)
        job['enabled'] = True
        DrQueueJob.update_db(job)
        return True


    def job_delete(self, job_id):
        """Delete job and all of it's tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_computer_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task['msg_id'])
            self.ip_client.purge_results(task['msg_id'])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True


    def job_continue(self, job_id):
        """Continue stopped job and all of it's tasks"""
        job = self.query_job_by_id(job_id)

        # enable job
        self.job_enable(job_id)

        tasks = self.query_task_list(job_id)
        tasks_to_resubmit = []
        for task in tasks:
            print("Task " + task["msg_id"] + ": ")
            if ("result_header" in task) and (task["result_header"] != None) and (task["result_header"]["status"] == "ok"):
                print("  finished at " + str(task["completed"]))
            else:
                print("  not finished yet. will resubmit.")
                tasks_to_resubmit.append(task["msg_id"])

        if len(tasks_to_resubmit) > 0:

            # resubmit all matching msg_ids at once
            try:
                async_results = self.ip_client.resubmit(tasks_to_resubmit)
            except Exception as e:
                print("ERROR: " + str(e))

            # IPython seems to give out new msg_ids instead of re-using the old ones
            for msg_id in async_results.msg_ids:
                print("got new msg_id: " + msg_id)

            # delete old tasks which now have a resubmitted clone
            try:
                self.ip_client.purge_results(tasks_to_resubmit)
            except Exception as e:
                print("ERROR: " + str(e))

        return True


    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job_by_id(job_id)

        # enable job
        job['enabled'] = True
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)

        tasks = self.query_task_list(job_id)
        tasks_to_resubmit = []
        # get all msg_ids of job
        for task in tasks:
            tasks_to_resubmit.append(task["msg_id"])

        # resubmit all msg_ids at once
        try:
            async_results = self.ip_client.resubmit(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))

        # IPython seems to give out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)

        # delete old tasks which now have a resubmitted clone
        try:
            self.ip_client.purge_results(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))

        # kickstart all computers
        running_engines = []
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key,status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)

        return True


    def task_rerun(self, task_id):
        """Run task another time"""
        task = self.query_task(task_id)

        #print(task)

        # enable job
        #job['enabled'] = True
        # set resubmit time
        #job['requeue_time'] = datetime.datetime.now()
        #DrQueueJob.update_db(job)

        # resubmit msg_id of task
        try:
            async_results = self.ip_client.resubmit(task["msg_id"])
        except Exception as e:
            print("ERROR: " + str(e))

        # IPython seems to give out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)

        # delete old tasks which now have a resubmitted clone
        try:
            self.ip_client.purge_results(task["msg_id"])
        except Exception as e:
            print("ERROR: " + str(e))

        # kickstart all computers
        running_engines = []
        stats = self.ip_client.queue_status('all', True)
        # check if task is already running on an engine
        for key,status in list(stats.items()):
            if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)

        return True


    def job_rerun_interrupted_tasks(self, job_id):
        """Run interrupted tasks of job another time"""
        job = self.query_job_by_id(job_id)

        # enable job
        job['enabled'] = True
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)

        tasks = self.query_interrupted_task_list(job_id)

        if len(tasks) == 0:
            return True

        tasks_to_resubmit = []
        # get all msg_ids of job
        for task in tasks:
            tasks_to_resubmit.append(task["msg_id"])

        # resubmit all msg_ids at once
        try:
            async_results = self.ip_client.resubmit(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))

        # IPython seems to give out new msg_ids instead of re-using the old ones
        for msg_id in async_results.msg_ids:
            print("got new msg_id: " + msg_id)

        # delete old tasks which now have a resubmitted clone
        try:
            self.ip_client.purge_results(tasks_to_resubmit)
        except Exception as e:
            print("ERROR: " + str(e))

        # kickstart all computers
        running_engines = []
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key,status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
        # stop all engines which still run a task
        # the slave wrapper will restart the engine
        running_engines = set(running_engines)
        for engine_id in running_engines:
            self.engine_stop(engine_id)

        return True


    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task['completed'] == None:
                status_pending += 1
            else:
                if 'result_content' in list(task.keys()):
                    result_content = task['result_content']
                    # look for done tasks
                    if ('status' in list(result_content.keys())) and (result_content['status'] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ('status' in list(result_content.keys())) and (result_content['status'] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task has unknown status, job status is unknown
        if status_unknown > 0:
            status = "unknown"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status


    def job_estimated_finish_time(self, job_id):
        """Calculate estimated finish time of job."""
        tasks = self.query_task_list(job_id)
        spent_times = []
        # get spent time for each finished task
        for task in tasks:
            if task['completed'] != None:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        timediff = task['completed'] - task['started']
                        spent_times.append(timediff)
        if len(spent_times) > 0:
            # calculate sum of spent time
            sum_times = datetime.timedelta(0)
            for spent in spent_times:
                sum_times += spent
            # calculate mean time for a single task
            sum_times_secs = sum_times.days * 86400 + sum_times.seconds
            meantime_secs = sum_times_secs / len(spent_times)
            meantime = datetime.timedelta(0, meantime_secs)
            # calculate estimated time left
            tasks_left = len(tasks) - len(spent_times)
            time_left = tasks_left * meantime
            # query job object
            job = self.query_job_by_id(job_id)
            # look if all tasks are already done
            if self.query_job_tasks_left(job_id) == 0:
                finish_time = self.query_job_finish_time(job_id)
            else:
                # calculate estimated finish time, use requeue time if available
                if ('requeue_time' in job ) and (job['requeue_time'] != False):
                    finish_time = job['requeue_time'] + time_left
                else:
                    finish_time = job['submit_time'] + time_left
        else:
            meantime = "unknown"
            time_left = "unknown"
            finish_time = "unknown"
        return meantime, time_left, finish_time


    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db_by_engine_id(engine_id)
        # we stop the engine
        try:
            self.ip_client.shutdown(engine_id, False, False, True)
        except Exception:
            return False
        return True
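
# --- Usage sketch (added for illustration; not part of the original example) ---
# A minimal way the Client class above might be driven, assuming DrQueue and its
# Job/Computer modules are importable and an IPython controller with engines is
# running. The job is shown as a plain dict carrying the keys that job_run()
# reads; in the real project a job would normally be built through DrQueue's own
# job class, so treat the field values below as placeholders.

client = Client()
job = {
    'name': 'demo_render',        # must not collide with an existing job name
    'owner': 'alice',
    'retries': 2,
    'startframe': 1,
    'endframe': 10,
    'blocksize': 1,               # one frame per task
    'renderer': 'blender',
    'scenefile': '/tmp/scene.blend',
    'created_with': 'manual',
    'send_email': False,
    'limits': {'depend': None, 'os': None, 'minram': 0,
               'mincores': 0, 'pool_name': None},
}
if client.job_run(job):
    print("Queued job '%s'" % job['name'])
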
Example #2
#Follow instructions under header "Using ipcluster in mpiexec/mpirun mode":
#https://ipython.org/ipython-doc/2/parallel/parallel_process.html#parallel-process
#Now, in a normal bash console run the following:
#ipcluster start --profile=mpi -n 4


from IPython.parallel import Client
c = Client(profile='mpi')
view = c[:]
view.activate() # enable magics

view.run('TestIPythonConsole.py')
view['rank']
# If the output is all zeros, the IPython cluster has not been set up properly (each engine should report a different MPI rank)

#To shut down the cluster, run the following:
c.shutdown(hub=True)


#For distributed computing, read the following:
#http://stackoverflow.com/questions/33614100/setting-up-a-distributed-ipython-ipyparallel-mpi-cluster
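
# --- Sketch of a rank-reporting test script (an assumption, not the original file) ---
# The contents of TestIPythonConsole.py are not shown above. For the check on
# view['rank'] to be meaningful, the script only needs to publish each engine's
# MPI rank, e.g. with mpi4py:
#
#     from mpi4py import MPI
#     rank = MPI.COMM_WORLD.Get_rank()
#
# With 4 engines launched under mpiexec, view['rank'] should then contain
# 0, 1, 2 and 3 (one value per engine); all zeros means the engines were not
# started as MPI processes.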
Example #3
class EngineManager(object):
    def __init__(self):
        self.profile = None
        self.started_controller = None
        self.started_engines = set()
        self._client = None

    def _select_profile(self):
        # See IPython.core.profileapp:list_profile_in()
        profiles = []
        for filename in os.listdir(get_ipython_dir()):
            if filename.startswith('profile_'):
                profiles.append(filename[8:])

        if profiles == ['default'] and not qt_available:
            self.profile = 'default'
        elif not qt_available:
            raise ValueError("'default' IPython profile does not exist "
                             "and PyQt4 is not available")
        else:
            self.profile = choose_profile(profiles)

    def ensure_controller(self, connect_only=False):
        """Make sure a controller is available, else start a local one.
        """
        if self._client:
            return self._client

        if self.profile is None:
            self._select_profile()
        if self.profile is None:
            return None
        print "parallelflow: using IPython profile %r" % self.profile

        try:
            self._client = Client(profile=self.profile)
            print "parallelflow: connected to controller"
            return self._client
        except error.TimeoutError:
            print "parallelflow: timeout when connecting to controller"
            if connect_only:
                start_ctrl = False
            elif qt_available:
                res = QtGui.QMessageBox.question(
                    None, "Start controller",
                    "Unable to connect to the configured IPython "
                    "controller. Do you want to start one?",
                    QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
                start_ctrl = res == QtGui.QMessageBox.Yes
            else:
                start_ctrl = True
        except IOError:
            print "parallelflow: didn't find a controller to connect to"
            if connect_only:
                start_ctrl = False
            elif qt_available:
                res = QtGui.QMessageBox.question(
                    None, "Start controller",
                    "No controller is configured in this IPython profile. "
                    "Do you want to start one?",
                    QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
                start_ctrl = res == QtGui.QMessageBox.Yes
            else:
                start_ctrl = True

        if start_ctrl:
            ctrl_pid = os.path.join(locate_profile(self.profile), 'pid',
                                    'ipcontroller.pid')
            if os.path.exists(ctrl_pid):
                os.remove(ctrl_pid)
            print "parallelflow: starting controller"
            proc, code = self.start_process(
                lambda: os.path.exists(ctrl_pid), sys.executable, '-m',
                'IPython.parallel.apps.ipcontrollerapp',
                '--profile=%s' % self.profile)
            if code is not None:
                if qt_available:
                    QtGui.QMessageBox.critical(
                        None, "Error", "Controller exited with code %d" % code)
                print(
                    "parallelflow: controller process exited with "
                    "code %d" % code)
                return None
            else:
                self.started_controller = proc
                print "parallelflow: controller started, connecting"
                self._client = Client(profile=self.profile)
                return self._client

        return None

    @staticmethod
    def start_process(condition, *args):
        """Executes a file and waits for a condition.
        """
        prev_dir = os.getcwd()
        os.chdir(os.path.join(vistrails_root_directory(), os.path.pardir))
        try:
            p = subprocess.Popen(args)
        finally:
            os.chdir(prev_dir)
        if condition is None:
            return p, None
        else:
            while True:
                time.sleep(0.5)
                if condition():
                    return p, None
                res = p.poll()
                if res is not None:
                    return None, res

    def start_engines(self, nb=None, prompt="Number of engines to start"):
        """Start some engines locally
        """
        c = self.ensure_controller()
        if c is None:
            if qt_available:
                QtGui.QMessageBox.warning(
                    None, "No controller",
                    "Can't start engines: couldn't connect to a "
                    "controller")
            print "parallelflow: no controller, not starting engines"
        else:
            if not nb and qt_available:
                nb, res = QtGui.QInputDialog.getInt(
                    None,
                    "Start engines",
                    prompt,
                    1,  # value
                    1,  # min
                    16)  # max
                if not res:
                    return
            elif nb is None:
                nb = 1
            print "parallelflow: about to start %d engines" % nb
            if qt_available:
                bar = QtGui.QProgressDialog("Starting engines...", None, 0, nb)

                def progress(n):
                    bar.setValue(n)

                bar.show()
            else:

                def progress(n):
                    pass

            progress(0)

            init_engines = set(c.ids)
            # Start the processes
            starting = set()
            for i in xrange(nb):
                proc, res = self.start_process(
                    None, sys.executable, '-m',
                    'IPython.parallel.apps.ipengineapp',
                    '--profile=%s' % self.profile)
                starting.add(proc)
            # Wait for each one to either fail or connect
            failed = []
            connected = 0
            while connected < len(starting):
                connected = len(set(c.ids) - init_engines)
                progress(len(failed) + connected)
                time.sleep(0.5)
                for p in list(starting):
                    res = p.poll()
                    if res is not None:
                        failed.append(res)
                        starting.remove(p)
            if failed:
                nb_failed = len(failed)
                if nb_failed > 3:
                    failed = "%s, ..." % (', '.join('%d' % f for f in failed))
                else:
                    failed = ', '.join('%d' % f for f in failed)
                if qt_available:
                    QtGui.QMessageBox.critical(
                        None, "Error", "%d engine(s) exited with codes: %s" %
                        (nb_failed, failed))
                print "parallelflow: %d engine(s) exited with codes: %s" % (
                    nb_failed, failed)
            self.started_engines.update(starting)

            if qt_available:
                bar.hide()
                bar.deleteLater()
            print "parallelflow: %d engines started" % nb

    def info(self):
        """Show some information on the cluster.
        """
        client = self.ensure_controller(connect_only=True)

        print "----- IPython information -----"
        print "profile: %s" % self.profile
        connected = client is not None
        print "connected to controller: %s" % ("yes" if connected else "no")
        st_ctrl = (self.started_controller is not None
                   and self.started_controller.poll() is None)
        print "controller started from VisTrails: %s" % ("running"
                                                         if st_ctrl else "no")
        st_engines = sum(1 for p in self.started_engines if p.poll() is None)
        print "engines started from VisTrails: %d" % st_engines
        if client is not None:
            nb_engines = len(client.ids)
        else:
            nb_engines = None
        print "total engines in cluster: %s" % (nb_engines if nb_engines
                                                is not None else "(unknown)")
        if connected and client.ids:
            dview = client[:]
            with dview.sync_imports():
                import os
                import platform
                import socket
            engines = dview.apply_async(
                eval,
                '(os.getpid(), platform.system(), socket.getfqdn())').get_dict(
                )
            engines = sorted(engines.items(),
                             key=lambda (ip_id, (pid, system, fqdn)):
                             (fqdn, ip_id))
            print "engines:"
            print "\tid\tsystem\tPID\tnode FQDN"
            print "\t--\t------\t---\t---------"
            for ip_id, (pid, system, fqdn) in engines:
                print "\t%d\t%s\t%d\t%s" % (ip_id, system, pid, fqdn)
        print ""

        if qt_available:
            dialog = QtGui.QDialog()
            layout = QtGui.QVBoxLayout()
            form = QtGui.QFormLayout()
            form.addRow("Profile:", QtGui.QLabel(self.profile))
            form.addRow("Connected:",
                        QtGui.QLabel("yes" if connected else "no"))
            form.addRow("Controller started from VisTrails:",
                        QtGui.QLabel("running" if st_ctrl else "no"))
            form.addRow("Engines started from VisTrails:",
                        QtGui.QLabel(str(st_engines)))
            form.addRow(
                "Total engines in cluster:",
                QtGui.QLabel(
                    str(nb_engines) if nb_engines is not None else "(unknown)")
            )
            layout.addLayout(form)
            if connected and client.ids:
                tree = QtGui.QTreeWidget()
                tree.setHeaderHidden(False)
                tree.setHeaderLabels(["IPython id", "PID", "System type"])
                engine_tree = dict()
                for ip_id, (pid, system, fqdn) in engines:
                    engine_tree.setdefault(fqdn, []).append(
                        (ip_id, pid, system))
                for fqdn, info in engine_tree.iteritems():
                    node = QtGui.QTreeWidgetItem([fqdn])
                    tree.addTopLevelItem(node)
                    tree.setFirstItemColumnSpanned(node, True)
                    for ip_id, pid, system in info:
                        node.addChild(
                            QtGui.QTreeWidgetItem(
                                [str(ip_id), str(pid), system]))
                for i in xrange(tree.columnCount()):
                    tree.resizeColumnToContents(i)
                tree.expandAll()
                layout.addWidget(tree)

            ok = QtGui.QPushButton("Ok")
            QtCore.QObject.connect(ok, QtCore.SIGNAL('clicked()'), dialog,
                                   QtCore.SLOT('accept()'))
            layout.addWidget(ok, 1, QtCore.Qt.AlignHCenter)
            dialog.setLayout(layout)
            dialog.exec_()

    def change_profile(self):
        self.cleanup()

        old_profile = self.profile
        self._select_profile()
        if not self.profile:
            self.profile = old_profile

        if self.profile != old_profile:
            # Here, the processes that were started but that the user didn't
            # want to clean up are abandoned. They will keep running, but later
            # cleanups won't ask about them again.
            self.started_engines = set()
            self.started_controller = None

    def cleanup(self):
        """Shut down the started processes (with user confirmation).
        """
        engines = sum(1 for p in self.started_engines if p.poll() is None)
        ctrl = (self.started_controller is not None
                and self.started_controller.poll() is None)
        print("parallelflow: cleanup: %s, %d engines running" %
              ("controller running" if ctrl else "no controller", engines))

        hub_shutdown = False

        if ctrl:
            if qt_available:
                res = QtGui.QMessageBox.question(
                    None, "Shutdown controller",
                    "The controller is still running. Do you want to stop "
                    "it?", QtGui.QMessageBox.Yes, QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                if self._client is not None:
                    self._client.shutdown(targets='all',
                                          restart=False,
                                          hub=True,
                                          block=False)
                    hub_shutdown = True
                    print "parallelflow: requested hub shutdown"
                else:
                    if self.started_controller.poll() is None:
                        self.started_controller.terminate()
                        self.started_controller.wait()
                    print "parallelflow: controller terminated"
            self.started_controller = None

        if engines > 0 and not hub_shutdown:
            if qt_available:
                if self._client is not None:
                    total = " (among %d total)" % len(self._client.ids)
                else:
                    total = ''
                res = QtGui.QMessageBox.question(
                    None, "Shutdown engines",
                    "%d engines started here%s are still "
                    "running. Do you want to stop them?" % (engines, total),
                    QtGui.QMessageBox.Yes, QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                for engine in self.started_engines:
                    if engine.poll() is None:
                        engine.terminate()
                        engine.wait()
                print("parallelflow: %d engines terminated" %
                      len(self.started_engines))
            self.started_engines = set()

        if self._client is not None:
            print "parallelflow: closing client"
            self._client.close()
            self._client = None

    def shutdown_cluster(self):
        """Use the client to request a shutdown of the whole cluster.
        """
        client = self.ensure_controller(connect_only=True)
        if client is None:
            if qt_available:
                QtGui.QMessageBox.information(
                    None, "Couldn't connect",
                    "Couldn't connect to a controller. Is the cluster "
                    "down already?")
            print(
                "parallelflow: shutdown_cluster requested, but could "
                "not connect to a controller")
            return

        if qt_available:
            res = QtGui.QMessageBox.question(
                None, "Shutdown cluster",
                "This will use the client connection to request the hub "
                "and every engine to shutdown. Continue?",
                QtGui.QMessageBox.Ok, QtGui.QMessageBox.Cancel)
            if res != QtGui.QMessageBox.Ok:
                return

        self._client.shutdown(targets='all',
                              restart=False,
                              hub=True,
                              block=False)
        print "parallelflow: cluster shutdown requested"
        self._client = None
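
# --- Usage sketch (added for illustration; not part of the original example) ---
# Rough outline of how the EngineManager above might be exercised, assuming the
# surrounding VisTrails package provides qt_available, choose_profile, and the
# other helpers it references:

manager = EngineManager()
client = manager.ensure_controller()   # connect, or offer to start a controller
if client is not None:
    manager.start_engines(nb=2)        # launch two local engines
    manager.info()                     # report profile, controller and engine status
manager.cleanup()                      # optionally shut down what was started here
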
Example #4
class Client():
    """DrQueue client actions"""
    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()

        # enable tracking
        self.lbview.track = True


    def job_run(self, job):
        """Create and queue tasks from job object"""

        # check job name
        if job['name'] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job['name'])

        # save job in database
        job_id = DrQueueJob.store_db(job)

        # job_id from db is used as the session name
        self.ip_client.session.session = str(job_id)

        # set owner of job
        self.ip_client.session.username = job['owner']

        # set number of retries for each task
        self.lbview.retries = job['retries']

        # depend on another job (its tasks)
        if ('depend' in job['limits']) and (job['limits']['depend'] != None):
            depend_job = self.query_job_by_name(job['limits']['depend'])
            depend_tasks = self.query_task_list(depend_job['_id'])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task['msg_id'])
            self.lbview.after = task_ids

        # check frame numbers
        if not (job['startframe'] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than 1.")
        if not (job['endframe'] >= job['startframe']):
            raise ValueError("Invalid value for endframe. Has to be equal to or greater than startframe.")
        if job['endframe'] > job['startframe']:
            if not (job['endframe'] - job['startframe'] >= job['blocksize']):
                raise ValueError("Invalid value for blocksize. Has to be less than or equal to endframe-startframe.")
        if job['endframe'] == job['startframe']:
            if job['blocksize'] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal to 1 if endframe equals startframe.")

        task_frames = list(range(job['startframe'], job['endframe'] + 1, job['blocksize']))
        ar = None
        for x in task_frames:
            # prepare script input
            env_dict = {
                'DRQUEUE_FRAME': x,
                'DRQUEUE_BLOCKSIZE': job['blocksize'],
                'DRQUEUE_ENDFRAME': job['endframe'],
                'DRQUEUE_SCENEFILE': job['scenefile'],
                'DRQUEUE_LOGFILE': job['name'] + "-" + str(x) + "_" + str(x + job['blocksize'] - 1) + ".log"
            }

            # optional elements
            if 'renderdir' in job:
                env_dict['DRQUEUE_RENDERDIR'] = job['renderdir']
            if 'projectdir' in job:
                env_dict['DRQUEUE_PROJECTDIR'] = job['projectdir']
            if 'configdir' in job:
                env_dict['DRQUEUE_CONFIGDIR'] = job['configdir']
            if 'imagefile' in job:
                env_dict['DRQUEUE_IMAGEFILE'] = job['imagefile']
            if 'precommand' in job:
                env_dict['DRQUEUE_PRECOMMAND'] = job['precommand']
            if 'renderer' in job:
                env_dict['DRQUEUE_RENDERER'] = job['renderer']
            if 'fileformat' in job:
                env_dict['DRQUEUE_FILEFORMAT'] = job['fileformat']
            if 'postcommand' in job:
                env_dict['DRQUEUE_POSTCOMMAND'] = job['postcommand']
            if 'viewcommand' in job:
                env_dict['DRQUEUE_VIEWCOMMAND'] = job['viewcommand']
            if 'worldfile' in job:
                env_dict['DRQUEUE_WORLDFILE'] = job['worldfile']
            if 'terrainfile' in job:
                env_dict['DRQUEUE_TERRAINFILE'] = job['terrainfile']
            if 'composition' in job:
                env_dict['DRQUEUE_COMPOSITION'] = job['composition']
            if 'camera' in job:
                env_dict['DRQUEUE_CAMERA'] = job['camera']
            if 'resx' in job:
                env_dict['DRQUEUE_RESX'] = job['resx']
            if 'resy' in job:
                env_dict['DRQUEUE_RESY'] = job['resy']
            if 'renderpass' in job:
                env_dict['DRQUEUE_RENDERPASS'] = job['renderpass']
            if 'rendertype' in job:
                env_dict['DRQUEUE_RENDERTYPE'] = job['rendertype']
            if 'fileextension' in job:
                env_dict['DRQUEUE_FILEEXTENSION'] = job['fileextension']
            if 'stepframe' in job:
                env_dict['DRQUEUE_STEPFRAME'] = job['stepframe']
            if 'custom_bucket' in job:
                env_dict['DRQUEUE_CUSTOM_BUCKET'] = job['custom_bucket']
            if 'bucketsize' in job:
                env_dict['DRQUEUE_BUCKETSIZE'] = job['bucketsize']
            if 'custom_lod' in job:
                env_dict['DRQUEUE_CUSTOM_LOD'] = job['custom_lod']
            if 'lod' in job:
                env_dict['DRQUEUE_LOD'] = job['lod']
            if 'custom_varyaa' in job:
                env_dict['DRQUEUE_CUSTOM_VARYAA'] = job['custom_varyaa']
            if 'varyaa' in job:
                env_dict['DRQUEUE_VARYAA'] = job['varyaa']
            if 'raytrace' in job:
                env_dict['DRQUEUE_RAYTRACE'] = job['raytrace']
            if 'antialias' in job:
                env_dict['DRQUEUE_ANTIALIAS'] = job['antialias']
            if 'custom_bdepth' in job:
                env_dict['DRQUEUE_CUSTOM_BDEPTH'] = job['custom_bdepth']
            if 'bdepth' in job:
                env_dict['DRQUEUE_BDEPTH'] = job['bdepth']
            if 'custom_zdepth' in job:
                env_dict['DRQUEUE_CUSTOM_ZDEPTH'] = job['custom_zdepth']
            if 'zdepth' in job:
                env_dict['DRQUEUE_ZDEPTH'] = job['zdepth']
            if 'custom_cracks' in job:
                env_dict['DRQUEUE_CUSTOM_CRACKS'] = job['custom_cracks']
            if 'cracks' in job:
                env_dict['DRQUEUE_CRACKS'] = job['cracks']
            if 'custom_quality' in job:
                env_dict['DRQUEUE_CUSTOM_QUALITY'] = job['custom_quality']
            if 'quality' in job:
                env_dict['DRQUEUE_QUALITY'] = job['quality']
            if 'custom_qfiner' in job:
                env_dict['DRQUEUE_CUSTOM_QFINER'] = job['custom_qfiner']
            if 'qfiner' in job:
                env_dict['DRQUEUE_QFINER'] = job['qfiner']
            if 'custom_smultiplier' in job:
                env_dict['DRQUEUE_CUSTOM_SMULTIPLIER'] = job['custom_smultiplier']
            if 'smultiplier' in job:
                env_dict['DRQUEUE_SMULTIPLIER'] = job['smultiplier']
            if 'custom_mpcache' in job:
                env_dict['DRQUEUE_CUSTOM_MPCACHE'] = job['custom_mpcache']
            if 'mpcache' in job:
                env_dict['DRQUEUE_MPCACHE'] = job['mpcache']
            if 'custom_smpolygon' in job:
                env_dict['DRQUEUE_CUSTOM_SMPOLYGON'] = job['custom_smpolygon']
            if 'smpolygon' in job:
                env_dict['DRQUEUE_SMPOLYGON'] = job['smpolygon']
            if 'custom_wh' in job:
                env_dict['DRQUEUE_CUSTOM_WH'] = job['custom_wh']
            if 'custom_type' in job:
                env_dict['DRQUEUE_CUSTOM_TYPE'] = job['custom_type']
            if 'ctype' in job:
                env_dict['DRQUEUE_CTYPE'] = job['ctype']
            if 'skipframes' in job:
                env_dict['DRQUEUE_SKIPFRAMES'] = job['skipframes']

            # set dependencies
            dep_dict = {}
            if ('os' in job['limits']) and (job['limits']['os'] != None):
                dep_dict['os_name'] = job['limits']['os']
            if ('minram' in job['limits']) and (job['limits']['minram'] > 0):
                dep_dict['minram'] = job['limits']['minram']
            if ('mincores' in job['limits']) and (job['limits']['mincores'] > 0):
                dep_dict['mincores'] = job['limits']['mincores']
            if ('pool_name' in job['limits']) and (job['limits']['pool_name'] != None):
                dep_dict['pool_name'] = job['limits']['pool_name']
            run_script_with_env_and_deps = dependent(DrQueue.run_script_with_env, DrQueue.check_deps, dep_dict)

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job['renderer'])
            ar = self.lbview.apply(run_script_with_env_and_deps, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()

        # append email task behind last task if requested
        if ('send_email' in job) and (job['send_email'] == True):
            self.lbview.after = ar
            # run email task
            mail_ar = self.lbview.apply(DrQueue.send_email, job['name'], job['email_recipients'])
            # wait for pyzmq send to complete communication (avoid race condition)
            mail_ar.wait_for_send()
        return True


    def identify_computer(self, engine_id, cache_time):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine['date'] + cache_time):
            print("DEBUG: Engine %i was found in DB" % engine_id)
        # store new info
        else:
            print("DEBUG: Engine %i was not found in DB" % engine_id)
            # run command only on specific computer
            dview = self.ip_client[engine_id]
            dview.block = True
            dview.execute("import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer(" + str(engine_id) + ")")
            engine = dview['engine']
            engine['date'] = int(time.time())
            DrQueueComputer.store_db(engine)
        return engine


    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar


    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()


    def query_running_job_list(self):
        """Query a list of all running jobs"""
        jobs = DrQueueJob.query_job_list()
        running_jobs = []
        for job in jobs:
            if self.query_job_tasks_left(job['_id']) > 0:
                running_jobs.append(job)
        return running_jobs


    def query_jobname(self, task_id):
        """Query jobname from task id"""
        data = self.ip_client.db_query({"msg_id" : task_id})
        job_id = data[0]['header']['session']
        job = DrQueueJob.query_db(job_id)
        return job.name


    def query_job(self, job_id):
        """Query job from id"""
        return DrQueueJob.query_db(job_id)


    def query_job_by_name(self, job_name):
        """Query job from name"""
        return DrQueueJob.query_job_by_name(job_name)


    def query_job_tasks_left(self, job_id):
        """Query left frames of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task['completed'] == None:
                left += 1
        return left


    def query_job_finish_time(self, job_id):
        """Query oldest finish time of all tasks."""
        job = self.query_job(job_id)
        # use requeue time as starting point if available
        if ('requeue_time' in job ) and (job['requeue_time'] != False):
            finish_time = job['requeue_time']
        else:
            finish_time = job['submit_time']
        tasks = self.query_task_list(job_id)
        for task in tasks:
            # look for a later finish time
            if (task['completed'] != None) and (task['completed'] > finish_time):
                finish_time = task['completed']
        return finish_time


    def get_frame_nr(self, task):
        """Extract value of DRQUEUE_FRAME."""
        return int(pickle.loads(task['buffers'][3])['DRQUEUE_FRAME'])


    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job"""
        task_list =  self.ip_client.db_query({'header.session' : str(job_id)})
        sorted_task_list = sorted(task_list, key=self.get_frame_nr)
        return sorted_task_list


    def query_task(self, task_id):
        """Query a single task"""
        task = self.ip_client.db_query({'msg_id' : task_id })[0]
        return task


    def query_engine_list(self):
        """Query a list of all engines"""
        return self.ip_client.ids


    def query_engines_of_pool(self, pool_name):
        """Return available engines of certain pool."""
        pool_computers = self.ip_client.ids
        if pool_name != None:
            computers = DrQueueComputerPool.query_pool_members(pool_name)
            if computers == None:
                raise ValueError("Pool \"%s\" is not existing!" % pool_name)
                return False
            for comp in pool_computers:
                if not comp in computers:
                    pool_computers.remove(comp)
            if pool_computers == []:
                raise ValueError("No computer of pool %s is available!" % pool_name)
                return False
            print("DEBUG: matching pool: " + pool_name)
            print(pool_computers)
        return pool_computers


    def query_engines_of_os(self, os_name):
        """Return only engines running certain OS."""
        # run job only on matching os
        matching_os = self.ip_client.ids
        if os_name != None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if not os_name in engine['os']:
                    matching_os.remove(engine_id)
            print("DEBUG: matching os: " + os_name)
            print(matching_os)
        return matching_os


    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching minram
        matching_minram = self.ip_client.ids
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine['memory'] < minram:
                    matching_minram.remove(engine_id)
            print("DEBUG: matching minram: " + str(minram))
            print(matching_minram)
        return matching_minram


    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least mincores CPU cores."""
        # run job only on matching mincores
        matching_mincores = self.ip_client.ids
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine['ncorescpu'] * engine['ncpus'] < mincores:
                    matching_mincores.remove(engine_id)
            print("DEBUG: matching mincores: " + str(mincores))
            print(matching_mincores)
        return matching_mincores


    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Match all limits for job."""
        tmp_list = []
        # build list with all list members
        tmp_list.extend(os_list)
        tmp_list.extend(minram_list)
        tmp_list.extend(mincores_list)
        tmp_list.extend(pool_list)
        # make entries unique
        tmp_list = set(tmp_list)
        tmp_list = list(tmp_list)
        matching_limits = []
        for entry in tmp_list:
            # look if entry is in all lists
            if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print("DEBUG: %i isn't matching limits" % entry)
        print("DEBUG: matching limits:")
        print(matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print(message)
            raise Exception(message)
        elif len(matching_limits) > 0:
            # only run on matching engines
            self.lbview = self.ip_client.load_balanced_view(matching_limits)
        else:
            self.lbview = self.ip_client.load_balanced_view()


    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task['msg_id'])
        return True


    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status('all', True)
            # check if task is already running on an engine
            for key,status in list(stats.items()):
                if ('tasks' in status) and (task['msg_id'] in status['tasks']):
                    running_engines.append(key)
            self.ip_client.abort(task['msg_id'])
        # collect engines which still run a task (restarting them is not implemented here)
        running_engines = set(running_engines)
        return True


    def job_delete(self, job_id):
        """Delete job and all of it's tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task['msg_id'])
            self.ip_client.purge_results(task['msg_id'])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True


    def task_continue(self, task_id):
        """Continue aborted or failed task"""
        task = self.query_task(task_id)
        # check if action is needed
        if (task['completed'] != None) and ((task['result_header']['status'] == "error") or (task['result_header']['status'] == "aborted")):
            self.task_requeue(task_id)
        return True


    def task_requeue(self, task_id):
        """Requeue task"""
        self.ip_client.resubmit(task_id)
        print("requeuing %s" % task_id)
        return True


    def job_continue(self, job_id):
        """Continue stopped job and all of it's tasks"""
        job = self.query_job(job_id)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task['msg_id'])
        return True


    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job(job_id)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task['msg_id'])
        # set resubmit time
        job['requeue_time'] = datetime.datetime.now()
        DrQueueJob.update_db(job)
        return True


    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task['completed'] == None:
                status_pending += 1
            else:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    # look for done tasks
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ('status' in list(result_header.keys())) and (result_header['status'] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status


    def job_estimated_finish_time(self, job_id):
        """Calculate estimated finish time of job."""
        tasks = self.query_task_list(job_id)
        spent_times = []
        # get spent time for each finished task
        for task in tasks:
            if task['completed'] != None:
                if 'result_header' in list(task.keys()):
                    result_header = task['result_header']
                    if ('status' in list(result_header.keys())) and (result_header['status'] == "ok"):
                        timediff = task['completed'] - task['started']
                        spent_times.append(timediff)
        if len(spent_times) > 0:
            # calculate sum of spent time
            sum_times = datetime.timedelta(0)
            for spent in spent_times:
                sum_times += spent
            # calculate mean time for a single task
            meantime = sum_times / len(spent_times)
            # calculate estimated time left
            tasks_left = len(tasks) - len(spent_times)
            time_left = tasks_left * meantime
            # query job object
            job = self.query_job(job_id)
            # look if all tasks are already done
            if self.query_job_tasks_left(job_id) == 0:
                finish_time = self.query_job_finish_time(job_id)
            else:
                # calculate estimated finish time, use requeue time if available
                if ('requeue_time' in job ) and (job['requeue_time'] != False):
                    finish_time = job['requeue_time'] + time_left
                else:
                    finish_time = job['submit_time'] + time_left
        else:
            meantime = "unknown"
            time_left = "unknown"
            finish_time = "unknown"
        return meantime, time_left, finish_time


    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shutdown computer
        self.ip_client.shutdown(engine_id)
        return True


    def engine_restart(self, engine_id):
        """Restart a specific engine"""
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
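
A minimal usage sketch for the Client class above (not part of the original example). The job is written as a plain dict carrying only the keys that job_run() reads; every value is hypothetical, and a real setup would normally build the job through DrQueue's own Job class with a running IPython controller behind it.

# minimal sketch with hypothetical values; a real job would come from DrQueue.Job
client = Client()
job = {
    "name": "render_shot_01",             # must not collide with an existing job name
    "owner": "alice",
    "retries": 1,
    "created_with": "commandline",
    "renderer": "blender",
    "scenefile": "/srv/scenes/shot_01.blend",
    "startframe": 1,
    "endframe": 100,
    "blocksize": 10,                      # 10 frames per task -> 10 tasks
    "send_email": False,
    "limits": {"depend": None},
}
if client.job_run(job):
    print("job %s queued" % job["name"])
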
Example #5
0
class EngineManager(object):
    def __init__(self):
        self.profile = None
        self.started_controller = None
        self.started_engines = set()
        self._client = None

    def _select_profile(self):
        # See IPython.core.profileapp:list_profile_in()
        profiles = []
        for filename in os.listdir(get_ipython_dir()):
            if filename.startswith('profile_'):
                profiles.append(filename[8:])

        if profiles == ['default'] and not qt_available:
            self.profile = 'default'
        elif not qt_available:
            raise ValueError("'default' IPython profile does not exist "
                             "and PyQt4 is not available")
        else:
            self.profile = choose_profile(profiles)

    def ensure_controller(self, connect_only=False):
        """Make sure a controller is available, else start a local one.
        """
        if self._client:
            return self._client

        if self.profile is None:
            self._select_profile()
        if self.profile is None:
            return None
        print "parallelflow: using IPython profile %r" % self.profile

        try:
            self._client = Client(profile=self.profile)
            print "parallelflow: connected to controller"
            return self._client
        except error.TimeoutError:
            print "parallelflow: timeout when connecting to controller"
            if connect_only:
                start_ctrl = False
            elif qt_available:
                res = QtGui.QMessageBox.question(
                        None,
                        "Start controller",
                        "Unable to connect to the configured IPython "
                        "controller. Do you want to start one?",
                        QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
                start_ctrl = res == QtGui.QMessageBox.Yes
            else:
                start_ctrl = True
        except IOError:
            print "parallelflow: didn't find a controller to connect to"
            if connect_only:
                start_ctrl = False
            elif qt_available:
                res = QtGui.QMessageBox.question(
                        None,
                        "Start controller",
                        "No controller is configured in this IPython profile. "
                        "Do you want to start one?",
                        QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
                start_ctrl = res == QtGui.QMessageBox.Yes
            else:
                start_ctrl = True

        if start_ctrl:
            ctrl_pid = os.path.join(
                    locate_profile(self.profile),
                    'pid',
                    'ipcontroller.pid')
            if os.path.exists(ctrl_pid):
                os.remove(ctrl_pid)
            print "parallelflow: starting controller"
            proc, code = self.start_process(
                    lambda: os.path.exists(ctrl_pid),
                    sys.executable,
                    '-m',
                    'IPython.parallel.apps.ipcontrollerapp',
                    '--profile=%s' % self.profile)
            if code is not None:
                if qt_available:
                    QtGui.QMessageBox.critical(
                            None,
                            "Error",
                            "Controller exited with code %d" % code)
                print ("parallelflow: controller process exited with "
                       "code %d" % code)
                return None
            else:
                self.started_controller = proc
                print "parallelflow: controller started, connecting"
                self._client = Client(profile=self.profile)
                return self._client

        return None

    @staticmethod
    def start_process(condition, *args):
        """Executes a file and waits for a condition.
        """
        prev_dir = os.getcwd()
        os.chdir(os.path.join(vistrails_root_directory(), os.path.pardir))
        try:
            p = subprocess.Popen(args)
        finally:
            os.chdir(prev_dir)
        if condition is None:
            return p, None
        else:
            while True:
                time.sleep(0.5)
                if condition():
                    return p, None
                res = p.poll()
                if res is not None:
                    return None, res

    def start_engines(self, nb=None, prompt="Number of engines to start"):
        """Start some engines locally
        """
        c = self.ensure_controller()
        if c is None:
            if qt_available:
                QtGui.QMessageBox.warning(
                        None,
                        "No controller",
                        "Can't start engines: couldn't connect to a "
                        "controller")
            print "parallelflow: no controller, not starting engines"
        else:
            if not nb and qt_available:
                nb, res = QtGui.QInputDialog.getInt(
                        None,
                        "Start engines",
                        prompt,
                        1,  # value
                        1,  # min
                        16) # max
                if not res:
                    return
            elif nb is None:
                nb = 1
            print "parallelflow: about to start %d engines" % nb
            if qt_available:
                bar = QtGui.QProgressDialog(
                        "Starting engines...",
                        None,
                        0, nb)
                def progress(n):
                    bar.setValue(n)
                bar.show()
            else:
                def progress(n): pass
            progress(0)

            init_engines = set(c.ids)
            # Start the processes
            starting = set()
            for i in xrange(nb):
                proc, res = self.start_process(
                        None,
                        sys.executable,
                        '-m',
                        'IPython.parallel.apps.ipengineapp',
                        '--profile=%s' % self.profile)
                starting.add(proc)
            # Wait for each one to either fail or connect
            failed = []
            connected = 0
            while connected < len(starting):
                connected = len(set(c.ids) - init_engines)
                progress(len(failed) + connected)
                time.sleep(0.5)
                for p in list(starting):
                    res = p.poll()
                    if res is not None:
                        failed.append(res)
                        starting.remove(p)
            if failed:
                nb_failed = len(failed)
                if nb_failed > 3:
                    failed = "%s, ..." % (', '.join('%d' % f for f in failed))
                else:
                    failed = ', '.join('%d' % f for f in failed)
                if qt_available:
                    QtGui.QMessageBox.critical(
                        None,
                        "Error",
                        "%d engine(s) exited with codes: %s" % (
                        nb_failed, failed))
                print "parallelflow: %d engine(s) exited with codes: %s" % (
                        nb_failed, failed)
            self.started_engines.update(starting)

            if qt_available:
                bar.hide()
                bar.deleteLater()
            print "parallelflow: %d engines started" % (i + 1)

    def info(self):
        """Show some information on the cluster.
        """
        client = self.ensure_controller(connect_only=True)

        print "----- IPython information -----"
        print "profile: %s" % self.profile
        connected = client is not None
        print "connected to controller: %s" % (
                "yes" if connected else "no")
        st_ctrl = (self.started_controller is not None and
                        self.started_controller.poll() is None)
        print "controller started from VisTrails: %s" % (
                "running" if st_ctrl else "no")
        st_engines = sum(1 for p in self.started_engines if p.poll() is None)
        print "engines started from VisTrails: %d" % st_engines
        if client is not None:
            nb_engines = len(client.ids)
        else:
            nb_engines = None
        print "total engines in cluster: %s" % (
                nb_engines if nb_engines is not None else "(unknown)")
        if connected and client.ids:
            dview = client[:]
            with dview.sync_imports():
                import os
                import platform
                import socket
            engines = dview.apply_async(
                    eval,
                    '(os.getpid(), platform.system(), socket.getfqdn())'
            ).get_dict()
            engines = sorted(
                    engines.items(),
                    key=lambda (ip_id, (pid, system, fqdn)): (fqdn, ip_id))
            print "engines:"
            print "\tid\tsystem\tPID\tnode FQDN"
            print "\t--\t------\t---\t---------"
            for ip_id, (pid, system, fqdn) in engines:
                print "\t%d\t%s\t%d\t%s" % (ip_id, system, pid, fqdn)
        print ""

        if qt_available:
            dialog = QtGui.QDialog()
            layout = QtGui.QVBoxLayout()
            form = QtGui.QFormLayout()
            form.addRow(
                    "Profile:",
                    QtGui.QLabel(self.profile))
            form.addRow(
                    "Connected:",
                    QtGui.QLabel("yes" if connected else "no"))
            form.addRow(
                    "Controller started from VisTrails:",
                    QtGui.QLabel("running" if st_ctrl else "no"))
            form.addRow(
                    "Engines started from VisTrails:",
                    QtGui.QLabel(str(st_engines)))
            form.addRow(
                    "Total engines in cluster:",
                    QtGui.QLabel(str(nb_engines)
                                 if nb_engines is not None
                                 else "(unknown)"))
            layout.addLayout(form)
            if connected and client.ids:
                tree = QtGui.QTreeWidget()
                tree.setHeaderHidden(False)
                tree.setHeaderLabels(["IPython id", "PID", "System type"])
                engine_tree = dict()
                for ip_id, (pid, system, fqdn) in engines:
                    engine_tree.setdefault(fqdn, []).append(
                            (ip_id, pid, system))
                for fqdn, info in engine_tree.iteritems():
                    node = QtGui.QTreeWidgetItem([fqdn])
                    tree.addTopLevelItem(node)
                    tree.setFirstItemColumnSpanned(node, True)
                    for ip_id, pid, system in info:
                        node.addChild(QtGui.QTreeWidgetItem([
                                str(ip_id),
                                str(pid),
                                system]))
                for i in xrange(tree.columnCount()):
                    tree.resizeColumnToContents(i)
                tree.expandAll()
                layout.addWidget(tree)

            ok = QtGui.QPushButton("Ok")
            QtCore.QObject.connect(ok, QtCore.SIGNAL('clicked()'),
                                   dialog, QtCore.SLOT('accept()'))
            layout.addWidget(ok, 1, QtCore.Qt.AlignHCenter)
            dialog.setLayout(layout)
            dialog.exec_()

    def change_profile(self):
        self.cleanup()

        old_profile = self.profile
        self._select_profile()
        if not self.profile:
            self.profile = old_profile

        if self.profile != old_profile:
            # Here, the processes that were started but that the user didn't
            # want to clean up are abandoned; they will keep running, but
            # later cleanups won't ask about them again
            self.started_engines = set()
            self.started_controller = None

    def cleanup(self):
        """Shut down the started processes (with user confirmation).
        """
        engines = sum(1 for p in self.started_engines if p.poll() is None)
        ctrl = (self.started_controller is not None and
                self.started_controller.poll() is None)
        print ("parallelflow: cleanup: %s, %d engines running" % (
               "controller running" if ctrl else "no controller",
               engines))

        hub_shutdown = False

        if ctrl:
            if qt_available:
                res = QtGui.QMessageBox.question(
                        None,
                        "Shutdown controller",
                        "The controller is still running. Do you want to stop "
                        "it?",
                        QtGui.QMessageBox.Yes,
                        QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                if self._client is not None:
                    self._client.shutdown(
                            targets='all',
                            restart=False,
                            hub=True,
                            block=False)
                    hub_shutdown = True
                    print "parallelflow: requested hub shutdown"
                else:
                    if self.started_controller.poll() is None:
                        self.started_controller.terminate()
                        self.started_controller.wait()
                    print "parallelflow: controller terminated"
            self.started_controller = None

        if engines > 0 and not hub_shutdown:
            if qt_available:
                if self._client is not None:
                    total = " (among %d total)" % len(self._client.ids)
                else:
                    total = ''
                res = QtGui.QMessageBox.question(
                        None,
                        "Shutdown engines",
                        "%d engines started here%s are still "
                        "running. Do you want to stop them?" % (
                                engines,
                                total),
                        QtGui.QMessageBox.Yes,
                        QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                for engine in self.started_engines:
                    if engine.poll() is None:
                        engine.terminate()
                        engine.wait()
                print ("parallelflow: %d engines terminated" %
                       len(self.started_engines))
            self.started_engines = set()

        if self._client is not None:
            print "parallelflow: closing client"
            self._client.close()
            self._client = None

    def shutdown_cluster(self):
        """Use the client to request a shutdown of the whole cluster.
        """
        client = self.ensure_controller(connect_only=True)
        if client is None:
            if qt_available:
                QtGui.QMessageBox.information(
                        None,
                        "Couldn't connect",
                        "Couldn't connect to a controller. Is the cluster "
                        "down already?")
            print ("parallelflow: shutdown_cluster requested, but could "
                   "not connect to a controller")
            return

        if qt_available:
            res = QtGui.QMessageBox.question(
                    None,
                    "Shutdown cluster",
                    "This will use the client connection to request the hub "
                    "and every engine to shutdown. Continue?",
                    QtGui.QMessageBox.Ok,
                    QtGui.QMessageBox.Cancel)
            if res != QtGui.QMessageBox.Ok:
                return

        self._client.shutdown(
                targets='all',
                restart=False,
                hub=True,
                block=False)
        print "parallelflow: cluster shutdown requested"
        self._client = None
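
A minimal usage sketch for EngineManager (not part of the original example), assuming an existing IPython profile and the imports the class relies on (IPython.parallel, the VisTrails helpers, and optionally PyQt4 for the dialogs).

# minimal sketch; profile selection and dialogs follow the class logic above
manager = EngineManager()
client = manager.ensure_controller()   # connect to, or offer to start, a controller
if client is not None:
    manager.start_engines(nb=4)        # start four local engine processes
    manager.info()                     # print / display cluster information
    manager.cleanup()                  # confirm before stopping what was started here
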
Example #6
0
class Client:
    """DrQueue client actions"""

    def __init__(self):
        # initialize IPython
        try:
            self.ip_client = IPClient()
        except Exception:
            raise Exception("Could not connect to IPython controller.")
        self.lbview = self.ip_client.load_balanced_view()

        # enable tracking
        self.lbview.track = True

    def job_run(self, job):
        """Create and queue tasks from job object"""

        # check job name
        if job["name"] in DrQueueJob.query_jobnames():
            raise ValueError("Job name %s is already used!" % job["name"])
            return False

        # run job only on matching os
        os_list = self.query_engines_of_os(job["limits"]["os"])

        # run job only on matching minram
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])

        # run job only on matching mincores
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])

        # check pool members
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])

        # check limits
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)

        # save job in database
        job_id = DrQueueJob.store_db(job)

        # job_id from db is used as session name
        self.ip_client.session.session = str(job_id)

        # set owner of job
        self.ip_client.session.username = job["owner"]

        # set number of retries for each task
        self.lbview.retries = job["retries"]

        # depend on another job (it's tasks)
        if ("depend" in job["limits"]) and (job["limits"]["depend"] != None):
            depend_job = self.query_job_by_name(job["limits"]["depend"])
            depend_tasks = self.query_task_list(depend_job["_id"])
            task_ids = []
            for task in depend_tasks:
                task_ids.append(task["msg_id"])
            self.lbview.after = task_ids

        # check frame numbers
        if not (job["startframe"] >= 1):
            raise ValueError("Invalid value for startframe. Has to be equal or greater than 1.")
            return False
        if not (job["endframe"] >= 1):
            raise ValueError("Invalid value for endframe. Has to be equal or greater than 1.")
            return False
        if not (job["endframe"] >= job["startframe"]):
            raise ValueError("Invalid value for endframe. Has be to equal or greater than startframe.")
            return False
        if job["endframe"] > job["startframe"]:
            if not (job["endframe"] - job["startframe"] >= job["blocksize"]):
                raise ValueError("Invalid value for blocksize. Has to be equal or lower than endframe-startframe.")
                return False
        if job["endframe"] == job["startframe"]:
            if job["blocksize"] != 1:
                raise ValueError("Invalid value for blocksize. Has to be equal 1 if endframe equals startframe.")
                return False

        task_frames = range(job["startframe"], job["endframe"] + 1, job["blocksize"])
        for x in task_frames:
            # prepare script input
            env_dict = {
                "DRQUEUE_FRAME": x,
                "DRQUEUE_BLOCKSIZE": job["blocksize"],
                "DRQUEUE_ENDFRAME": job["endframe"],
                "DRQUEUE_SCENEFILE": job["scenefile"],
                "DRQUEUE_LOGFILE": job["name"] + "-" + str(x) + "_" + str(x + job["blocksize"] - 1) + ".log",
            }

            # optional elements
            if "renderdir" in job:
                env_dict["DRQUEUE_RENDERDIR"] = job["renderdir"]
            if "projectdir" in job:
                env_dict["DRQUEUE_PROJECTDIR"] = job["projectdir"]
            if "configdir" in job:
                env_dict["DRQUEUE_CONFIGDIR"] = job["configdir"]
            if "imagefile" in job:
                env_dict["DRQUEUE_IMAGEFILE"] = job["imagefile"]
            if "precommand" in job:
                env_dict["DRQUEUE_PRECOMMAND"] = job["precommand"]
            if "renderer" in job:
                env_dict["DRQUEUE_RENDERER"] = job["renderer"]
            if "fileformat" in job:
                env_dict["DRQUEUE_FILEFORMAT"] = job["fileformat"]
            if "postcommand" in job:
                env_dict["DRQUEUE_POSTCOMMAND"] = job["postcommand"]
            if "viewcommand" in job:
                env_dict["DRQUEUE_VIEWCOMMAND"] = job["viewcommand"]
            if "worldfile" in job:
                env_dict["DRQUEUE_WORLDFILE"] = job["worldfile"]
            if "terrainfile" in job:
                env_dict["DRQUEUE_TERRAINFILE"] = job["terrainfile"]
            if "composition" in job:
                env_dict["DRQUEUE_COMPOSITION"] = job["composition"]
            if "camera" in job:
                env_dict["DRQUEUE_CAMERA"] = job["camera"]
            if "resx" in job:
                env_dict["DRQUEUE_RESX"] = job["resx"]
            if "resy" in job:
                env_dict["DRQUEUE_RESY"] = job["resy"]
            if "renderpass" in job:
                env_dict["DRQUEUE_RENDERPASS"] = job["renderpass"]
            if "rendertype" in job:
                env_dict["DRQUEUE_RENDERTYPE"] = job["rendertype"]
            if "fileextension" in job:
                env_dict["DRQUEUE_FILEEXTENSION"] = job["fileextension"]
            if "stepframe" in job:
                env_dict["DRQUEUE_STEPFRAME"] = job["stepframe"]
            if "custom_bucket" in job:
                env_dict["DRQUEUE_CUSTOM_BUCKET"] = job["custom_bucket"]
            if "bucketsize" in job:
                env_dict["DRQUEUE_BUCKETSIZE"] = job["bucketsize"]
            if "custom_lod" in job:
                env_dict["DRQUEUE_CUSTOM_LOD"] = job["custom_lod"]
            if "lod" in job:
                env_dict["DRQUEUE_LOD"] = job["lod"]
            if "custom_varyaa" in job:
                env_dict["DRQUEUE_CUSTOM_VARYAA"] = job["custom_varyaa"]
            if "varyaa" in job:
                env_dict["DRQUEUE_VARYAA"] = job["varyaa"]
            if "raytrace" in job:
                env_dict["DRQUEUE_RAYTRACE"] = job["raytrace"]
            if "antialias" in job:
                env_dict["DRQUEUE_ANTIALIAS"] = job["antialias"]
            if "custom_bdepth" in job:
                env_dict["DRQUEUE_CUSTOM_BDEPTH"] = job["custom_bdepth"]
            if "bdepth" in job:
                env_dict["DRQUEUE_BDEPTH"] = job["bdepth"]
            if "custom_zdepth" in job:
                env_dict["DRQUEUE_CUSTOM_ZDEPTH"] = job["custom_zdepth"]
            if "zdepth" in job:
                env_dict["DRQUEUE_ZDEPTH"] = job["zdepth"]
            if "custom_cracks" in job:
                env_dict["DRQUEUE_CUSTOM_CRACKS"] = job["custom_cracks"]
            if "cracks" in job:
                env_dict["DRQUEUE_CRACKS"] = job["cracks"]
            if "custom_quality" in job:
                env_dict["DRQUEUE_CUSTOM_QUALITY"] = job["custom_quality"]
            if "quality" in job:
                env_dict["DRQUEUE_QUALITY"] = job["quality"]
            if "custom_qfiner" in job:
                env_dict["DRQUEUE_CUSTOM_QFINER"] = job["custom_qfiner"]
            if "qfiner" in job:
                env_dict["DRQUEUE_QFINER"] = job["qfiner"]
            if "custom_smultiplier" in job:
                env_dict["DRQUEUE_CUSTOM_SMULTIPLIER"] = job["custom_smultiplier"]
            if "smultiplier" in job:
                env_dict["DRQUEUE_SMULTIPLIER"] = job["smultiplier"]
            if "custom_mpcache" in job:
                env_dict["DRQUEUE_CUSTOM_MPCACHE"] = job["custom_mpcache"]
            if "mpcache" in job:
                env_dict["DRQUEUE_MPCACHE"] = job["mpcache"]
            if "custom_smpolygon" in job:
                env_dict["DRQUEUE_CUSTOM_SMPOLYGON"] = job["custom_smpolygon"]
            if "smpolygon" in job:
                env_dict["DRQUEUE_SMPOLYGON"] = job["smpolygon"]
            if "custom_wh" in job:
                env_dict["DRQUEUE_CUSTOM_WH"] = job["custom_wh"]
            if "custom_type" in job:
                env_dict["DRQUEUE_CUSTOM_TYPE"] = job["custom_type"]
            if "ctype" in job:
                env_dict["DRQUEUE_CTYPE"] = job["ctype"]
            if "skipframes" in job:
                env_dict["DRQUEUE_SKIPFRAMES"] = job["skipframes"]

            # run task on cluster
            render_script = DrQueue.get_rendertemplate(job["renderer"])
            ar = self.lbview.apply(DrQueue.run_script_with_env, render_script, env_dict)
            # wait for pyzmq send to complete communication (avoid race condition)
            ar.wait_for_send()
        return True

    def identify_computer(self, engine_id, cache_time):
        """Gather information about computer"""
        # look if engine info is already stored
        engine = DrQueueComputer.query_db(engine_id)
        now = int(time.time())
        # check existence and age of info
        if (engine != None) and (now <= engine["date"] + cache_time):
            print ("DEBUG: Engine %i was found in DB" % engine_id)
        # store new info
        else:
            print ("DEBUG: Engine %i was not found in DB" % engine_id)
            # run command only on specific computer
            dview = self.ip_client[engine_id]
            dview.block = True
            dview.execute(
                "import DrQueue\nfrom DrQueue import Computer as DrQueueComputer\nengine = DrQueueComputer("
                + str(engine_id)
                + ")"
            )
            engine = dview["engine"]
            engine["date"] = int(time.time())
            DrQueueComputer.store_db(engine)
        return engine

    def task_wait(self, task_id):
        """Wait for task to finish"""
        ar = self.ip_client.get_result(task_id)
        ar.wait_for_send()
        ar.wait()
        return ar

    def query_job_list(self):
        """Query a list of all jobs"""
        return DrQueueJob.query_job_list()

    def query_running_job_list(self):
        """Query a list of all running jobs"""
        jobs = DrQueueJob.query_job_list()
        running_jobs = []
        for job in jobs:
            if self.query_job_tasks_left(job["_id"]) > 0:
                running_jobs.append(job)
        return running_jobs

    def query_jobname(self, task_id):
        """Query jobname from task id"""
        data = self.ip_client.db_query({"msg_id": task_id})
        job_id = data[0]["header"]["session"]
        job = DrQueueJob.query_db(job_id)
        return job.name

    def query_job(self, job_id):
        """Query job from id"""
        return DrQueueJob.query_db(job_id)

    def query_job_by_name(self, job_name):
        """Query job from name"""
        return DrQueueJob.query_job_by_name(job_name)

    def query_job_tasks_left(self, job_id):
        """Query left frames of job"""
        left = 0
        tasks = self.query_task_list(job_id)
        for task in tasks:
            if task["completed"] == None:
                left += 1
        return left

    def query_task_list(self, job_id):
        """Query a list of tasks objects of certain job"""
        return self.ip_client.db_query({"header.session": str(job_id)})

    def query_task(self, task_id):
        """Query a single task"""
        task = self.ip_client.db_query({"msg_id": task_id})[0]
        return task

    def query_engine_list(self):
        """Query a list of all engines"""
        return self.ip_client.ids

    def query_engines_of_pool(self, pool_name):
        """Return available engines of certain pool."""
        pool_computers = self.ip_client.ids
        if pool_name != None:
            computers = DrQueueComputerPool.query_pool_members(pool_name)
            if computers == None:
                raise ValueError('Pool "%s" does not exist!' % pool_name)
                return False
            for comp in pool_computers:
                if not comp in computers:
                    pool_computers.remove(comp)
            if pool_computers == []:
                raise ValueError("No computer of pool %s is available!" % pool_name)
                return False
            print ("DEBUG: matching pool: " + pool_name)
            print (pool_computers)
        return pool_computers

    def query_engines_of_os(self, os_name):
        """Return only engines running certain OS."""
        # run job only on matching os
        matching_os = self.ip_client.ids
        if os_name != None:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                # os string has to contain os_name
                if not os_name in engine["os"]:
                    matching_os.remove(engine_id)
            print ("DEBUG: matching os: " + os_name)
            print (matching_os)
        return matching_os

    def query_engines_with_minram(self, minram):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching minram
        matching_minram = self.ip_client.ids
        if minram > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["memory"] < minram:
                    matching_minram.remove(engine_id)
            print ("DEBUG: matching minram: " + str(minram))
            print (matching_minram)
        return matching_minram

    def query_engines_with_mincores(self, mincores):
        """Return only engines with at least minram GB RAM."""
        # run job only on matching mincores
        matching_mincores = self.ip_client.ids
        if mincores > 0:
            for engine_id in self.ip_client.ids:
                engine = self.identify_computer(engine_id, 1000)
                if engine["ncorescpu"] * engine["ncpus"] < mincores:
                    matching_mincores.remove(engine_id)
            print ("DEBUG: matching mincores: " + str(mincores))
            print (matching_mincores)
        return matching_mincores

    def match_all_limits(self, os_list, minram_list, mincores_list, pool_list):
        """Match all limits for job."""
        tmp_list = []
        # build list with all list members
        tmp_list.extend(os_list)
        tmp_list.extend(minram_list)
        tmp_list.extend(mincores_list)
        tmp_list.extend(pool_list)
        # make entries unique
        tmp_list = set(tmp_list)
        tmp_list = list(tmp_list)
        matching_limits = []
        for entry in tmp_list:
            # look if entry is in all lists
            if (entry in os_list) and (entry in minram_list) and (entry in mincores_list) and (entry in pool_list):
                matching_limits.append(entry)
            else:
                print ("DEBUG: %i isn't matching limits" % entry)
        print ("DEBUG: matching limits:")
        print (matching_limits)
        if len(matching_limits) == 0:
            message = "No engine meets the requirements."
            print (message)
            raise Exception(message)
        elif len(matching_limits) > 0:
            # only run on matching engines
            self.lbview = self.ip_client.load_balanced_view(matching_limits)
        else:
            self.lbview = self.ip_client.load_balanced_view()

    def job_stop(self, job_id):
        """Stop job and all tasks which are not currently running"""
        tasks = self.query_task_list(job_id)
        # abort all queued tasks
        for task in tasks:
            self.ip_client.abort(task["msg_id"])
        return True

    def job_kill(self, job_id):
        """Stop job and all of it's tasks wether running or not"""
        tasks = self.query_task_list(job_id)
        running_engines = []
        # abort all queued tasks
        for task in tasks:
            stats = self.ip_client.queue_status("all", True)
            # check if task is already running on an engine
            for key, status in stats.items():
                if ("tasks" in status) and (task["msg_id"] in status["tasks"]):
                    print "found"
                    running_engines.append(key)
            self.ip_client.abort(task["msg_id"])
        # restart all engines which still run a task
        running_engines = set(running_engines)
        print(list(running_engines))
        # for engine_id in running_engines:
        #    self.ip_client(engine_id)
        return True

    def job_delete(self, job_id):
        """Delete job and all of it's tasks"""
        tasks = self.query_task_list(job_id)
        engines = self.query_engine_list()
        # abort and delete all queued tasks
        for task in tasks:
            if len(engines) > 0:
                self.ip_client.abort(task["msg_id"])
            self.ip_client.purge_results(task["msg_id"])
        # delete job itself
        DrQueueJob.delete_from_db(job_id)
        return True

    def task_continue(self, task_id):
        """Continue aborted or failed task"""
        task = self.query_task(task_id)
        # check if action is needed
        if (task["completed"] != None) and (
            (task["result_header"]["status"] == "error") or (task["result_header"]["status"] == "aborted")
        ):
            self.task_requeue(task_id)
        return True

    def task_requeue(self, task_id):
        """Requeue task"""
        self.ip_client.resubmit(task_id)
        print "requeuing %s" % task_id
        return True

    def job_continue(self, job_id):
        """Continue stopped job and all of it's tasks"""
        job = self.query_job(job_id)
        # run job only on matching os
        os_list = self.query_engines_of_os(job["limits"]["os"])
        # run job only on matching minram
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        # run job only on matching mincores
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        # check pool members
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        # check limits
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        tasks = self.query_task_list(job_id)
        # continue tasks
        for task in tasks:
            self.task_continue(task["msg_id"])
        return True

    def job_rerun(self, job_id):
        """Run all tasks of job another time"""
        job = self.query_job(job_id)
        # run job only on matching os
        os_list = self.query_engines_of_os(job["limits"]["os"])
        # run job only on matching minram
        minram_list = self.query_engines_with_minram(job["limits"]["minram"])
        # run job only on matching mincores
        mincores_list = self.query_engines_with_mincores(job["limits"]["mincores"])
        # check pool members
        pool_list = self.query_engines_of_pool(job["limits"]["pool"])
        # check limits
        self.match_all_limits(os_list, minram_list, mincores_list, pool_list)
        tasks = self.query_task_list(job_id)
        # rerun tasks
        for task in tasks:
            self.task_requeue(task["msg_id"])
        return True

    def job_status(self, job_id):
        """Return status string of job"""
        tasks = self.query_task_list(job_id)
        status = None
        status_pending = 0
        status_ok = 0
        status_aborted = 0
        status_resubmitted = 0
        status_error = 0
        status_unknown = 0
        for task in tasks:
            # look for pending tasks
            if task["completed"] == None:
                status_pending += 1
            else:
                if "result_header" in task.keys():
                    result_header = task["result_header"]
                    # look for done tasks
                    if ("status" in result_header.keys()) and (result_header["status"] == "ok"):
                        status_ok += 1
                    # look for aborted tasks
                    elif ("status" in result_header.keys()) and (result_header["status"] == "aborted"):
                        status_aborted += 1
                    # look for resubmitted tasks
                    elif ("status" in result_header.keys()) and (result_header["status"] == "resubmitted"):
                        status_resubmitted += 1
                    # look for tasks with error
                    elif ("status" in result_header.keys()) and (result_header["status"] == "error"):
                        status_error += 1
                    else:
                        status_unknown += 1
        # if at least 1 task is ok, job status is ok
        if status_ok > 0:
            status = "ok"
        # if at least 1 task is pending, job status is pending
        if status_pending > 0:
            status = "pending"
        # if at least 1 task is aborted, job status is aborted
        if status_aborted > 0:
            status = "aborted"
        # if at least 1 task has an error, job status is error
        if status_error > 0:
            status = "error"
        return status

    def engine_stop(self, engine_id):
        """Stop a specific engine"""
        # delete computer information in db
        DrQueueComputer.delete_from_db(engine_id)
        # shutdown computer
        self.ip_client.shutdown(engine_id)
        return True

    def engine_restart(self, engine_id):
        """Restart a specific engine"""
        self.ip_client.shutdown(engine_id, True, False, True)
        return True
Example #7
0
File: Bck.py Project: liquid-phynix/hydra
class BCK:
    def __init__(self, pipe, profile, pager_queue, jobs_idle, jobs_executing, jobs_finished):
        from IPython.parallel import Client
        self.jobs_idle, self.jobs_executing, self.jobs_finished = jobs_idle, jobs_executing, jobs_finished
        self.pipe, self.profile = pipe, profile
        self.client = Client(profile = profile)
        self.engines_idle = [Engine(self.client[id], jobs_executing) for id in self.client.ids]
        
        assign_gpuid_to_engines(self.engines_idle)

        self.engines_executing = []
        self.run = True
        self.run_scheduling = False
        self.pager_queue = pager_queue
        global _pager_queue
        _pager_queue = self.pager_queue
    def app(self):
        print('client: %d engines with ids %s are up' % (len(self.client.ids), self.client.ids))
        for engine in self.engines_idle: print('id %d on %s' % (engine.id, engine.hostname))
        self.pipe.send(Ready('all systems are a go'))
        yield bluelet.call(self.scheduler())
    def bluelet(self):
        bluelet.run(self.app())
    def scheduler(self):
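        # bluelet coroutine: poll the control pipe for commands; while scheduling
        # is enabled, pair idle engines with idle jobs by spawning schedule_job()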
        while self.run:
            if not self.pipe.poll():
                yield bluelet.null()
                if self.run_scheduling:
                    if len(self.engines_idle) > 0 and len(self.jobs_idle) > 0:
                        yield bluelet.spawn(self.schedule_job())
            else:
                recv = self.pipe.recv()
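                # dispatch: recv[0] names a BCK method, the remaining items are its arguments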
                BCK.__dict__[recv[0]](self, *recv[1:])
        yield bluelet.end()
    def schedule_job(self):
        unlucky = self.engines_idle.pop()
        self.engines_executing.append(unlucky)
        _,lucky = self.jobs_idle.popitem()
        lucky.engine_id = unlucky.id # assignment must happen before reinsertion
        self.jobs_executing[lucky.id] = lucky
        yield bluelet.call(unlucky.start_job(lucky.id))

        #        print('job finished: %s' % ('\n'.join(self.jobs_executing[lucky.id].output_queue)))
        self.jobs_finished[lucky.id] = self.jobs_executing[lucky.id]
        #        print('job finished: %s' % ('\n'.join(self.jobs_finished[lucky.id].output_queue)))
        del self.jobs_executing[lucky.id]
        unlucky.executing_job = None
        self.engines_executing.remove(unlucky)
        self.engines_idle.append(unlucky)
        yield bluelet.end()
    def stop_monitor(self, ack = True):
        self.run = False
        for engine in self.engines_executing:
            engine.apply(BCK.remote_command, 'stop_process')
            print('%s stopped' % engine)
        if ack: self.pipe.send('stop_monitor')
    def shutdown_all(self):
        self.stop_monitor(ack = False)
        self.client.shutdown(hub = True)
        self.pipe.send('shutdown_all')
    def list_engines(self):
        pr = '''
--- executing ---
%s
--- idle      ---
%s''' % ('\n'.join(map(str,self.engines_executing)), '\n'.join(map(str, self.engines_idle)))
        print(pr)
        self.pipe.send('list_engines')
    def start_scheduling(self):
        self.status_report(ack = False)
        self.run_scheduling = True
        self.pipe.send('start_scheduling')
    def stop_scheduling(self):
        self.status_report(ack = False)
        self.run_scheduling = False
        self.pipe.send('stop_scheduling')
    def status_report(self, ack = True):
        print('%d executing job(s)' % len(self.jobs_executing))
        print('%d finished job(s)' % len(self.jobs_finished))
        print('%d idle job(s)' % len(self.jobs_idle))
        if ack: self.pipe.send('status_report')
    def follow(self, id):
        if id in self.jobs_executing:
            while not self.pager_queue.empty():
                self.pager_queue.get()
            for line in self.jobs_executing[id].output_queue:
                self.pager_queue.put(line)
            self.pager_queue.activate(id)
            self.pipe.send('follow')
        elif id in self.jobs_finished:
            while not self.pager_queue.empty():
                self.pager_queue.get()
            for line in self.jobs_finished[id].output_queue:
                self.pager_queue.put(line)
            self.pipe.send('follow')
        else: raise ValueError('follow: no job with id %s' % id)
    def unfollow(self, id):
        self.pager_queue.deactivate()
        self.pipe.send('unfollow')
    def remove(self, id):
        if id in self.jobs_executing:
            job = self.jobs_executing[id]
            for engine in self.engines_executing:
                if job.engine_id == engine.id:
                    engine.apply(BCK.remote_command, 'stop_process')
                    break
        self.pipe.send('remove')
################################################################################
# REMOTE
    @staticmethod
    def start_job(command, wdir, udir):
        from subprocess import Popen, PIPE
        import os
        global job
        os.chdir(wdir)
        if udir:
            try:
                os.mkdir(udir)
            except OSError:
                pass
            os.chdir(udir)
        print('pwd:')
        print(os.getcwd())
        #job = Popen(command.split(' '), shell = True, stdout = PIPE)
        # it can only be run through the driver script this way
        job = Popen(command, shell = True, stdout = PIPE)
    @staticmethod
    def remote_command(command):
        global job
        if job is None:
            return 0, b''
        #            raise ValueError('\'job\' cannot be None')
        if command == 'stop_process':
            job.kill()
            ret = job.wait()
            job = None
            return ret
        if command == 'relay_stdout':
            return job.poll(), job.stdout.readline()
        else: raise ValueError('Wrong command <%s>' % command)
    @staticmethod
    def remote_system_command(cmd):
        import subprocess
        p = subprocess.Popen(cmd.split(' '), stdout = subprocess.PIPE)
        return p.stdout.read().strip().decode()
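
A minimal sketch (not part of the original example) of how the remote helpers above are typically driven from the hub side through IPython.parallel; the profile name, command, and directories are hypothetical.

# minimal sketch with hypothetical profile, command, and directories
from IPython.parallel import Client

client = Client(profile="hydra")
view = client[0]                                       # DirectView on one engine
view.apply_sync(BCK.start_job, "python compute.py", "/tmp/work", "run1")
status, line = view.apply_sync(BCK.remote_command, "relay_stdout")
print(line)                                            # one line of the job's stdout
view.apply_sync(BCK.remote_command, "stop_process")    # kill and reap the remote job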