Example #1
0
    def launch_job(self, exp_dict, savedir, command, job=None):
        """Submit a job job and save job dict and exp_dict."""
        add_job_utils()
        import haven_jobs_utils as hju

        # Check for duplicates
        if job is not None:
            assert self._assert_no_duplicates(job)

        hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)

        # Define paths
        workdir_job = os.path.join(savedir, "code")

        # Copy the experiment code into the experiment folder
        hu.copy_code(self.workdir + "/", workdir_job)

        # Run  command
        job_command = hju.get_job_command(self.job_config, command, savedir, workdir=workdir_job)
        job_id = hu.subprocess_call(job_command).replace("\n", "")

        # Verbose
        if self.verbose:
            print("Job_id: %s command: %s" % (job_id, command))

        job_dict = {"job_id": job_id, 
                      "started at (Montreal)":hu.time_to_montreal(),
                      "command":command}

        hu.save_json(hju.get_job_fname(savedir), job_dict)

        return job_dict
Example #2
0
    def kill_jobs(self):
        add_job_utils()
        import haven_jobs_utils as hju

        hu.check_duplicates(self.exp_list)

        pr = hu.Parallel()
        submit_dict = {}

        for exp_dict in self.exp_list:
            exp_id = hu.hash_dict(exp_dict)
            savedir = os.path.join(self.savedir_base, exp_id)
            fname = hju.get_job_fname(savedir)

            if os.path.exists(fname):
                job_id = hu.load_json(fname)['job_id']
                pr.add(hju.kill_job, self.api, job_id)
                submit_dict[exp_id] = 'KILLED'
            else:
                submit_dict[exp_id] = 'NoN-Existent'

        pr.run()
        pr.close()
        pprint.pprint(submit_dict)
        print("%d/%d experiments killed." % (len([ s for s in submit_dict.values() if 'KILLED' in s]),
                                                len(submit_dict)))
        return submit_dict
Example #3
0
def get_job_fname(savedir, job_fname=None):
    import haven_jobs_utils as hju
    if job_fname is None:
        fname = hju.get_job_fname(savedir)
    else:
        fname = os.path.join(savedir, job_fname)
    return fname
Example #4
0
    def _submit_job(self, exp_dict, command, reset, submit_dict={}):
        """Submit one job.

        It checks if the experiment exist and manages the special casses, e.g.,
        new experiment, reset, failed, job is already running, completed
        """
        add_job_utils()
        import haven_jobs_utils as hju

        # Define paths
        savedir = os.path.join(self.savedir_base, hu.hash_dict(exp_dict))
        fname = hju.get_job_fname(savedir)
        
        if not os.path.exists(fname):
            # Check if the job already exists
            job_dict = self.launch_job(exp_dict, savedir, command, job=None)
            job_id = job_dict['job_id']
            message = "SUBMITTED: Launching"

        elif reset:
            # Check if the job already exists
            job_id = hu.load_json(fname).get("job_id")
            hju.kill_job(self.api, job_id)
            hc.delete_and_backup_experiment(savedir)

            job_dict = self.launch_job(exp_dict, savedir, command, job=None)
            job_id = job_dict['job_id']
            message = "SUBMITTED: Resetting"

        else:
            job_id = hu.load_json(fname).get("job_id")
            job = hju.get_job(self.api, job_id)

            if job.alive or job.state == 'SUCCEEDED':
                # If the job is alive, do nothing
                message = 'IGNORED: Job %s' % job.state
                
            elif job.state in ["FAILED", "CANCELLED"]:
                message = "SUBMITTED: Retrying %s Job" % job.state
                job_dict = self.launch_job(exp_dict, savedir, command, job=job)
                job_id = job_dict['job_id']
            # This shouldn't happen
            else:
                raise ValueError('wtf')
        
        submit_dict[job_id] = message
Example #5
0
    def get_summary(self, failed_only=False, columns=None, max_lines=200):
        """[summary]
        
        Returns
        -------
        [type]
            [description]
        """
        add_job_utils()
        import haven_jobs_utils as hju
        # get job ids
        job_id_list = []
        for exp_dict in self.exp_list:
            exp_id = hu.hash_dict(exp_dict)
            savedir = os.path.join(self.savedir_base, exp_id)
            fname = hju.get_job_fname(savedir) 

            if os.path.exists(fname):
                job_id_list += [hu.load_json(fname)["job_id"]]

        jobs_dict = hju.get_jobs_dict(self.api, job_id_list)

        # fill summary
        summary_dict = {'table':[], 'status':[], 'logs_failed':[], 'logs':[]}
        for exp_dict in self.exp_list:
            result_dict = copy.deepcopy(exp_dict)
            
            exp_id = hu.hash_dict(exp_dict)
            savedir = os.path.join(self.savedir_base, exp_id)
            result_dict["exp_id"] = exp_id
            
            fname = hju.get_job_fname(savedir)

            # Job results
            result_dict["job_id"] = None
            result_dict["job_state"] = 'NEVER LAUNCHED'

            if os.path.exists(fname):
                job_dict = hu.load_json(fname)
                job_id = job_dict["job_id"]
                if job_id not in jobs_dict:
                    continue

                job = jobs_dict[job_id]
                result_dict['started at (Montreal)'] = job_dict["started at (Montreal)"]
                result_dict["job_id"] = job_id
                result_dict["job_state"] = job.state
                
                summary_dict['table'] += [copy.deepcopy(result_dict)]
                
                result_dict["command"] = job.command[2]
                if job.state == "FAILED":
                    fname = os.path.join(savedir, "err.txt")
                    if os.path.exists(fname):
                        result_dict["logs"] = hu.read_text(fname)[-max_lines:]
                        summary_dict['logs_failed'] += [result_dict]
                    else:
                        if self.verbose:
                            print('%s: err.txt does not exist' % exp_id)
                else:
                    fname = os.path.join(savedir, "logs.txt")
                    if os.path.exists(fname):
                        result_dict["logs"] = hu.read_text(fname)[-max_lines:]
                        summary_dict['logs'] += [result_dict]
                    else:
                        if self.verbose:
                            print('%s: logs.txt does not exist' % exp_id)
            else:
                result_dict['job_state'] = 'NEVER LAUNCHED'
                summary_dict['table'] += [copy.deepcopy(result_dict)]
        # get info
        df = pd.DataFrame(summary_dict['table'])
        df = df.set_index('exp_id')
        if columns:
            df = df[[c for c in columns if (c in df.columns and c not in ['err'])]]

        if "job_state" in df:
            stats = np.vstack(np.unique(df['job_state'].fillna("NaN"),return_counts=True)).T
            status = ([{a:b} for (a,b) in stats])
        else:
            df['job_state'] = None

        summary_dict['status'] = status
        summary_dict['table'] = df
        summary_dict['queuing'] = df[df['job_state']=='QUEUING'] 
        summary_dict['running'] = df[df['job_state']=='RUNNING'] 
        summary_dict['succeeded'] = df[df['job_state']=='SUCCEEDED'] 
        summary_dict['failed'] = df[df['job_state']=='FAILED']

        return summary_dict