def to_disk(self,
            job: Job,
            worker: Optional[str] = None,
            verbose: bool = False,
            ) -> None:
    """
    Persist a job's parameter configuration to project_path as a pickle,
    so that Ludwig workers can find and execute it.

    :param job: the job whose param2val is saved; must be ready
    :param worker: name of the worker the pickle is addressed to
    :param verbose: when True, also print the job to the console
    :raises SystemExit: when the job is not ready (param2val incomplete)
    """
    if not job.is_ready():
        raise SystemExit('Cannot save job. Job is not ready. Update job.param2val')

    # save parameter configuration to shared drive;
    # filename encodes both the target worker and the unique param/job names
    file_stem = f'{job.param2val["param_name"]}_{job.param2val["job_name"]}'
    pickle_path = self.project_path / f'{worker}_{file_stem}.pkl'
    with pickle_path.open('wb') as fh:
        pickle.dump(job.param2val, fh)

    # console
    print_ludwig(f'Parameter configuration for {worker} saved to disk')
    if verbose:
        print(job)
        print()
def update_param_name(self,
                      runs_path: Path,
                      num_new: int,  # number of new param_names assigned
                      ) -> None:
    """
    Assign a param_name to this job, reusing an existing one when possible.

    Check if param2val exists in runs; only if it doesn't exist, create a
    new one (otherwise problems with queued runs might occur).

    :param runs_path: folder containing one sub-folder per param_name
    :param num_new: how many new param_names were already assigned this session,
                    used to avoid collisions between queued jobs
    """
    # collect existing param numbers; fall back to [0] when none exist
    existing_nums = [int(p.name.split('_')[-1])
                     for p in runs_path.glob('param*')
                     if config.Constants.not_ludwig not in p.name] or [0]

    # look for a saved configuration identical to this job's
    param_name = None
    for candidate in runs_path.glob('param_*'):
        with (candidate / 'param2val.yaml').open('r') as f:
            loaded_param2val = yaml.load(f, Loader=yaml.FullLoader)
        if self.is_same(self.param2val, loaded_param2val):
            print_ludwig('Configuration matches existing configuration')
            self.is_new = False
            param_name = candidate.name
            break

    if param_name is None:
        # no match found: mint a fresh, zero-padded param_name
        next_num = max(existing_nums) + 1 + num_new
        existing_nums.append(next_num)
        param_name = 'param_{:0>3}'.format(next_num)
        self.is_new = True

    self.param2val['param_name'] = param_name
    print_ludwig(f'Assigned job param_name={param_name}')
def add_ssh_config():
    """
    Copy the shared Ludwig SSH configuration to the local user's SSH directory.

    Copies /media/research_data/.ludwig/config to ~/.ssh/ludwig_config.
    NOTE(review): the original docstring said "append", but shutil.copy
    replaces the destination wholesale — confirm which behavior is intended.
    """
    src = config.WorkerDirs.research_data / '.ludwig' / 'config'
    dst = Path().home() / '.ssh' / 'ludwig_config'
    # do not overwrite existing config
    # NOTE(review): presumably this means the user's ~/.ssh/config is left
    # untouched by writing to a separate ludwig_config file; shutil.copy
    # below DOES overwrite an existing ~/.ssh/ludwig_config — verify intent.
    print_ludwig('Copying {} to {}'.format(src, dst))
    shutil.copy(src, dst)
def calc_num_needed(self,
                    runs_path: Path,
                    reps: int,
                    disable: bool = False,  # for unit-testing or debugging
                    ):
    """
    Return how many more times this job must be executed to reach `reps`
    total repetitions, based on '*num*' entries logged under its param_name.

    :param runs_path: folder containing one sub-folder per param_name
    :param reps: total number of repetitions requested
    :param disable: when True, ignore previously logged repetitions
    :return: non-negative number of repetitions still needed
    """
    param_name = self.param2val['param_name']
    if disable:
        num_times_logged = 0
    else:
        num_times_logged = len(list((runs_path / param_name).glob('*num*')))
    remaining = max(0, reps - num_times_logged)
    print_ludwig('{:<10} logged {:>3} times. Will execute job {:>3} times'.format(
        param_name, num_times_logged, remaining))
    return remaining
def start_jobs(self,
               worker: str,
               ) -> None:
    """
    Upload source code and run.py to a worker.

    run.py is uploaded to worker, which triggers killing of existing jobs,
    and executes run.py. If no param2val for worker is saved to server,
    then run.py will exit.

    :param worker: name of the worker to upload to (key into self.worker2ip)
    :raises AssertionError: when project_name.lower() != src_name
    """
    # -------------------------------------- checks
    assert self.project_name.lower() == self.src_name  # TODO what about when src name must be different?
    # this must be true because in run.py project_name is converted to src_name
    self.check_disk_space()

    # -------------------------------------- prepare paths
    if not self.project_path.exists():
        self.project_path.mkdir()
    if not self.runs_path.exists():
        self.runs_path.mkdir(parents=True)
    remote_path = f'{config.WorkerDirs.watched.name}/{self.src_name}'

    # ------------------------------------- sftp
    # connect via sftp
    research_data_path = self.project_path.parent
    private_key_path = research_data_path / '.ludwig' / 'id_rsa'
    # BUGFIX: use a context manager so the connection is closed even when an
    # upload step raises (the original never closed it)
    with pysftp.Connection(username='******',
                           host=self.worker2ip[worker],
                           private_key=str(private_key_path)) as sftp:
        # upload code files
        print_ludwig(f'Will upload {self.src_name} to {remote_path} on {worker}')
        sftp.makedirs(remote_path)
        sftp.put_r(localpath=self.src_name, remotepath=remote_path)

        # upload run.py - this triggers the watcher on the worker
        run_file_name = f'run_{self.project_name}.py'
        sftp.put(localpath=run.__file__,
                 remotepath=f'{config.WorkerDirs.watched.name}/{run_file_name}')

    print_ludwig(f'Upload to {worker} complete')
def check_disk_space(self, verbose=False):
    """
    Raise RuntimeError when disk usage of the shared drive exceeds the
    configured maximum. Only supported on Linux; elsewhere a warning is printed.

    :param verbose: when True, print the current disk usage percentage
    :raises RuntimeError: when usage exceeds config.Remote.disk_max_percent
    """
    if platform.system() != 'Linux':
        print_ludwig('WARNING: Cannot determine disk space on non-Linux platform.')
        return

    drive = self.project_path.parent
    percent_used = psutil.disk_usage(str(drive))[3]  # index 3 is percent used
    if verbose:
        print_ludwig('Percent Disk Space used at {}: {}'.format(drive, percent_used))
    if percent_used > config.Remote.disk_max_percent:
        raise RuntimeError('Disk space usage > {}.'.format(config.Remote.disk_max_percent))
def calc_num_needed(self,
                    runs_path: Path,
                    reps: int,
                    ):
    """
    Return how many more times this job must be executed to reach `reps`
    total repetitions, based on '*num*' entries logged under its param_name.

    :param runs_path: folder containing one sub-folder per param_name
    :param reps: total number of repetitions requested
    :return: non-negative number of repetitions still needed
    """
    param_name = self.param2val['param_name']
    completed = len(list((runs_path / param_name).glob('*num*')))
    needed = max(0, reps - completed)
    print_ludwig(
        '{:<10} logged {:>3} times. Will execute job {:>3} times'.format(
            param_name, completed, needed))
    return needed
def kill_jobs(self,
              worker: str,
              ) -> None:
    """
    Kill all jobs for a worker.

    First kill all job descriptions for worker (pickle files saved on server).
    Then run.py is uploaded to worker, which triggers killing of existing jobs,
    and executes run.py. Because no job descriptions for worker exist on server,
    run.py will exit.

    :param worker: name of the worker (key into self.worker2ip)
    :raises AssertionError: when project_name.lower() != src_name
    """
    # -------------------------------------- checks
    assert self.project_name.lower() == self.src_name  # TODO what about when src name must be different?
    # this must be true because in run.py project_name is converted to src_name
    self.check_disk_space()

    # -------------------------------------- prepare paths
    if not self.project_path.exists():
        self.project_path.mkdir()
    if not self.runs_path.exists():
        self.runs_path.mkdir(parents=True)

    # ------------------------------------- sftp
    # connect via sftp
    research_data_path = self.project_path.parent
    private_key_path = research_data_path / '.ludwig' / 'id_rsa'
    # BUGFIX: use a context manager so the connection is closed even when the
    # upload raises (the original never closed it)
    with pysftp.Connection(username='******',
                           host=self.worker2ip[worker],
                           private_key=str(private_key_path)) as sftp:
        # upload run.py - this triggers watcher which kills active jobs associated with project
        run_file_name = f'run_{self.project_name}.py'
        sftp.put(localpath=run.__file__,
                 remotepath=f'{config.WorkerDirs.watched.name}/{run_file_name}')

    print_ludwig(f'Killed any active jobs with src_name={self.src_name} on {worker}')
def gen_param_paths(project_name: str,
                    param2requests: Dict[str, list],
                    param2default: Dict[str, Any],
                    runs_path: Optional[Path] = None,
                    ludwig_data_path: Optional[Path] = None,
                    label_params: Optional[List[str]] = None,
                    isolated: bool = False,
                    label_n: bool = True,
                    verbose: bool = True,
                    verbose_plus: bool = False,
                    require_all_found: bool = True,
                    ):
    """
    Yield (param_path, label) pairs pointing to folders with job results.

    Folders located in those paths are each generated with the same parameter
    configuration. Use this for retrieving data after a job has been completed.

    :param project_name: name of the project folder on the shared drive
    :param param2requests: requested parameter values, keyed by parameter name
    :param param2default: default parameter values, keyed by parameter name
    :param runs_path: explicit path to the runs folder; inferred when falsy
    :param ludwig_data_path: mount point of the shared drive; default when None
    :param label_params: extra parameter names to include in each label
    :param isolated: when True, look in the current working directory
    :param label_n: when True, append the repetition count to each label
    :param verbose: print progress for each candidate folder
    :param verbose_plus: additionally print which requested values failed to match
    :param require_all_found: raise SystemExit unless every requested
        configuration is found
    :raises OSError: when the shared drive is not mounted (and not isolated)
    :raises FileNotFoundError: when runs_path does not exist
    :raises SystemExit: when require_all_found and some configuration is missing
    """
    # -------------------------------------------------------- paths
    if ludwig_data_path is None:
        ludwig_data_path = Path(default_mnt_point) / configs.WorkerDirs.ludwig_data.name
    if isolated:
        project_path = Path.cwd()
    else:
        project_path = ludwig_data_path / project_name
        # check that ludwig_data is mounted
        if not os.path.ismount(ludwig_data_path):
            raise OSError(f'{ludwig_data_path} is not mounted')

    # get + check path to runs.
    # BUGFIX: the original re-tested `runs_path is None` AFTER this assignment,
    # which made that second branch unreachable dead code; it has been removed.
    if not runs_path:
        runs_path = project_path / 'runs'
    if not runs_path.exists():
        raise FileNotFoundError(f'{runs_path} does not exist.')

    # ------------------------------------------------------- prepare params
    # label with every parameter that differs from its default, plus any
    # user-requested label params
    label_params = sorted(set([param for param, val in param2requests.items()
                               if val != param2default[param]] + (label_params or [])))

    requested_param2vals = list(gen_all_param2vals(param2requests, param2default))
    print_ludwig('Looking for the following parameter configurations:')
    for requested_param2val in requested_param2vals:
        print(sorted(requested_param2val.items()))
    num_requested = len(requested_param2vals)

    # look for param_paths
    num_found = 0
    for param_path in sorted(runs_path.glob('param_*')):
        if verbose:
            print_ludwig(f'Checking {param_path}...')

        # load param2val
        with (param_path / 'param2val.yaml').open('r') as f:
            param2val = yaml.load(f, Loader=yaml.FullLoader)
        loaded_param2val = param2val.copy()
        # strip the bookkeeping keys Ludwig adds before comparing to requests
        for param_name in configs.Constants.added_param_names:
            try:
                del loaded_param2val[param_name]
            except KeyError:  # Ludwig < v2.0
                pass

        # is match?
        if loaded_param2val in requested_param2vals:
            num_found += 1
            label_ = '\n'.join([f'{param}={param2val[param]}' for param in label_params])
            if label_n:
                n = len(list(param_path.glob('*num*')))
                label_ += f'\nn={n}'
            if verbose:
                print_ludwig('Param2val matches')
                print_ludwig(label_)
            yield param_path, label_
        else:
            if verbose:
                print_ludwig('Params do not match')
                if verbose_plus:
                    # explain which requested values differ from the loaded ones
                    for k in param2requests:
                        for v in param2requests[k]:
                            if loaded_param2val[k] != v:
                                print_ludwig(
                                    f'For key "{k}", {v} does not match {loaded_param2val[k]}')

    if num_requested != num_found and require_all_found:
        raise SystemExit(f'Ludwig: Found {num_found} but requested {num_requested}')
def submit():
    """
    Run jobs locally or on Ludwig workers.

    This script should be called in root directory of the Python project.
    If not specified via CL arguments, it will try to import src.params.
    src.params is where this script will try to find the parameters with
    which to execute your jobs.
    """
    cwd = Path.cwd()
    project_name = cwd.name

    # parse cmd-line args
    parser = argparse.ArgumentParser()
    parser.add_argument('-src', '--src', default=cwd.name.lower(), action='store', dest='src',
                        required=False,
                        help='Specify path to your source code.')
    parser.add_argument('-r', '--reps', default=1, action='store', dest='reps', type=int,
                        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50],
                        required=False,
                        help='Number of times each job will be executed. To kill all jobs, use 0.')
    # TODO test r 0 - it should kill all running jobs
    parser.add_argument('-m', '--minimal', action='store_true', default=False, dest='minimal',
                        required=False,
                        help='Run minimal parameter configuration for debugging.')
    parser.add_argument('-l', '--local', action='store_true', default=False, dest='local',
                        required=False,
                        help='Run on host')
    parser.add_argument('-i', '--isolated', action='store_true', default=False, dest='isolated',
                        required=False,
                        help='Do not connect to server. Only works when all data is available on client.')
    parser.add_argument('-w', '--worker', default=None, action='store', dest='worker',
                        choices=configs.Remote.online_worker_names, required=False,
                        help='Specify a single worker name if submitting to single worker only')
    parser.add_argument('-g', '--group', default=None, action='store', dest='group',
                        choices=configs.Remote.group2workers.keys(), required=False,
                        help='Specify a worker group')
    parser.add_argument('-x', '--clear_runs', action='store_true', default=False, dest='clear_runs',
                        required=False,
                        help='Delete all saved runs associated with current project on shared drive')
    parser.add_argument('-f', '--first_only', action='store_true', default=False, dest='first_only',
                        required=False,
                        help='Run first job and exit.')
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__)
    parser.add_argument('-mnt', '--ludwig_data', default=None, action='store', dest='ludwig_data_path',
                        required=False,
                        help='Specify where the shared drive is mounted on your system (if not /media/ludwig_data).')
    parser.add_argument('-e', '--extra_paths', nargs='*', default=[], action='store', dest='extra_paths',
                        required=False,
                        help='Paths to additional Python packages or data. ')
    parser.add_argument('-n', '--no-upload', action='store_true', dest='no_upload', required=False,
                        help='Whether to upload jobs to Ludwig. Set false for testing')
    parser.add_argument('-s', '--skip-hostkey', action='store_true', dest='skip_hostkey', required=False,
                        default=False,
                        help='Whether to skip hostkey checking. Unsafe, but may prevent SSH connection error.')
    namespace = parser.parse_args()

    # ---------------------------------------------- checks
    assert not (namespace.local and namespace.isolated)
    assert not (namespace.extra_paths and namespace.isolated)

    # ---------------------------------------------- paths
    if namespace.ludwig_data_path:
        ludwig_data_path = Path(namespace.ludwig_data_path)
    else:
        ludwig_data_path = Path(default_mnt_point) / configs.WorkerDirs.ludwig_data.name
    # project path points to wherever user decides a job should be executed:
    # locally: project_path is local project_path
    # remotely: project_path is on shared_drive
    if namespace.isolated:
        project_path = cwd
    else:
        project_path = ludwig_data_path / project_name
    runs_path = project_path / 'runs'
    src_path = cwd / namespace.src

    # ------------------------------------------------ user code
    # import user params + job
    print_ludwig(f'Importing source code from {src_path}')
    sys.path.append(str(cwd))
    user_params = importlib.import_module(src_path.name + '.params')
    if namespace.local or namespace.isolated:
        # no need to import job when not executed locally
        user_job = importlib.import_module(src_path.name + '.job')

    # ------------------------------------------------ checks
    if not namespace.isolated:
        if not os.path.ismount(str(ludwig_data_path)):
            raise OSError(f'{ludwig_data_path} is not mounted')
    if not src_path.exists():
        raise NotADirectoryError(f'Cannot find source code in {src_path}.')

    # check for mis-spelled param names.
    # BUGFIX: the original tested `k not in user_params.param2requests` while
    # iterating over param2requests itself, so the check could never fire;
    # the error message shows param2default was intended.
    for k in user_params.param2requests:
        if k not in user_params.param2default:
            raise KeyError(f'Param "{k}" in param2requests is not in param2default. Check spelling')

    # check that requests are lists and that each list does not contain repeated values
    for k, v in user_params.param2requests.items():
        if not isinstance(v, list):
            raise TypeError('Value of param2requests["{}"] must be a list.'.format(k))
        for vi in v:
            if isinstance(vi, list):
                # tuples can be members of a set (they are hashable) but not lists
                raise TypeError('Inner collections in param2requests must be of type tuple, not list.')
            if isinstance(vi, dict):
                raise TypeError('Inner collections in param2requests must not be of type dict')
        if len(v) != len(set(v)):
            # otherwise each identical value will be assigned a unique param_name
            raise ValueError('Each requested parameter value must be unique')
        if k not in user_params.param2default:
            raise KeyError('{} is not a key in param2default. Check spelling.'.format(k))

    # check that there are no lists (only tuples) in param2default
    for k, v in user_params.param2default.items():
        if isinstance(v, list):
            # tuples can be members of a set (they are hashable) but not lists
            raise TypeError('Type list is not allowed in param2default. Convert any lists to tuples.')

    # ---------------------------------------------
    if not namespace.isolated and not namespace.local:
        # are additional source code files required? (do this before killing active jobs)
        # these can be Python packages, which will be importable, or contain data.
        # extra_paths is only allowed to be non-empty if not --local
        for extra_path in namespace.extra_paths:
            p = Path(extra_path)
            if not p.is_dir():
                raise NotADirectoryError('{} is not a directory'.format(p))
            src = str(extra_path)
            dst = str(project_path / p.name)
            print_ludwig(f'Copying {src} to {dst}')
            copy_tree(src, dst)

        # delete job instructions for worker saved on server (do this before uploader.to_disk() )
        for pkl_path in project_path.glob('*.pkl'):
            pkl_path.unlink()

        if namespace.group is None:
            random.shuffle(configs.Remote.online_worker_names)
            workers_cycle = cycle(configs.Remote.online_worker_names)
        else:
            workers_cycle = cycle(configs.Remote.group2workers[namespace.group])
            print(f'Using workers in group={namespace.group}')

    # ---------------------------------------------------
    if namespace.minimal:
        print_ludwig('Using minimal (debug) parameter configuration')
        param2val = user_params.param2default.copy()
        param2val.update(user_params.param2debug)
        param2val_list = [param2val]
    else:
        param2val_list = gen_all_param2vals(user_params.param2requests,
                                            user_params.param2default)

    # iterate over unique jobs
    num_new = 0
    workers_with_jobs = set()
    uploader = None  # constructed lazily, once, when the first job is uploaded
    for param2val in param2val_list:
        # make job
        job = Job(param2val)

        # get param_name - always try to use an existing param_name even if clear_runs == True,
        # because this provides constancy in that parameter configurations always receive the
        # same param_name, which allows for hard-coding param_names in user code.
        # but hard-coding is not recommended, because constancy is broken when runs are
        # manually cleared.
        job.update_param_name(runs_path, num_new)

        # add project_path
        if namespace.local:
            job.param2val['project_path'] = str(project_path)
        elif namespace.isolated:
            job.param2val['project_path'] = str(cwd)
        else:
            job.param2val['project_path'] = str(configs.WorkerDirs.ludwig_data / project_name)

        # allow exit if requested parameter configuration already exists requested number of times?
        # do counting with --local, because behavior when --local should be identical to
        # behavior of Ludwig worker
        if namespace.minimal or namespace.isolated or namespace.clear_runs:
            num_needed = namespace.reps
        else:
            num_needed = job.calc_num_needed(runs_path, namespace.reps)

        # replicate each job
        for rep_id in range(num_needed):
            # add job_name and save_path
            job.update_job_name_and_save_path(rep_id, namespace.src)

            # if running locally, execute job now + cleanup
            if namespace.local or namespace.isolated:
                series_list = user_job.main(job.param2val)
                save_job_files(job.param2val, series_list, runs_path)
                # temporary runs folder auto-created with name = {project_name}_runs must be removed
                path_tmp = cwd / f'{src_path.name}_runs'
                if path_tmp.exists():
                    shutil.rmtree(path_tmp)
                    print(f'Removed temporary directory {path_tmp}')
            # if running on Ludwig, save worker instructions to shared drive
            else:
                worker = namespace.worker or next(workers_cycle)
                workers_with_jobs.add(worker)
                # PERF: the original re-created the Uploader on every repetition
                if uploader is None:
                    uploader = Uploader(project_path, src_path.name, namespace.skip_hostkey)
                uploader.to_disk(job, worker)

        num_new += int(job.is_new)
        if namespace.first_only:
            raise SystemExit('Exiting loop after first job because --first_only=True.')

    # exit ?
    if namespace.no_upload:
        print_ludwig('Flag --upload set to False. Not uploading run.py.')
        return
    elif namespace.local or namespace.isolated:
        return
    elif not workers_with_jobs:
        # all requested jobs have previously been completed
        return

    # kill running jobs on workers? (do this before removing runs folders).
    # triggering worker without job instructions kills existing job with matching project_name
    if namespace.worker is None:
        workers_for_killing = set(configs.Remote.online_worker_names).difference(workers_with_jobs)
    else:
        # if connecting to single worker, only kill jobs on single worker
        # to prevent needing to access others
        workers_for_killing = [namespace.worker]
    for worker in workers_for_killing:
        uploader.kill_jobs(worker)

    # delete existing runs on shared drive?
    if namespace.clear_runs:
        for param_path in runs_path.glob('*param*'):
            print_ludwig('Removing\n{}'.format(param_path))
            sys.stdout.flush()
            shutil.rmtree(str(param_path))

    # upload = start jobs
    for worker in workers_with_jobs:
        uploader.start_jobs(worker)

    print('Submitted jobs to:')
    for w in workers_with_jobs:
        print(w)
def submit():
    """
    run jobs locally or on Ludwig workers.
    This script should be called in root directory of the Python project.
    If not specified via CL arguments, it will try to import src.params.
    src.params is where this script will try to find the parameters with which to execute your jobs.
    """
    cwd = Path.cwd()
    project_name = cwd.name

    # parse cmd-line args
    parser = argparse.ArgumentParser()
    parser.add_argument('-src', '--src', default=cwd.name.lower(), action='store', dest='src',
                        required=False,
                        help='Specify path to your source code.')
    parser.add_argument('-r', '--reps', default=1, action='store', dest='reps', type=int,
                        choices=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50],
                        required=False,
                        help='Number of times each job will be executed')
    parser.add_argument('-m', '--minimal', action='store_true', default=False, dest='minimal',
                        required=False,
                        help='Run minimal parameter configuration for debugging.')
    parser.add_argument('-l', '--local', action='store_true', default=False, dest='local',
                        required=False,
                        help='Run on host')
    # NOTE(review): "daa" in the help string below is a typo for "data";
    # left unchanged here because help text is runtime-visible program output.
    parser.add_argument('-i', '--isolated', action='store_true', default=False, dest='isolated',
                        required=False,
                        help='Do not connect to server. Use this only when all daa is available')
    parser.add_argument('-w', '--worker', default=None, action='store', dest='worker',
                        choices=config.Remote.online_worker_names, required=False,
                        help='Specify a single worker name if submitting to single worker only')
    parser.add_argument('-g', '--group', default=None, action='store', dest='group',
                        choices=config.Remote.group2workers.keys(), required=False,
                        help='Specify a worker group')
    parser.add_argument('-x', '--clear_runs', action='store_true', default=False, dest='clear_runs',
                        required=False,
                        help='Delete all saved runs associated with current project on shared drive')
    parser.add_argument('-f', '--first_only', action='store_true', default=False, dest='first_only',
                        required=False,
                        help='Run first job and exit.')
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__)
    parser.add_argument('-mnt', '--research_data', default=None, action='store', dest='research_data_path',
                        required=False,
                        help='Specify where the shared drive is mounted on your system (if not /media/research_data).')
    parser.add_argument('-e', '--extra_paths', nargs='*', default=[], action='store', dest='extra_paths',
                        required=False,
                        help='Paths to additional Python packages or data. ')
    parser.add_argument('-n', '--no-upload', action='store_true', dest='no_upload', required=False,
                        help='Whether to upload jobs to Ludwig. Set false for testing')
    namespace = parser.parse_args()

    # ---------------------------------------------- paths
    # research_data_path is where the shared drive is mounted on this machine
    if namespace.research_data_path:
        research_data_path = Path(namespace.research_data_path)
    else:
        research_data_path = Path(default_mnt_point) / config.WorkerDirs.research_data.name
    # project_path is where jobs are executed: locally (cwd) when --isolated,
    # otherwise on the shared drive
    if namespace.isolated:
        project_path = cwd
    else:
        project_path = research_data_path / project_name
    runs_path = project_path / 'runs'
    src_path = cwd / namespace.src
    if namespace.local or namespace.isolated:
        pass  # TODO remove old parents of save_paths

    # ------------------------------------------------ user code
    # import user params + job
    print_ludwig('Trying to import source code from:\n{}'.format(src_path))
    sys.path.append(str(cwd))
    user_params = importlib.import_module(src_path.name + '.params')
    user_job = importlib.import_module(src_path.name + '.job')

    # ------------------------------------------------ checks
    if not namespace.isolated:
        if not os.path.ismount(str(research_data_path)):
            raise OSError(f'{research_data_path} is not mounted')
    if not src_path.exists():
        raise NotADirectoryError(f'Cannot find source code in {src_path}.')

    # check that requests are lists and that each list does not contain repeated values
    for k, v in user_params.param2requests.items():
        if not isinstance(v, list):
            raise TypeError('Values of param2requests must be lists')
        for vi in v:
            if isinstance(vi, list):
                # tuples can be members of a set (they are hashable) but not lists
                raise TypeError('Inner collections in param2requests must be of type tuple, not list')
        if len(v) != len(set(v)):
            # otherwise each identical value will be assigned a unique param_name
            raise ValueError('Each requested parameter value must be unique')

    # check that there are no lists (only tuples) in param2default
    for k, v in user_params.param2default.items():
        if isinstance(v, list):
            # tuples can be members of a set (they are hashable) but not lists
            raise TypeError('Type list is not allowed in param2default. Convert any lists to tuples.')

    # ---------------------------------------------
    # are additional source code files required? (do this before killing active jobs)
    # these can be Python packages, which will be importable, or contain data.
    # extra_paths is only allowed to be non-empty if not --local
    for extra_path in namespace.extra_paths:
        p = Path(extra_path)
        if not p.is_dir():
            raise NotADirectoryError('{} is not a directory'.format(p))
        src = str(extra_path)
        dst = str(project_path / p.name)
        print_ludwig(f'Copying {src} to {dst}')
        copy_tree(src, dst)

    uploader = Uploader(project_path, src_path.name)

    # delete job instructions for worker saved on server (do this before uploader.to_disk() )
    for pkl_path in project_path.glob(f'*.pkl'):
        pkl_path.unlink()

    # choose which workers will receive jobs
    if namespace.group is None:
        # NOTE(review): random.shuffle mutates config.Remote.online_worker_names in place
        random.shuffle(config.Remote.online_worker_names)
        workers_cycle = cycle(config.Remote.online_worker_names)
    else:
        workers_cycle = cycle(config.Remote.group2workers[namespace.group])
        print(f'Using workers in group={namespace.group}')

    # ---------------------------------------------------
    if namespace.minimal:
        print_ludwig('Using minimal (debug) parameter configuration')
        param2val = user_params.param2default.copy()
        param2val.update(user_params.param2debug)
        param2val_list = [param2val]
    else:
        param2val_list = gen_all_param2vals(user_params.param2requests,
                                            user_params.param2default)

    # iterate over unique jobs
    num_new = 0
    workers_with_jobs = set()
    for param2val in param2val_list:
        # make job
        job = Job(param2val)
        job.update_param_name(runs_path, num_new)

        # multiply job: repetition counting is disabled when --minimal,
        # --local, or --clear_runs so that exactly `reps` repetitions run
        for rep_id in range(job.calc_num_needed(
                runs_path,
                namespace.reps,
                disable=True if (namespace.minimal or namespace.local or namespace.clear_runs) else False)):
            job.update_job_name(rep_id)

            # run locally
            if namespace.local or namespace.isolated:
                job.param2val['project_path'] = str(project_path)
                # mark local runs so they are not mistaken for Ludwig-worker runs
                job.param2val['param_name'] += config.Constants.not_ludwig
                job.param2val['job_name'] += config.Constants.not_ludwig
                series_list = user_job.main(job.param2val)
                save_job_files(job.param2val, series_list, runs_path)
            # upload to Ludwig worker
            else:
                job.param2val['project_path'] = str(config.WorkerDirs.research_data / project_name)
                worker = namespace.worker or next(workers_cycle)
                workers_with_jobs.add(worker)
                uploader.to_disk(job, worker)

        num_new += int(job.is_new)
        if namespace.first_only:
            raise SystemExit('Exiting loop after first job because --first_only=True.')

    # upload?
    if namespace.no_upload:
        print_ludwig('Flag --upload set to False. Not uploading run.py.')
        return
    elif namespace.local and not namespace.minimal:
        return

    # kill running jobs on workers? (do this before removing runs folders)
    # trigger worker without job instructions: kills existing job with matching project_name
    for worker in set(config.Remote.online_worker_names).difference(workers_with_jobs):
        uploader.kill_jobs(worker)

    # delete existing runs?
    if namespace.clear_runs:
        for param_path in runs_path.glob('*param*'):
            print_ludwig('Removing\n{}'.format(param_path))
            sys.stdout.flush()
            shutil.rmtree(str(param_path))

    # upload = start jobs
    for worker in workers_with_jobs:
        uploader.start_jobs(worker)

    print('Submitted jobs to:')
    for w in workers_with_jobs:
        print(w)