def run(self, force: bool = False, verbose: bool = False, debug: bool = False, **kwargs):
    """Run the Experiment pipeline locally on top of a ray cluster.

    Each pipeline block whose component is computable is executed as a
    ray.tune Experiment; checkpoints of finished variants are recorded
    (and masked for errors / top-k reduction) so later blocks can
    resolve links against them.

    Parameters
    ----------
    force : bool
        If True, wipe any pre-existing results at ``self.full_save_path``
        before running.
    verbose : bool
        Forwarded into every trial's config to control trial logging.
    debug : bool
        If True, start ray in local mode (single process) for debugging.

    Raises
    ------
    error.ParsingRunnableError
        If non-empty results already exist at the save path and neither
        ``force`` nor ``resume`` was requested.
    ValueError
        If a resource carries a '!cluster' tag (unsupported locally), or
        a pipeline entry is not a Schema.
    error.UnsuccessfulRunnableError
        If a trial errors while ``stop_on_failure`` is set, or any block
        was unsuccessful by the end of the run.

    """
    logger.info(cl.BL("Launching local experiment"))

    # Refuse to clobber a non-empty save path unless the user explicitly
    # asked to force-overwrite or to resume.
    if (os.path.exists(self.full_save_path) and
            os.listdir(self.full_save_path) and
            not self.resume and not force):
        raise error.ParsingRunnableError(
            "Results from an experiment with the same name were located in the save path "
            + f"{self.full_save_path}. To override these results, please use '--force'. "
            + "To use these results and resume the experiment, pick 'resume: True'. "
            + "If not, just pick another save_path/name.")

    full_save_path = self.full_save_path

    if not self.env:
        wording.print_useful_local_info(full_save_path)

    # If running remotely then all folders were already created
    # in the 'setup' method.
    if not self.env:
        if os.path.exists(full_save_path) and force:
            shutil.rmtree(full_save_path)  # Also deletes the folder itself
            logger.info(cl.RE(f"Removed previous existing from {full_save_path} "
                              + "results as --force was specified"))

        if not os.path.exists(full_save_path):
            os.makedirs(full_save_path)
            logger.debug(f"{full_save_path} created to store output")

    self._dump_experiment_file()

    # '!cluster' resources only make sense for remote experiments.
    if any(map(lambda x: isinstance(x, ClusterResource), self.resources.values())):
        raise ValueError(
            "Local experiments doesn't support resources with '!cluster' tags. " +
            "The '!cluster' tag is used for those resources that need to be handled " +
            "in the cluster when running remote experiments.")

    # Locally, resources go to a temp dir (kept alive on self so it isn't
    # cleaned up while the experiment runs); remotely they live under the
    # save path so they can be rsynced.
    if not self.env:
        self.tmp_resources_dir = tempfile.TemporaryDirectory()
        resources_folder = self.tmp_resources_dir.name
    else:
        resources_folder = f"{self.full_save_path}/_resources"

    resources = self.process_resources(self.resources, resources_folder)

    # rsync downloaded resources to all factory hosts
    if self.env:
        run_utils.rsync_hosts(self.env.orchestrator_ip,
                              self.env.factories_ips,
                              self.env.user,
                              self.full_save_path,
                              self.env.key,
                              exclude=["state.pkl"])

    # Check that links are in order (i.e topologically in pipeline)
    utils.check_links(self.pipeline, resources)

    # Check that search algorithms and schedulers are only attached to
    # computable blocks
    utils.check_search(self.pipeline, self.search, self.schedulers)

    # Initialize ray cluster. NOTE: use a dedicated dict instead of
    # rebinding the **kwargs parameter (the original shadowed it).
    ray_kwargs = {"logging_level": logging.ERROR, "include_webui": False}
    if debug:
        ray_kwargs['local_mode'] = True

    if self.env:
        ray.init(redis_address=f"{self.env.orchestrator_ip}:{const.RAY_REDIS_PORT}",
                 **ray_kwargs)
    else:
        ray.init(**ray_kwargs)
    logger.debug("Ray cluster up")

    # Initialize map from block to list of checkpoints.
    # This is used when resolving links over other computable blocks.
    # TODO: in python 3.7 we can replace these with dict() or {}
    checkpoints: OrderedDict = OrderedDict()
    schemas: OrderedDict = OrderedDict()
    success: OrderedDict = OrderedDict()

    # Default per-trial devices: if a local GPU exists, 4 CPUs + 1 GPU;
    # otherwise leave None so ray uses all CPUs.
    devices = self.devices if self.devices else None
    if devices is None and utils.local_has_gpu():
        devices = {"cpu": 4, "gpu": 1}

    # Resume up to (and including) a named block, or an explicit list of
    # blocks.
    to_resume = None
    if isinstance(self.resume, str):
        index = list(self.pipeline.keys()).index(self.resume)
        to_resume = list(self.pipeline.keys())[:index + 1]
    elif isinstance(self.resume, Sequence):
        to_resume = list(self.resume)

    # Make experiment_tag easier to extract
    def trial_name_creator(trial):
        identifier = ""
        if "env" in trial.config:
            env = trial.config["env"]
            if isinstance(env, type):
                env = env.__name__
            identifier += f"{env}"
        if trial.experiment_tag:
            hyper_params = {}
            if "_" in trial.experiment_tag:
                # Tag looks like "<num>_<k=v,k=v,...>"; the leading
                # counter is not needed for the name.
                _, tunable_params = trial.experiment_tag.split("_", 1)
                identifier += tunable_params
                param_list = [p.split("=") for p in tunable_params.split(",")]
                hyper_params = {p[0]: p[1] for p in param_list}
            else:
                identifier += trial.experiment_tag
            trial.config['hyper_params'] = hyper_params
        # "/" would break paths derived from the trial name
        return identifier.replace("/", "_")

    trial_name_creator = ray.tune.function(trial_name_creator)

    # Compute dependencies DAG
    dependency_dag = {}
    schemas_dag: OrderedDict = OrderedDict()
    for block_id, schema_block in self.pipeline.items():
        schemas_dag[block_id] = schema_block
        relevant_ids = utils.extract_needed_blocks(schemas_dag, block_id, resources)
        dependencies = deepcopy(relevant_ids)
        dependencies.discard(block_id)
        dependency_dag[block_id] = list(dependencies)

    if self.env:
        self.progress_state = ProgressState(self.name, full_save_path, dependency_dag,
                                            self.content, len(self.env.factories_ips))
    else:
        self.progress_state = ProgressState(self.name, full_save_path,
                                            dependency_dag, self.content)

    for block_id, schema_block in tqdm(self.pipeline.items()):
        schema_block.add_extensions_metadata(self.extensions)
        logger.debug(f"Starting {block_id}")

        # Add the block to the configuration so far
        schemas[block_id] = schema_block
        success[block_id] = True

        self.progress_state.checkpoint_start(block_id)
        relevant_ids = utils.extract_needed_blocks(schemas, block_id, resources)
        relevant_schemas = {
            k: v for k, v in deepcopy(schemas).items() if k in relevant_ids
        }

        # Resume this block only if it falls in the requested resume range
        resume = False if to_resume is None else (block_id in to_resume)

        # If computable, convert to tune.Trainable
        # Each Component block is an Experiment in ray.tune
        if not isinstance(schema_block, Schema):
            raise ValueError('schema block not of correct type Schema')
        if issubclass(schema_block.component_subclass, Component):

            # Expand grid-search options into a list of flat
            # (non-nested) configurations
            divided_schemas = list(
                utils.divide_nested_grid_search_options(relevant_schemas))
            divided_dict = [utils.extract_dict(x) for x in divided_schemas]
            # Convert options and links
            divided_dict_tune = [utils.convert_tune(x) for x in divided_dict]

            # Execute block
            tune_experiments = []
            for param_dict, schemas_dict in zip(divided_dict_tune, divided_schemas):
                config = {'name': block_id,
                          'merge_plot': self.merge_plot,
                          'params': param_dict,
                          'schemas': Schema.serialize(schemas_dict),
                          'checkpoints': checkpoints,
                          'to_run': block_id,
                          'global_vars': resources,
                          'verbose': verbose,
                          'custom_modules': list(self.extensions.keys()),
                          'debug': debug}

                # Filter out the tensorboard logger as we handle
                # general and tensorboard-specific logging ourselves
                tune_loggers = list(
                    filter(
                        lambda l: l != tf2_compat_logger and  # noqa: E741
                        not issubclass(l, TFLogger), DEFAULT_LOGGERS))
                tune_experiment = ray.tune.Experiment(
                    name=block_id,
                    run=TuneAdapter,
                    trial_name_creator=trial_name_creator,
                    config=deepcopy(config),
                    local_dir=full_save_path,
                    checkpoint_freq=1,
                    checkpoint_at_end=True,
                    max_failures=self.max_failures,
                    resources_per_trial=devices,
                    loggers=tune_loggers)
                logger.debug(f"Created tune.Experiment for {param_dict}")
                tune_experiments.append(tune_experiment)

            trials = ray.tune.run_experiments(
                tune_experiments,
                search_alg=self.search.get(block_id, None),
                scheduler=self.schedulers.get(block_id, None),
                queue_trials=True,
                verbose=False,
                resume=resume,
                raise_on_failed_trial=False)
            logger.debug(f"Finish running all tune.Experiments for {block_id}")

            any_error = False
            for t in trials:
                if t.status == t.ERROR:
                    logger.error(
                        cl.RE(f"Variant {t} of '{block_id}' ended with ERROR status."))
                    success[block_id] = False
                    any_error = True
            if any_error and self.stop_on_failure:
                self.teardown()
                self.progress_state.checkpoint_end(block_id, success[block_id])
                raise error.UnsuccessfulRunnableError(
                    f"Stopping experiment at block '{block_id}' "
                    "because there was an error and stop_on_failure == True.")

            # Save checkpoint location
            # It should point from:
            # block_id -> hash(variant) -> checkpoint
            hashes = []
            for t in trials:
                schema_with_params: Dict = OrderedDict()
                for b in schemas_dict:
                    schema_copy = deepcopy(schemas_dict[b])
                    utils.update_schema_with_params(schema_copy,
                                                    t.config['params'][b])
                    schema_with_params[b] = schema_copy
                hashes.append(repr(schema_with_params))

            paths = [t._checkpoint.value for t in trials]

            # Mask out error trials
            mask = [True] * len(trials)
            for i, trial in enumerate(trials):
                if trial.status == ray.tune.trial.Trial.ERROR:
                    mask[i] = False

            # Mask out on reduce: keep only the top-k variants
            reduce_k = self.reduce.get(block_id, None)
            if reduce_k is not None and int(reduce_k) > 0:
                # Get best
                best_trials = utils.get_best_trials(trials, topk=int(reduce_k))
                best_trial_ids = {t.trial_id for t in best_trials}
                # Mask out
                for i, trial in enumerate(trials):
                    if trial.trial_id not in best_trial_ids:
                        mask[i] = False

            trial_checkpoints = {t_hash: path for t_hash, path in zip(hashes, paths)}
            trial_mask = {t_hash: mask_value for t_hash, mask_value in zip(hashes, mask)}
            checkpoints[block_id] = {'paths': trial_checkpoints, 'mask': trial_mask}

            # Rsync workers to main machine and back to all workers
            # TODO specify callbacks. If not remote will not work
            if self.env:
                run_utils.rsync_hosts(self.env.orchestrator_ip,
                                      self.env.factories_ips,
                                      self.env.user,
                                      self.full_save_path,
                                      self.env.key,
                                      exclude=["state.pkl"])

        self.progress_state.checkpoint_end(block_id, success[block_id])
        logger.debug(f"Done running {block_id}")

    self.teardown()

    if all(success.values()):
        logger.info(cl.GR("Experiment ended successfully"))
    else:
        raise error.UnsuccessfulRunnableError(
            "Not all trials were successful. Check the logs for more information")
def _setup(self, config: Dict): """Subclasses should override this for custom initialization.""" # Set this flag to False, if we find an error, or reduce self.name = config['name'] self.run_flag = True custom_modules = config['custom_modules'] setup_default_modules() import_modules(custom_modules) # Get the current computation block target_block_id = config['to_run'] self.block_id = target_block_id # Update the schemas with the configuration schemas: Dict[str, Schema] = Schema.deserialize(config['schemas']) schemas_copy = deepcopy(schemas) global_vars = config['global_vars'] self.verbose = config['verbose'] self.hyper_params = config['hyper_params'] self.debug = config['debug'] with TrialLogging(log_dir=self.logdir, verbose=self.verbose, console_prefix=self.block_id, hyper_params=self.hyper_params, capture_warnings=True): # Compile, activate links, and load checkpoints filled_schemas: Dict = OrderedDict() for block_id, schema_block in schemas.items(): block_params = config['params'][block_id] utils.update_schema_with_params(schemas_copy[block_id], block_params) # First activate links from previous blocks in the # pipeline utils.update_link_refs(schemas_copy, block_id, global_vars) block: Component = schemas_copy[block_id]() filled_schemas[block_id] = schemas_copy[block_id] if block_id in config['checkpoints']: # Get the block hash needed_set = utils.extract_needed_blocks( schemas, block_id, global_vars) needed_blocks = ((k, v) for k, v in filled_schemas.items() if k in needed_set) block_hash = repr(OrderedDict(needed_blocks)) # Check the mask, if it's False then we end # immediately mask_value = config['checkpoints'][block_id]['mask'][ block_hash] if mask_value is False: self.run_flag = False return # There should be a checkpoint checkpoint = config['checkpoints'][block_id]['paths'][ block_hash] state = load_state_from_file(checkpoint) block.load_state(state) # Holding compiled objects alongside schemas is okay # but not fully expressed in our type annotations. 
# TODO: fix this in our utils type annotations schemas_copy[block_id] = block # type: ignore # If everything went well, just compile self.block = schemas_copy[target_block_id] # Add tb prefix to computables in case multiple plots are # requested if not config['merge_plot']: self.block.tb_log_prefix = self.name