def parse(self) -> None: """Parse the experiment. Parse the Experiment in search of errors that won't allow the experiment to run successfully. If it finds any error, then it raises an ParsingExperimentError. Raises ------ ParsingExperimentError In case a parsing error is found. """ # Check if name is None: if self.name is None or len(self.name) == 0: raise error.ParsingRunnableError( "Experiment should declare a name and it must not be empty" ) # Check if name is valid else: if re.match('^([a-zA-Z0-9]+[_-]*)+$', self.name) is None: raise error.ParsingRunnableError( "Experiment name should contain only alphanumeric characters " + "(with optional - or _ in between)" )
def save_s3(self, force) -> None:
    """Save an object to s3 using awscli

    Parameters
    ----------
    force: bool
        Whether to use a non-empty bucket folder or not

    """
    url = urlparse(self.destination)

    if url.scheme != 's3' or url.netloc == '':
        raise error.ParsingRunnableError(
            "When uploading to s3, destination should be: "
            "s3://<bucket-name>[/path/to/dir]")

    bucket_name = url.netloc
    s3 = self.get_boto_session().resource('s3')
    bucket = s3.Bucket(bucket_name)
    for content in bucket.objects.all():
        path = url.path[1:]  # Remove first '/'
        if content.key.startswith(path) and not force:
            raise error.ParsingRunnableError(
                f"Destination {self.destination} is not empty. "
                "Use --force to force the usage of this bucket folder or "
                "pick another destination.")

    with tempfile.TemporaryDirectory() as tmpdirname:
        flambe.save(self.compiled_component, tmpdirname, **self.serialization_args)
        try:
            subprocess.check_output(
                f"aws s3 cp --recursive {tmpdirname} {self.destination}".split(),
                stderr=subprocess.STDOUT,
                universal_newlines=True)
        except subprocess.CalledProcessError as exc:
            logger.debug(exc.output)
            raise ValueError(
                "Error uploading artifacts to s3. "
                "Check logs for more information")
        else:
            logger.info(cl.BL(f"Done uploading to {self.destination}"))
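# A minimal standalone sketch of the destination check performed in
# save_s3 above; the bucket name and prefix are hypothetical.
from urllib.parse import urlparse

url = urlparse("s3://my-bucket/path/to/dir")
assert url.scheme == "s3" and url.netloc == "my-bucket"
assert url.path[1:] == "path/to/dir"  # the key prefix compared against bucket contents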
def parse(self) -> None: """Parse the experiment. Parse the Experiment in search of errors that won't allow the experiment to run successfully. If it finds any error, then it raises an ParsingExperimentError. Raises ------ ParsingExperimentError In case a parsing error is found. """ # Check if name is None: if self.name is None or len(self.name) == 0: raise error.ParsingRunnableError( "Experiment should declare a name and it must not be empty") # Check if name is valid else: if re.match('^([a-zA-Z0-9]+[_-]*)+$', self.name) is None: raise error.ParsingRunnableError( "Experiment name should contain only alphanumeric characters " + "(with optional - or _ in between)") # Check if resources contains only local and remote if self.resources: if len( list( filter(lambda x: x not in ['local', 'remote'], self.resources.keys()))) > 0: raise error.ParsingRunnableError( f"'resources' section must contain only 'local' section and/or 'remote' keys" ) # Check if local resources exists: if self.resources and self.resources.get("local"): for v in self.resources["local"].values(): if not _is_url(v) and not os.path.exists( os.path.expanduser(v)): raise error.ParsingRunnableError( f"Local resource '{v}' does not exist.")
def save_local(self, force) -> None:
    """Save an object locally.

    Parameters
    ----------
    force: bool
        Whether to use a non-empty folder or not

    """
    if (os.path.exists(self.destination) and
            os.listdir(self.destination) and not force):
        raise error.ParsingRunnableError(
            f"Destination {self.destination} folder is not empty. "
            "Use --force to force the usage of this folder or "
            "pick another destination.")

    flambe.save(self.compiled_component, self.destination, **self.serialization_args)
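# A minimal sketch of the non-empty-destination guard in save_local above,
# runnable on its own; the file name is hypothetical.
import os
import tempfile

with tempfile.TemporaryDirectory() as destination:
    open(os.path.join(destination, 'model.pt'), 'w').close()
    non_empty = os.path.exists(destination) and bool(os.listdir(destination))
    assert non_empty  # save_local would raise here unless force=True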
def first_parse(self) -> Tuple[str, Dict]:
    """Check if valid YAML file and also load config

    In this first parse the runnable does not get compiled because
    it could be a custom Runnable, so it needs the extensions to be
    imported first.

    """
    if not os.path.exists(self.yaml_file):
        raise FileNotFoundError(
            f"Configuration file '{self.yaml_file}' not found. Terminating.")

    with open(self.yaml_file, 'r') as f:
        content = f.read()

    try:
        yamls = list(yaml.load_all(content))
    except TypeError as e:
        raise error.ParsingRunnableError(
            f"Syntax error compiling the runnable: {str(e)}")

    if len(yamls) > 2:
        raise ValueError(
            f"{self.yaml_file} should contain an (optional) extensions section "
            "and the main runnable object.")

    extensions: Dict[str, str] = {}
    if len(yamls) == 2:
        extensions = dict(yamls[0])

    # We want self.content to be a string with the raw content.
    # We will precompile later once all extensions are registered.
    with StringIO() as stream:
        yaml.dump(yamls[-1], stream)
        content = stream.getvalue()

    return content, extensions
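# A sketch of the two-document layout first_parse expects, using plain
# PyYAML for illustration (flambe's actual loader and tags may differ;
# the config content here is hypothetical).
import yaml

config = """\
my_extension: /path/to/extension
---
name: demo
"""
docs = list(yaml.safe_load_all(config))
assert len(docs) == 2  # optional extensions section + main runnable object
extensions, runnable = dict(docs[0]), docs[-1]
assert extensions == {'my_extension': '/path/to/extension'}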
def run(self, force: bool = False, verbose: bool = False,
        debug: bool = False, **kwargs):
    """Run an Experiment"""
    logger.info(cl.BL("Launching local experiment"))

    # Check if save_path/name already exists and is not empty,
    # while force and resume are both False
    if (os.path.exists(self.full_save_path) and
            os.listdir(self.full_save_path) and
            not self.resume and not force):
        raise error.ParsingRunnableError(
            "Results from an experiment with the same name were located in "
            f"the save path {self.full_save_path}. To override these results, "
            "please use '--force'. To use these results and resume the "
            "experiment, pick 'resume: True'. If not, just pick another "
            "save_path/name.")

    full_save_path = self.full_save_path

    if not self.env:
        wording.print_useful_local_info(full_save_path)

    # If running remotely then all folders were already created
    # in the 'setup' method.
    if not self.env:
        if os.path.exists(full_save_path) and force:
            shutil.rmtree(full_save_path)  # This deletes the folder as well
            logger.info(cl.RE(f"Removed previously existing results from "
                              f"{full_save_path} as --force was specified"))

        if not os.path.exists(full_save_path):
            os.makedirs(full_save_path)
            logger.debug(f"{full_save_path} created to store output")

    self._dump_experiment_file()

    if any(map(lambda x: isinstance(x, ClusterResource), self.resources.values())):
        raise ValueError(
            "Local experiments don't support resources with '!cluster' tags. "
            "The '!cluster' tag is used for those resources that need to be "
            "handled in the cluster when running remote experiments.")

    if not self.env:
        self.tmp_resources_dir = tempfile.TemporaryDirectory()
        resources_folder = self.tmp_resources_dir.name
    else:
        resources_folder = f"{self.full_save_path}/_resources"

    resources = self.process_resources(self.resources, resources_folder)

    # rsync downloaded resources
    if self.env:
        run_utils.rsync_hosts(self.env.orchestrator_ip,
                              self.env.factories_ips,
                              self.env.user,
                              self.full_save_path,
                              self.env.key,
                              exclude=["state.pkl"])

    # Check that links are in order (i.e. topologically sorted in the pipeline)
    utils.check_links(self.pipeline, resources)

    # Check that only computable blocks are given
    # search algorithms and schedulers
    utils.check_search(self.pipeline, self.search, self.schedulers)

    # Initialize ray cluster
    kwargs = {"logging_level": logging.ERROR, "include_webui": False}
    if debug:
        kwargs['local_mode'] = True

    if self.env:
        ray.init(redis_address=f"{self.env.orchestrator_ip}:{const.RAY_REDIS_PORT}",
                 **kwargs)
    else:
        ray.init(**kwargs)
    logger.debug("Ray cluster up")

    # Initialize map from block to list of checkpoints.
    # This is used when resolving links over other computable blocks.
    # TODO: in python 3.7 we can replace these with dict() or {}
    checkpoints: OrderedDict = OrderedDict()
    schemas: OrderedDict = OrderedDict()
    success: OrderedDict = OrderedDict()

    # By default use all CPUs if no GPU is present
    devices = self.devices if self.devices else None
    if devices is None and utils.local_has_gpu():
        devices = {"cpu": 4, "gpu": 1}

    to_resume = None
    if isinstance(self.resume, str):
        index = list(self.pipeline.keys()).index(self.resume)
        to_resume = list(self.pipeline.keys())[:index + 1]
    elif isinstance(self.resume, Sequence):
        to_resume = list(self.resume)

    # Make experiment_tag easier to extract
    def trial_name_creator(trial):
        identifier = ""
        if "env" in trial.config:
            env = trial.config["env"]
            if isinstance(env, type):
                env = env.__name__
            identifier += f"{env}"
        if trial.experiment_tag:
            hyper_params = {}
            if "_" in trial.experiment_tag:
                num, tunable_params = trial.experiment_tag.split("_", 1)
                identifier += tunable_params
                param_list = [p.split("=") for p in tunable_params.split(",")]
                hyper_params = {p[0]: p[1] for p in param_list}
            else:
                identifier += trial.experiment_tag
            trial.config['hyper_params'] = hyper_params
        return identifier.replace("/", "_")

    trial_name_creator = ray.tune.function(trial_name_creator)

    # Compute dependencies DAG
    dependency_dag = {}
    schemas_dag: OrderedDict = OrderedDict()
    for block_id, schema_block in self.pipeline.items():
        schemas_dag[block_id] = schema_block
        relevant_ids = utils.extract_needed_blocks(schemas_dag, block_id, resources)
        dependencies = deepcopy(relevant_ids)
        dependencies.discard(block_id)
        dependency_dag[block_id] = list(dependencies)

    if self.env:
        self.progress_state = ProgressState(
            self.name, full_save_path, dependency_dag,
            self.content, len(self.env.factories_ips))
    else:
        self.progress_state = ProgressState(
            self.name, full_save_path, dependency_dag, self.content)

    for block_id, schema_block in tqdm(self.pipeline.items()):
        schema_block.add_extensions_metadata(self.extensions)
        logger.debug(f"Starting {block_id}")

        # Add the block to the configuration so far
        schemas[block_id] = schema_block
        success[block_id] = True

        self.progress_state.checkpoint_start(block_id)
        relevant_ids = utils.extract_needed_blocks(schemas, block_id, resources)
        relevant_schemas = {k: v for k, v in deepcopy(schemas).items()
                            if k in relevant_ids}

        # Set resume
        resume = False if to_resume is None else (block_id in to_resume)

        # If computable, convert to tune.Trainable.
        # Each Component block is an Experiment in ray.tune
        if not isinstance(schema_block, Schema):
            raise ValueError('schema block not of correct type Schema')
        if issubclass(schema_block.component_subclass, Component):

            # Returns a list of non-nested configurations
            divided_schemas = list(
                utils.divide_nested_grid_search_options(relevant_schemas))
            divided_dict = [utils.extract_dict(x) for x in divided_schemas]
            # Convert options and links
            divided_dict_tune = [utils.convert_tune(x) for x in divided_dict]

            # Execute block
            tune_experiments = []
            for param_dict, schemas_dict in zip(divided_dict_tune, divided_schemas):
                config = {'name': block_id,
                          'merge_plot': self.merge_plot,
                          'params': param_dict,
                          'schemas': Schema.serialize(schemas_dict),
                          'checkpoints': checkpoints,
                          'to_run': block_id,
                          'global_vars': resources,
                          'verbose': verbose,
                          'custom_modules': list(self.extensions.keys()),
                          'debug': debug}

                # Filter out the tensorboard logger as we handle
                # general and tensorboard-specific logging ourselves
                tune_loggers = list(filter(
                    lambda l: l != tf2_compat_logger and  # noqa: E741
                    not issubclass(l, TFLogger),
                    DEFAULT_LOGGERS))

                tune_experiment = ray.tune.Experiment(
                    name=block_id,
                    run=TuneAdapter,
                    trial_name_creator=trial_name_creator,
                    config=deepcopy(config),
                    local_dir=full_save_path,
                    checkpoint_freq=1,
                    checkpoint_at_end=True,
                    max_failures=self.max_failures,
                    resources_per_trial=devices,
                    loggers=tune_loggers)
                logger.debug(f"Created tune.Experiment for {param_dict}")
                tune_experiments.append(tune_experiment)

            trials = ray.tune.run_experiments(
                tune_experiments,
                search_alg=self.search.get(block_id, None),
                scheduler=self.schedulers.get(block_id, None),
                queue_trials=True,
                verbose=False,
                resume=resume,
                raise_on_failed_trial=False)
            logger.debug(f"Finished running all tune.Experiments for {block_id}")

            any_error = False
            for t in trials:
                if t.status == t.ERROR:
                    logger.error(cl.RE(
                        f"Variant {t} of '{block_id}' ended with ERROR status."))
                    success[block_id] = False
                    any_error = True

            if any_error and self.stop_on_failure:
                self.teardown()
                self.progress_state.checkpoint_end(block_id, success[block_id])
                raise error.UnsuccessfulRunnableError(
                    f"Stopping experiment at block '{block_id}' "
                    "because there was an error and stop_on_failure == True.")

            # Save checkpoint location.
            # It should point from:
            # block_id -> hash(variant) -> checkpoint
            hashes = []
            for t in trials:
                schema_with_params: Dict = OrderedDict()
                for b in schemas_dict:
                    schema_copy = deepcopy(schemas_dict[b])
                    utils.update_schema_with_params(schema_copy,
                                                    t.config['params'][b])
                    schema_with_params[b] = schema_copy
                hashes.append(repr(schema_with_params))

            paths = [t._checkpoint.value for t in trials]

            # Mask out error trials
            mask = [True] * len(trials)
            for i, trial in enumerate(trials):
                if trial.status == ray.tune.trial.Trial.ERROR:
                    mask[i] = False

            # Mask out on reduce
            reduce_k = self.reduce.get(block_id, None)
            if reduce_k is not None and int(reduce_k) > 0:
                # Get the best trials
                best_trials = utils.get_best_trials(trials, topk=int(reduce_k))
                best_trial_ids = set([t.trial_id for t in best_trials])
                # Mask out the rest
                for i, trial in enumerate(trials):
                    if trial.trial_id not in best_trial_ids:
                        mask[i] = False

            trial_checkpoints = {t_hash: path
                                 for t_hash, path in zip(hashes, paths)}
            trial_mask = {t_hash: mask_value
                          for t_hash, mask_value in zip(hashes, mask)}
            checkpoints[block_id] = {'paths': trial_checkpoints,
                                     'mask': trial_mask}

            # Rsync workers to main machine and back to all workers
            # TODO: specify callbacks. If not remote, this will not work
            if self.env:
                run_utils.rsync_hosts(self.env.orchestrator_ip,
                                      self.env.factories_ips,
                                      self.env.user,
                                      self.full_save_path,
                                      self.env.key,
                                      exclude=["state.pkl"])

        self.progress_state.checkpoint_end(block_id, success[block_id])
        logger.debug(f"Done running {block_id}")

    self.teardown()

    if all(success.values()):
        logger.info(cl.GR("Experiment ended successfully"))
    else:
        raise error.UnsuccessfulRunnableError(
            "Not all trials were successful. Check the logs for more information")
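# A standalone sketch of the experiment-tag parsing done inside
# trial_name_creator above; the tag value is hypothetical but follows the
# "<num>_<key>=<value>,..." shape that ray.tune experiment tags use.
tag = "0_lr=0.001,dropout=0.5"
num, tunable_params = tag.split("_", 1)
hyper_params = {k: v for k, v in (p.split("=") for p in tunable_params.split(","))}
assert hyper_params == {"lr": "0.001", "dropout": "0.5"}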
def setup(self, cluster: Cluster, extensions: Dict[str, str],
          force: bool, **kwargs) -> None:
    """Prepare the cluster for the Experiment remote execution.

    This involves:

    1) [Optional] Kill previous flambe execution
    2) [Optional] Remove existing results
    3) Create supporting dirs (exp/synced_results, exp/resources)
    4) Install extensions in all factories
    5) Launch ray cluster
    6) Send resources
    7) Launch Tensorboard + Report site

    Parameters
    ----------
    cluster: Cluster
        The cluster where this Runnable will be running
    extensions: Dict[str, str]
        The ClusterRunnable extensions
    force: bool
        The force value provided to Flambe

    """
    if self.debug:
        raise error.ParsingRunnableError(
            "Remote experiments don't support debug mode. "
            "Remove 'debug: True' to run this experiment in a cluster.")

    if cluster.existing_flambe_execution() or cluster.existing_ray_cluster():
        if not force:
            raise man_errors.ClusterError(
                "This cluster is currently used by another experiment. "
                "Use the --force flag to reuse it. Aborting.")
        else:
            cluster.shutdown_flambe_execution()
            cluster.shutdown_ray_cluster()
            logger.info(cl.YE("Forced resource to become available..."))

    output_dir_remote = f"{self.name}/{self.output_folder_name}"
    if cluster.existing_dir(output_dir_remote):
        logger.debug("This cluster already ran an experiment "
                     "with the same name.")

        if self.resume:
            logger.info(cl.YE("Resuming previous experiment..."))
        elif force:
            cluster.remove_dir(output_dir_remote, content_only=True, all_hosts=True)
        else:
            raise man_errors.ClusterError(
                "This cluster already has results for the same experiment name. "
                "If you wish to reuse them, use resume: True, or if you want to "
                "override them, use --force. Aborting.")

    cluster.install_extensions_in_factories(extensions)
    logger.info(cl.YE("Extensions installed in all factories"))

    # Add redundant check for typing
    if not cluster.orchestrator:
        raise man_errors.ClusterError("The orchestrator needs to exist at this point")

    cluster.create_dirs([self.name,
                         f"{self.name}/{self.output_folder_name}",
                         f"{self.name}/{self.output_folder_name}/_resources"])
    logger.info(cl.YE("Created supporting directories"))

    cluster.launch_ray_cluster()

    if not cluster.check_ray_cluster():
        raise man_errors.ClusterError("Ray cluster not launched correctly.")

    local_resources = {k: v for k, v in self.resources.items()
                       if not isinstance(v, ClusterResource)}

    tmp_resources_dir = tempfile.TemporaryDirectory()

    # This will download remote resources.
    local_resources = self.process_resources(
        local_resources, tmp_resources_dir.name)  # type: ignore
    local_resources = cast(Dict[str, str], local_resources)

    if local_resources:
        new_resources = cluster.send_local_content(
            local_resources,
            os.path.join(cluster.orchestrator.get_home_path(), self.name,
                         self.output_folder_name, "_resources"),
            all_hosts=True)
    else:
        new_resources = dict()

    tmp_resources_dir.cleanup()

    # Add the cluster resources without the tag
    new_resources.update({k: v.location for k, v in self.resources.items()
                          if isinstance(v, ClusterResource)})

    if cluster.orchestrator.is_tensorboard_running():
        if force:
            cluster.orchestrator.remove_tensorboard()
        else:
            raise man_errors.ClusterError(
                "Tensorboard was running on the orchestrator.")

    cluster.orchestrator.launch_tensorboard(output_dir_remote,
                                            const.TENSORBOARD_PORT)

    if cluster.orchestrator.is_report_site_running():
        if force:
            cluster.orchestrator.remove_report_site()
        else:
            raise man_errors.ClusterError(
                "Report site was running on the orchestrator.")

    cluster.orchestrator.launch_report_site(
        f"{output_dir_remote}/state.pkl",
        port=const.REPORT_SITE_PORT,
        output_log="output.log",
        output_dir=output_dir_remote,
        tensorboard_port=const.TENSORBOARD_PORT)

    self.set_serializable_attr("resources", new_resources)
    self.set_serializable_attr("devices", cluster.get_max_resources())
    self.set_serializable_attr(
        "save_path", f"{cluster.orchestrator.get_home_path()}/{self.name}")
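# A standalone sketch of the local/cluster resource split performed in
# setup above; ClusterResource is stubbed here for illustration (flambe
# defines the real class) and the resource values are hypothetical.
class ClusterResource:
    def __init__(self, location: str) -> None:
        self.location = location

resources = {
    "train": "~/data/train.csv",                # handled locally, then sent
    "emb": ClusterResource("/shared/emb.txt"),  # resolved on the cluster
}
local = {k: v for k, v in resources.items()
         if not isinstance(v, ClusterResource)}
remote = {k: v.location for k, v in resources.items()
          if isinstance(v, ClusterResource)}
assert local == {"train": "~/data/train.csv"}
assert remote == {"emb": "/shared/emb.txt"}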