def _create_factories(self, number: int = 1) -> List[FactoryInsT]:
    """Create new AWS EC2 instances to be the factory instances.

    These new machines receive all tags defined in the *.ini file.
    Factory instances are named using the factory basename plus an
    index, for example "seq2seq_factory_0", "seq2seq_factory_1".

    Parameters
    ----------
    number : int
        The number of factories to be created.

    Returns
    -------
    List[FactoryInsT]
        The new factory instances.

    """
    if not self.factory_ami:
        ami = self._find_default_ami(_type="factory")
        if ami is None:
            raise errors.ClusterError("Could not find matching AMI for the factory.")
    else:
        ami = self.factory_ami

    factories = self._generic_launch_instances(
        instance.CPUFactoryInstance, number,
        self.factories_type, ami, role="Factory")

    # Launch as CPU instances first; once a host is reachable and reports
    # a GPU, rewrap it as a GPUFactoryInstance.
    for i, f in enumerate(factories):
        f.wait_until_accessible()
        if f.contains_gpu():
            factories[i] = instance.GPUFactoryInstance(
                f.host, f.private_host, f.username, self.key,
                self.config, self.debug)

    return factories
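# Hedged usage sketch: with a configured cluster object `cluster` and a
# factory basename of "seq2seq_factory", the following call would provision
# two hosts named "seq2seq_factory_0" and "seq2seq_factory_1"
# (illustrative only, not part of the public API):
#
#     factories = cluster._create_factories(number=2)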
def _existing_cluster(self) -> Tuple[Any, List[Any]]:
    """Whether there is an existing cluster that matches this name.

    The cluster should also match all other tags, including Creator.

    Returns
    -------
    Tuple[Any, List[Any]]
        The (boto_orchestrator, [boto_factories]) that match the
        experiment's name.

    """
    candidates: List[Tuple[Any, str]] = []
    for ins, role, cluster_name in self.flambe_own_running_instances():
        if role and cluster_name:
            if cluster_name == self.name:
                candidates.append((ins, role))
                logger.debug(f"Found existing {role} host {ins.public_ip_address}")

    orchestrator = None
    factories = []

    for ins, role in candidates:
        if role == 'Orchestrator':
            if orchestrator:
                raise errors.ClusterError(
                    "Found 2 Orchestrator instances with the same experiment name. " +
                    "This should never happen. " +
                    "Please remove manually all instances with tag " +
                    f"'Cluster-Name': '{self.name}' and retry."
                )
            orchestrator = ins
        elif role == 'Factory':
            factories.append(ins)

    return orchestrator, factories
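# Hedged usage sketch: callers typically consult _existing_cluster before
# provisioning anything new, roughly as load_all_instances does below
# (illustrative, assuming `self` is a configured cluster):
#
#     orchestrator, factories = self._existing_cluster()
#     if orchestrator is None:
#         orchestrator = self._create_orchestrator()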
def _create_orchestrator(self) -> instance.OrchestratorInstance:
    """Create a new EC2 instance to be the Orchestrator instance.

    This new machine receives all tags defined in the *.ini file.

    Returns
    -------
    instance.OrchestratorInstance
        The new orchestrator instance.

    """
    if not self.orchestrator_ami:
        ami = self._find_default_ami(_type="orchestrator")
        if ami is None:
            raise errors.ClusterError(
                "Could not find matching AMI for the orchestrator.")
    else:
        ami = self.orchestrator_ami

    return self._generic_launch_instances(
        instance.OrchestratorInstance, 1,
        self.orchestrator_type, ami, role="Orchestrator")[0]
def remove_dir(self, _dir: str, content_only: bool = True, all_hosts: bool = True) -> None:
    """Remove a directory in the cluster.

    Parameters
    ----------
    _dir : str
        The directory to remove.
    content_only : bool
        Whether to remove only the content, or the folder as well.
        Defaults to True.
    all_hosts : bool
        Whether to remove it in all hosts or only in the Orchestrator.
        Defaults to True (all hosts).

    """
    if not self.orchestrator:
        raise man_errors.ClusterError("Orchestrator instance was not loaded.")

    if all_hosts:
        for ins in self._get_all_hosts():
            ins.remove_dir(_dir, content_only)
    else:
        self.orchestrator.remove_dir(_dir, content_only)
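# Hedged usage sketch: clearing a previous run's output on every host while
# keeping the directory itself in place (illustrative path, not a default):
#
#     cluster.remove_dir("my_experiment/output", content_only=True, all_hosts=True)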
def send_local_content(self,
                       content: Dict[str, str],
                       dest: str,
                       all_hosts: bool = False) -> Dict[str, str]:
    """Send local content to the cluster.

    Parameters
    ----------
    content : Dict[str, str]
        The dict of resources: key -> local path.
    dest : str
        The orchestrator's destination folder.
    all_hosts : bool
        If False, only send the content to the orchestrator.
        If True, send it to all factories as well.

    Returns
    -------
    Dict[str, str]
        The new dict of content with the orchestrator's paths.

    """
    ret = {}

    # The orchestrator needs to exist at this point
    if not self.orchestrator:
        raise man_errors.ClusterError("Orchestrator instance was not loaded.")

    for k, c in content.items():
        c = os.path.expanduser(c)
        base: str = ""
        if os.path.exists(c):
            size = get_size_MB(c)
            if size > UPLOAD_WARN_LIMIT_MB:
                logger.info(cl.YE(
                    f"Uploading '{c}' ({int(size)} MB) which may take a while. " +
                    "Double check you want to be transferring this file " +
                    "(note we automatically sync extensions, experiment resources " +
                    "and potentially the flambe repo if installed in dev mode)"))
            if os.path.isdir(c):
                # Ensure a trailing separator so rsync copies the directory's
                # contents rather than nesting the directory itself.
                if not c.endswith(os.sep):
                    c = f"{c}{os.sep}"
                base = os.path.basename(os.path.dirname(c))
            elif os.path.isfile(c):
                base = os.path.basename(c)
            new_c = os.path.join(dest, f"{k}__{base}")
            self.orchestrator.send_rsync(c, new_c)
            logger.debug(f"Content {k}: {c} sent to cluster")
            ret[k] = new_c
        else:
            ret[k] = c

    if all_hosts:
        self.rsync_orch(dest)

    return ret
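# `get_size_MB` above is imported from elsewhere in the codebase. A minimal
# sketch of what such a helper could look like (an assumption for
# illustration, not the project's actual implementation):

def _get_size_MB_sketch(path: str) -> float:
    """Return the size of a file or directory tree in megabytes."""
    import os  # already imported module-wide; repeated so the sketch stands alone
    if os.path.isfile(path):
        return os.path.getsize(path) / (1024 ** 2)
    total = 0
    for root, _dirs, files in os.walk(path):
        for name in files:
            total += os.path.getsize(os.path.join(root, name))
    return total / (1024 ** 2)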
def setup(self, cluster: Cluster, extensions: Dict[str, str], force: bool, **kwargs) -> None:
    """Prepare the cluster for the Experiment's remote execution.

    This involves:

    1) [Optional] Kill the previous flambe execution
    2) [Optional] Remove existing results
    3) Create supporting dirs (exp/synced_results, exp/resources)
    4) Install extensions in all factories
    5) Launch the ray cluster
    6) Send resources
    7) Launch Tensorboard + the report site

    Parameters
    ----------
    cluster : Cluster
        The cluster where this Runnable will be running.
    extensions : Dict[str, str]
        The ClusterRunnable extensions.
    force : bool
        The force value provided to Flambe.

    """
    if cluster.existing_flambe_execution() or cluster.existing_ray_cluster():
        if not force:
            raise man_errors.ClusterError(
                "This cluster is currently used by another " +
                "experiment. Use the --force flag to reuse it. Aborting.")
        else:
            cluster.shutdown_flambe_execution()
            cluster.shutdown_ray_cluster()
            logger.info(cl.YE("Forced resource to become available..."))

    output_dir_remote = f"{self.name}/{self.output_folder_name}"
    if cluster.existing_dir(output_dir_remote):
        logger.debug("This cluster already ran an experiment " +
                     "with the same name.")

        if self.resume:
            logger.info(cl.YE("Resuming previous experiment..."))
        elif force:
            cluster.remove_dir(output_dir_remote, content_only=True, all_hosts=True)
        else:
            raise man_errors.ClusterError(
                "This cluster already has results for the same experiment name. " +
                "If you wish to reuse them, use resume: True, or if you want to " +
                "override them, use --force. Aborting.")

    cluster.install_extensions_in_factories(extensions)
    logger.info(cl.YE("Extensions installed in all factories"))

    # Redundant check to satisfy the type checker
    if not cluster.orchestrator:
        raise man_errors.ClusterError("The orchestrator needs to exist at this point")

    cluster.create_dirs([
        self.name,
        f"{self.name}/{self.output_folder_name}",
        f"{self.name}/{self.output_folder_name}/_resources"
    ])
    logger.info(cl.YE("Created supporting directories"))

    cluster.launch_ray_cluster()
    if not cluster.check_ray_cluster():
        raise man_errors.ClusterError("Ray cluster not launched correctly.")

    local_resources = {k: v for k, v in self.resources.items()
                       if not isinstance(v, ClusterResource)}

    tmp_resources_dir = tempfile.TemporaryDirectory()

    # This will download remote resources.
    local_resources = self.process_resources(
        local_resources, tmp_resources_dir.name)  # type: ignore

    local_resources = cast(Dict[str, str], local_resources)

    if local_resources:
        new_resources = cluster.send_local_content(
            local_resources,
            os.path.join(cluster.orchestrator.get_home_path(), self.name,
                         self.output_folder_name, "_resources"),
            all_hosts=True)
    else:
        new_resources = dict()

    tmp_resources_dir.cleanup()

    # Add the cluster resources without the tag
    new_resources.update({k: v.location for k, v in self.resources.items()
                          if isinstance(v, ClusterResource)})

    if cluster.orchestrator.is_tensorboard_running():
        if force:
            cluster.orchestrator.remove_tensorboard()
        else:
            raise man_errors.ClusterError("Tensorboard was running on the orchestrator.")

    cluster.orchestrator.launch_tensorboard(output_dir_remote, const.TENSORBOARD_PORT)

    if cluster.orchestrator.is_report_site_running():
        if force:
            cluster.orchestrator.remove_report_site()
        else:
            raise man_errors.ClusterError("Report site was running on the orchestrator.")

    cluster.orchestrator.launch_report_site(
        f"{output_dir_remote}/state.pkl",
        port=const.REPORT_SITE_PORT,
        output_log="output.log",
        output_dir=output_dir_remote,
        tensorboard_port=const.TENSORBOARD_PORT)

    self.set_serializable_attr("resources", new_resources)
    self.set_serializable_attr("devices", cluster.get_max_resources())
    self.set_serializable_attr(
        "save_path", f"{cluster.orchestrator.get_home_path()}/{self.name}")
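# The existing-results branch at the top of setup() reduces to a small
# decision rule. A standalone restatement (hypothetical helper, named here
# only for clarity):

def _existing_results_action_sketch(resume: bool, force: bool) -> str:
    """What setup() does when the remote output dir already exists."""
    if resume:
        return "resume"  # keep the results and continue the experiment
    if force:
        return "wipe"    # remove_dir(..., content_only=True, all_hosts=True)
    return "abort"       # raise man_errors.ClusterError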
def load_all_instances(self) -> None:
    """Launch all instances for the experiment.

    This method launches both the orchestrator and the factories.

    """
    boto_orchestrator, boto_factories = self._existing_cluster()

    with ThreadPoolExecutor() as executor:
        future_orch, future_factories = None, None

        if boto_orchestrator:
            self.orchestrator = self.get_orchestrator(
                self._get_boto_public_host(boto_orchestrator),
                self._get_boto_private_host(boto_orchestrator))
            logger.info(cl.BL(
                f"Found existing orchestrator ({boto_orchestrator.instance_type}) " +
                f"{self.orchestrator.host}"))
        else:
            future_orch = executor.submit(self._create_orchestrator)

        for f in boto_factories:
            factory = self.get_factory(self._get_boto_public_host(f),
                                       self._get_boto_private_host(f))
            if factory.contains_gpu():
                factory = self.get_gpu_factory(self._get_boto_public_host(f),
                                               self._get_boto_private_host(f))

            self.factories.append(factory)

        if len(self.factories) > 0:
            logger.info(cl.BL(
                f"Found {len(self.factories)} existing factories " +
                f"({str([f.host for f in self.factories])})."))

        pending_new_factories = self.factories_num - len(self.factories)
        logger.debug(f"Creating {pending_new_factories} factories")

        if pending_new_factories > 0:
            future_factories = executor.submit(self._create_factories,
                                               number=pending_new_factories)
        elif pending_new_factories < 0:
            logger.info(cl.BL(f"Reusing existing {len(boto_factories)} factories."))

        try:
            if future_orch:
                self.orchestrator = future_orch.result()
                logger.info(cl.BL(f"New orchestrator created {self.orchestrator.host}"))

            if future_factories:
                new_factories = future_factories.result()
                self.factories.extend(new_factories)
                logger.info(cl.BL(
                    f"{pending_new_factories} factories {self.factories_type} created " +
                    f"({str([f.host for f in new_factories])})."))
        except botocore.exceptions.ClientError as e:
            raise errors.ClusterError(
                "Error creating the instances. Check that the provided configuration " +
                f"is correct. Original error: {e}")

    self.name_hosts()
    self.update_tags()
    self.remove_existing_events()
    self.create_cloudwatch_events()
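# load_all_instances overlaps orchestrator and factory provisioning using a
# ThreadPoolExecutor. A minimal, runnable sketch of that pattern with
# hypothetical stand-ins for the real launch calls:

from concurrent.futures import ThreadPoolExecutor  # also used by the method above

def _launch_all_sketch() -> tuple:
    def create_orchestrator() -> str:            # stand-in for _create_orchestrator
        return "orchestrator-host"

    def create_factories(number: int) -> list:   # stand-in for _create_factories
        return [f"factory-host-{i}" for i in range(number)]

    with ThreadPoolExecutor() as executor:
        future_orch = executor.submit(create_orchestrator)
        future_factories = executor.submit(create_factories, number=2)
        # .result() blocks until done and re-raises any exception from the
        # worker thread, which is why the real method wraps these calls in a
        # try/except around botocore's ClientError.
        return future_orch.result(), future_factories.result()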