Beispiel #1
0
    def load_all_instances(self) -> None:
        """Bring up every instance needed by the experiment.

        Instances returned by ``self._existing_cluster()`` are wrapped
        and reused. A missing orchestrator and any missing factories
        are created through a thread pool. Once all instances are
        loaded, hosts are named, tags updated and the cloudwatch
        events removed and created again.

        """
        existing_orch, existing_factories = self._existing_cluster()

        with ThreadPoolExecutor() as pool:
            orch_future = None
            factories_future = None

            if not existing_orch:
                # No orchestrator found: create one in the background
                orch_future = pool.submit(self._create_orchestrator)
            else:
                self.orchestrator = self.get_orchestrator(
                    existing_orch.public_ip_address,
                    existing_orch.private_ip_address,
                )
                logger.info(cl.BL(
                    f"Found existing orchestrator ({existing_orch.instance_type}) "
                    f"{self.orchestrator.host}"
                ))

            for boto_factory in existing_factories:
                public_ip = boto_factory.public_ip_address
                private_ip = boto_factory.private_ip_address
                wrapped = self.get_factory(public_ip, private_ip)
                # Swap in the GPU flavour when the instance has GPUs
                if wrapped.contains_gpu():
                    wrapped = self.get_gpu_factory(public_ip, private_ip)
                self.factories.append(wrapped)

            existing_count = len(self.factories)
            if existing_count > 0:
                existing_hosts = str([f.host for f in self.factories])
                logger.info(cl.BL(
                    f"Found {existing_count} existing factories "
                    f"({existing_hosts})."
                ))

            pending_new_factories = self.factories_num - existing_count
            logger.debug(f"Creating {pending_new_factories} factories")

            if pending_new_factories > 0:
                factories_future = pool.submit(
                    self._create_factories, number=pending_new_factories)
            elif pending_new_factories < 0:
                logger.info(cl.BL(
                    f"Reusing existing {len(existing_factories)} factories."))

            # Collect the results of the background creations (if any)
            if orch_future is not None:
                self.orchestrator = orch_future.result()
                logger.info(cl.BL(
                    f"New orchestrator created {self.orchestrator.host}"))

            if factories_future is not None:
                created = factories_future.result()
                self.factories.extend(created)
                created_hosts = str([f.host for f in created])
                logger.info(cl.BL(
                    f"{pending_new_factories} factories {self.factories_type} created "
                    f"({created_hosts})."
                ))

        self.name_hosts()
        self.update_tags()
        self.remove_existing_events()
        self.create_cloudwatch_events()
Beispiel #2
0
    def launch_report_site(self, progress_file: str, port: int,
                           output_log: str, output_dir: str,
                           tensorboard_port: int) -> None:
        """Launch the report site.

        The report site is a Flask web app. It is started with the
        ``flambe-site`` command inside a detached tmux session named
        'flambe-site', appending its output to ``outputsite.log``.

        Parameters
        ----------
        progress_file: str
            Positional argument given to ``flambe-site``
        port: int
            Port the site binds to (host is 0.0.0.0)
        output_log: str
            Value for the ``--output-log`` option
        output_dir: str
            Value for the ``--output-dir`` option
        tensorboard_port: int
            Port used to build the tensorboard url that is given
            to the site

        Raises
        ------
        RemoteCommandError
            In case the launch process fails

        """
        tensorboard_url = f"http://{self.host}:{tensorboard_port}"

        cmd = (
            f"tmux new-session -d -s 'flambe-site' 'bash -lc \"flambe-site {progress_file} "
            f"--tensorboard_url {tensorboard_url} "
            f"--host 0.0.0.0 --port {port} "
            f"--output-dir {output_dir} "
            f"--output-log {output_log} &>> outputsite.log\"'")

        res = self._run_cmd(cmd)

        # Sometimes tmux command returns failure (because of some
        # timeout) but website is running.
        # The extra check is therefore a fallback: the launch is
        # considered successful if EITHER the command succeeded OR the
        # site is found running. (Using 'and' here would still raise
        # on the spurious tmux failures described above.)
        if res.success or self.is_report_site_running():
            logger.info(cl.BL(f"Report site at http://{self.host}:{port}"))
        else:
            raise errors.RemoteCommandError(
                f"Report site failed to run. {res.msg}")
Beispiel #3
0
    def save_s3(self, force: bool) -> None:
        """Save an object to s3 using awscli

        The compiled component is first saved into a temporary local
        directory, which is then uploaded with ``aws s3 cp --recursive``.

        Parameters
        ----------
        force: bool
            Whether to use a non-empty bucket folder or not

        Raises
        ------
        ParsingRunnableError
            If the destination is not of the form
            ``s3://<bucket-name>[/path/to/dir]``, or if it already
            contains objects and ``force`` is False.
        ValueError
            If the ``aws s3 cp`` command fails.

        """
        url = urlparse(self.destination)

        if url.scheme != 's3' or url.netloc == '':
            raise error.ParsingRunnableError(
                "When uploading to s3, destination should be: " +
                "s3://<bucket-name>[/path/to/dir]")

        # The emptiness check only matters when not forcing, so avoid
        # listing the bucket at all otherwise.
        if not force:
            prefix = url.path[1:]  # Remove first '/'
            s3 = self.get_boto_session().resource('s3')
            bucket = s3.Bucket(url.netloc)
            # Let s3 filter by prefix instead of walking every object
            # in the bucket; stop at the first match.
            if any(True for _ in bucket.objects.filter(Prefix=prefix)):
                raise error.ParsingRunnableError(
                    f"Destination {self.destination} is not empty. " +
                    "Use --force to force the usage of this bucket folder or "
                    + "pick another destination.")

        with tempfile.TemporaryDirectory() as tmpdirname:
            flambe.save(self.compiled_component, tmpdirname,
                        **self.serialization_args)
            # Build the argument list explicitly: splitting a formatted
            # string would break paths/destinations containing spaces.
            upload_cmd = ["aws", "s3", "cp", "--recursive",
                          tmpdirname, self.destination]
            try:
                subprocess.check_output(
                    upload_cmd,
                    stderr=subprocess.STDOUT,
                    universal_newlines=True)
            except subprocess.CalledProcessError as exc:
                logger.debug(exc.output)
                raise ValueError("Error uploading artifacts to s3. " +
                                 "Check logs for more information") from exc
            else:
                logger.info(cl.BL(f"Done uploading to {self.destination}"))
Beispiel #4
0
    def execute(self,
                cluster_runnable,
                extensions: Dict[str, str],
                new_secrets: str,
                force: bool) -> None:
        """Run a ClusterRunnable on the cluster.

        The extensions and the runnable are dumped into a single YAML
        file, which is sent to the orchestrator as
        ``<orchestrator home>/flambe.yaml``. Flambe is then launched
        there with that file and the given secrets.

        Parameters
        ----------
        cluster_runnable: ClusterRunnable
            The ClusterRunnable to run in the cluster
        extensions: Dict[str, str]
            The extensions for the ClusterRunnable
        new_secrets: str
            The path (relative to the orchestrator) where
            the secrets are located.
            IMPORTANT: the secrets must already have been uploaded
            to the orchestrator before calling this method
        force: bool
            The force parameter provided when running flambe locally

        Raises
        ------
        ClusterError
            If the orchestrator instance was not loaded

        """
        orchestrator = self.orchestrator
        if not orchestrator:
            raise man_errors.ClusterError("Orchestrator instance was not loaded.")

        remote_yaml = f"{orchestrator.get_home_path()}/flambe.yaml"

        # Serialize extensions + runnable as one multi-document stream
        with StringIO() as buffer:
            yaml.dump_all([extensions, cluster_runnable], buffer)
            serialized = buffer.getvalue()

        with tempfile.NamedTemporaryFile("w") as tmp_file:
            tmp_file.write(serialized)
            # Make sure the content is on disk before rsync reads it
            tmp_file.flush()
            orchestrator.send_rsync(tmp_file.name, remote_yaml)
            logger.info(cl.BL("Remote runnable file sent to orchestrator"))

        orchestrator.launch_flambe(remote_yaml, new_secrets, force)
Beispiel #5
0
    def run(self,
            force: bool = False,
            verbose: bool = False,
            debug: bool = False,
            **kwargs) -> None:
        """Run an Experiment

        Validates the save path, prepares the output folder (only when
        ``self.env`` is not set), processes the resources, initializes
        ray and then executes every block of the pipeline in order
        through ray.tune, recording progress and checkpoints for each
        block.

        Parameters
        ----------
        force: bool
            If True, a non-empty save path does not abort the run.
            When ``self.env`` is not set the existing folder is
            removed before running.
        verbose: bool
            Forwarded to every block through the tune config.
        debug: bool
            Forwarded to every block through the tune config. It also
            sets ``local_mode`` when initializing ray.
        kwargs
            Not used: the name is rebound to the ray init arguments
            inside the method.

        Raises
        ------
        ParsingRunnableError
            If the save path exists and is not empty while neither
            ``force`` nor ``self.resume`` are set.
        ValueError
            If any resource is a ClusterResource, or if a pipeline
            block is not a Schema.
        UnsuccessfulRunnableError
            If a trial ended with error and ``self.stop_on_failure``
            is set, or if at the end not all blocks were successful.

        """

        logger.info(cl.BL("Launching local experiment"))

        # Check if save_path/name already exists + is not empty
        # + force and resume are False
        if (os.path.exists(self.full_save_path)
                and os.listdir(self.full_save_path) and not self.resume
                and not force):
            raise error.ParsingRunnableError(
                f"Results from an experiment with the same name were located in the save path "
                +
                f"{self.full_save_path}. To overide this results, please use '--force' "
                +
                "To use these results and resume the experiment, pick 'resume: True' "
                + "If not, just pick another save_path/name.")

        full_save_path = self.full_save_path

        if not self.env:
            wording.print_useful_local_info(full_save_path)

        # If running remotely then all folders were already created
        # in the 'setup' method.
        if not self.env:
            if os.path.exists(full_save_path) and force:
                shutil.rmtree(full_save_path)  # This deletes the folder too
                logger.info(
                    cl.RE(f"Removed previous existing from {full_save_path} " +
                          "results as --force was specified"))

            # Re-create the folder (it may have just been removed above)
            if not os.path.exists(full_save_path):
                os.makedirs(full_save_path)
                logger.debug(f"{full_save_path} created to store output")

        self._dump_experiment_file()

        # Resources tagged with '!cluster' can't be handled here
        if any(
                map(lambda x: isinstance(x, ClusterResource),
                    self.resources.values())):
            raise ValueError(
                f"Local experiments doesn't support resources with '!cluster' tags. "
                +
                "The '!cluster' tag is used for those resources that need to be handled "
                + "in the cluster when running remote experiments.")

        # Without an env the resources go to a temporary directory,
        # kept alive through self.tmp_resources_dir. With an env they
        # go under the save path, which is rsynced below.
        if not self.env:
            self.tmp_resources_dir = tempfile.TemporaryDirectory()
            resources_folder = self.tmp_resources_dir.name
        else:
            resources_folder = f"{self.full_save_path}/_resources"

        resources = self.process_resources(self.resources, resources_folder)

        # rsync downloaded resources
        if self.env:
            run_utils.rsync_hosts(self.env.orchestrator_ip,
                                  self.env.factories_ips,
                                  self.env.user,
                                  self.full_save_path,
                                  self.env.key,
                                  exclude=["state.pkl"])

        # Check that links are in order (i.e topologically in pipeline)
        utils.check_links(self.pipeline, resources)

        # Check that only computable blocks are given
        # search algorithms and schedulers
        utils.check_search(self.pipeline, self.search, self.schedulers)

        # Initialize ray cluster
        # NOTE(review): this rebinds the **kwargs parameter, so any
        # keyword arguments given by the caller are discarded.
        kwargs = {"logging_level": logging.ERROR, "include_webui": False}
        if debug:
            kwargs['local_mode'] = True

        if self.env:
            # Connect to the redis address on the orchestrator
            ray.init(redis_address=
                     f"{self.env.orchestrator_ip}:{const.RAY_REDIS_PORT}",
                     **kwargs)
        else:
            ray.init(**kwargs)
            # NOTE(review): only logged in this branch (no env)
            logger.debug(f"Ray cluster up")

        # Initialize map from block to list of checkpoints
        # This is used when resolving links over other computable blocks
        # TODO: in python 3.7 we can replace these with dict() or {}
        checkpoints: OrderedDict = OrderedDict()
        schemas: OrderedDict = OrderedDict()
        success: OrderedDict = OrderedDict()

        # Use the configured devices if any. Otherwise, if a local GPU
        # is detected, use a fixed allocation; else leave it as None.
        devices = self.devices if self.devices else None
        if devices is None and utils.local_has_gpu():
            devices = {"cpu": 4, "gpu": 1}

        # Compute the list of blocks to resume: a single block name
        # means that block and all the previous ones in the pipeline;
        # a sequence is taken as is.
        to_resume = None
        if isinstance(self.resume, str):
            index = list(self.pipeline.keys()).index(self.resume)
            to_resume = list(self.pipeline.keys())[:index + 1]
        elif isinstance(self.resume, Sequence):
            to_resume = list(self.resume)

        # Make experiment_tag easier to extract
        def trial_name_creator(trial):
            # Builds the trial name from the 'env' config entry (if
            # present) and the experiment tag. As a side effect, the
            # parsed hyper parameters are stored in the trial config.
            identifier = ""
            if "env" in trial.config:
                env = trial.config["env"]
                if isinstance(env, type):
                    env = env.__name__
                identifier += f"{env}"
            if trial.experiment_tag:
                hyper_params = {}
                if "_" in trial.experiment_tag:
                    # Tag looks like '<num>_<k1>=<v1>,<k2>=<v2>,...'
                    num, tunable_params = trial.experiment_tag.split("_", 1)
                    identifier += tunable_params
                    param_list = [
                        p.split("=") for p in tunable_params.split(",")
                    ]
                    hyper_params = {p[0]: p[1] for p in param_list}
                else:
                    identifier += trial.experiment_tag
                trial.config['hyper_params'] = hyper_params
            return identifier.replace("/", "_")

        trial_name_creator = ray.tune.function(trial_name_creator)

        # Compute dependencies DAG: for every block, the list of the
        # other blocks it needs
        dependency_dag = {}
        schemas_dag: OrderedDict = OrderedDict()
        for block_id, schema_block in self.pipeline.items():
            schemas_dag[block_id] = schema_block
            relevant_ids = utils.extract_needed_blocks(schemas_dag, block_id,
                                                       resources)
            dependencies = deepcopy(relevant_ids)
            dependencies.discard(block_id)

            dependency_dag[block_id] = list(dependencies)

        # With an env, the number of factories is also given
        if self.env:
            self.progress_state = ProgressState(self.name, full_save_path,
                                                dependency_dag, self.content,
                                                len(self.env.factories_ips))
        else:
            self.progress_state = ProgressState(self.name, full_save_path,
                                                dependency_dag, self.content)

        for block_id, schema_block in tqdm(self.pipeline.items()):
            schema_block.add_extensions_metadata(self.extensions)
            logger.debug(f"Starting {block_id}")

            # Add the block to the configuration so far
            schemas[block_id] = schema_block
            # Assume success; flipped to False below on any trial error
            success[block_id] = True

            self.progress_state.checkpoint_start(block_id)
            relevant_ids = utils.extract_needed_blocks(schemas, block_id,
                                                       resources)
            # Work on copies of the schemas needed by this block
            relevant_schemas = {
                k: v
                for k, v in deepcopy(schemas).items() if k in relevant_ids
            }

            # Set resume
            resume = False if to_resume is None else (block_id in to_resume)

            # If computable, convert to tune.Trainable
            # Each Component block is an Experiment in ray.tune
            if not isinstance(schema_block, Schema):
                raise ValueError('schema block not of correct type Schema')
            if issubclass(schema_block.component_subclass, Component):

                # Returns a list of non-nested configurations
                divided_schemas = list(
                    utils.divide_nested_grid_search_options(relevant_schemas))
                divided_dict = [utils.extract_dict(x) for x in divided_schemas]
                # Convert options and links
                divided_dict_tune = [
                    utils.convert_tune(x) for x in divided_dict
                ]
                # Execute block
                tune_experiments = []
                for param_dict, schemas_dict in zip(divided_dict_tune,
                                                    divided_schemas):
                    config = {
                        'name': block_id,
                        'merge_plot': self.merge_plot,
                        'params': param_dict,
                        'schemas': Schema.serialize(schemas_dict),
                        'checkpoints': checkpoints,
                        'to_run': block_id,
                        'global_vars': resources,
                        'verbose': verbose,
                        'custom_modules': list(self.extensions.keys()),
                        'debug': debug
                    }
                    # Filter out the tensorboard logger as we handle
                    # general and tensorboard-specific logging ourselves
                    tune_loggers = list(
                        filter(
                            lambda l: l != tf2_compat_logger and  # noqa: E741
                            not issubclass(l, TFLogger),
                            DEFAULT_LOGGERS))
                    tune_experiment = ray.tune.Experiment(
                        name=block_id,
                        run=TuneAdapter,
                        trial_name_creator=trial_name_creator,
                        config=deepcopy(config),
                        local_dir=full_save_path,
                        checkpoint_freq=1,
                        checkpoint_at_end=True,
                        max_failures=self.max_failures,
                        resources_per_trial=devices,
                        loggers=tune_loggers)
                    logger.debug(f"Created tune.Experiment for {param_dict}")
                    tune_experiments.append(tune_experiment)

                # Failed trials don't raise here; they are inspected
                # right below
                trials = ray.tune.run_experiments(
                    tune_experiments,
                    search_alg=self.search.get(block_id, None),
                    scheduler=self.schedulers.get(block_id, None),
                    queue_trials=True,
                    verbose=False,
                    resume=resume,
                    raise_on_failed_trial=False)
                logger.debug(
                    f"Finish running all tune.Experiments for {block_id}")

                any_error = False
                for t in trials:
                    if t.status == t.ERROR:
                        logger.error(
                            cl.
                            RE(f"Variant {t} of '{block_id}' ended with ERROR status."
                               ))
                        success[block_id] = False
                        any_error = True
                if any_error and self.stop_on_failure:
                    # NOTE(review): teardown runs before the block's end
                    # is recorded in the progress state
                    self.teardown()
                    self.progress_state.checkpoint_end(block_id,
                                                       success[block_id])
                    raise error.UnsuccessfulRunnableError(
                        f"Stopping experiment at block '{block_id}' "
                        "because there was an error and stop_on_failure == True."
                    )

                # Save checkpoint location
                # It should point from:
                # block_id -> hash(variant) -> checkpoint
                hashes = []
                for t in trials:
                    schema_with_params: Dict = OrderedDict()
                    # NOTE(review): 'schemas_dict' is the variable left
                    # over from the experiments loop above, i.e. its
                    # last value is used for every trial (and it is
                    # undefined if that loop never ran) - confirm this
                    # is intended.
                    for b in schemas_dict:
                        schema_copy = deepcopy(schemas_dict[b])
                        utils.update_schema_with_params(
                            schema_copy, t.config['params'][b])
                        schema_with_params[b] = schema_copy
                    hashes.append(repr(schema_with_params))

                # Relies on the private '_checkpoint' attribute of the
                # trial objects
                paths = [t._checkpoint.value for t in trials]

                # Mask out error trials
                mask = [True] * len(trials)
                for i, trial in enumerate(trials):
                    if trial.status == ray.tune.trial.Trial.ERROR:
                        mask[i] = False

                # Mask out on reduce
                reduce_k = self.reduce.get(block_id, None)
                if reduce_k is not None and int(reduce_k) > 0:
                    # Get best
                    best_trials = utils.get_best_trials(trials,
                                                        topk=int(reduce_k))
                    best_trial_ids = set([t.trial_id for t in best_trials])
                    # Mask out
                    for i, trial in enumerate(trials):
                        if trial.trial_id not in best_trial_ids:
                            mask[i] = False

                # Both maps are keyed by the variant hash
                trial_checkpoints = {
                    t_hash: path
                    for t_hash, path in zip(hashes, paths)
                }
                trial_mask = {
                    t_hash: mask_value
                    for t_hash, mask_value in zip(hashes, mask)
                }
                checkpoints[block_id] = {
                    'paths': trial_checkpoints,
                    'mask': trial_mask
                }

                # Rsync workers to main machine and back to all workers
                # TODO specify callbacks. If not remote will not work
                if self.env:
                    run_utils.rsync_hosts(self.env.orchestrator_ip,
                                          self.env.factories_ips,
                                          self.env.user,
                                          self.full_save_path,
                                          self.env.key,
                                          exclude=["state.pkl"])

            self.progress_state.checkpoint_end(block_id, success[block_id])
            logger.debug(f"Done running {block_id}")

        self.teardown()

        if all(success.values()):
            logger.info(cl.GR("Experiment ended successfully"))
        else:
            raise error.UnsuccessfulRunnableError(
                "Not all trials were successful. Check the logs for more information"
            )
Beispiel #6
0
    def load_all_instances(self) -> None:
        """Launch all instances for the experiment.

        This method launches both the orchestrator and the factories.
        Instances returned by ``self._existing_cluster()`` are reused;
        the missing ones are created concurrently in a thread pool.
        Afterwards hosts are named, tags are updated and the
        cloudwatch events are removed and created again.

        Raises
        ------
        ClusterError
            If creating the new instances fails with a botocore
            ``ClientError``. The original error is kept as the cause.

        """
        boto_orchestrator, boto_factories = self._existing_cluster()

        with ThreadPoolExecutor() as executor:
            future_orch, future_factories = None, None

            if boto_orchestrator:
                self.orchestrator = self.get_orchestrator(
                    self._get_boto_public_host(boto_orchestrator),
                    self._get_boto_private_host(boto_orchestrator))
                logger.info(cl.BL(
                    f"Found existing orchestrator ({boto_orchestrator.instance_type}) "
                    f"{self.orchestrator.host}"))
            else:
                future_orch = executor.submit(self._create_orchestrator)

            for f in boto_factories:
                # Resolve the hosts once: they are needed a second
                # time when the instance turns out to have GPUs.
                public_host = self._get_boto_public_host(f)
                private_host = self._get_boto_private_host(f)
                factory = self.get_factory(public_host, private_host)
                if factory.contains_gpu():
                    factory = self.get_gpu_factory(public_host, private_host)
                self.factories.append(factory)

            if len(self.factories) > 0:
                logger.info(
                    cl.BL(f"Found {len(self.factories)} existing factories " +
                          f"({str([f.host for f in self.factories])})."))

            pending_new_factories = self.factories_num - len(self.factories)

            logger.debug(f"Creating {pending_new_factories} factories")
            if pending_new_factories > 0:
                future_factories = executor.submit(
                    self._create_factories, number=pending_new_factories)
            elif pending_new_factories < 0:
                logger.info(
                    cl.BL(
                        f"Reusing existing {len(boto_factories)} factories."))

            # Errors raised inside the worker threads surface when
            # calling .result() on the futures.
            try:
                if future_orch:
                    self.orchestrator = future_orch.result()
                    logger.info(cl.BL(
                        f"New orchestrator created {self.orchestrator.host}"))

                if future_factories:
                    new_factories = future_factories.result()
                    self.factories.extend(new_factories)
                    logger.info(cl.BL(
                        f"{pending_new_factories} factories {self.factories_type} created "
                        f"({str([f.host for f in new_factories])})."))
            except botocore.exceptions.ClientError as e:
                # Chain explicitly so the boto error is reported as the
                # direct cause of the ClusterError.
                raise errors.ClusterError(
                    "Error creating the instances. Check that the provided configuration "
                    + f" is correct. Original error: {e}") from e

        self.name_hosts()
        self.update_tags()
        self.remove_existing_events()
        self.create_cloudwatch_events()
Beispiel #7
0
def main(args: argparse.Namespace) -> None:
    """Execute command based on given config

    Prints the logo, registers the torch and tune base classes as
    components and then preprocesses the config given in
    ``args.config``. If ``args.cluster`` is given, the cluster config
    is preprocessed as well, the cluster is run and the runnable is
    executed remotely. Otherwise the runnable is run locally.

    Parameters
    ----------
    args: argparse.Namespace
        The parsed arguments. The attributes read here are: config,
        cluster, secrets, install_extensions, force and verbose.

    Raises
    ------
    ValueError
        If a cluster was specified but the runnable is not a
        ClusterRunnable.

    """
    if is_dev_mode():
        print(cl.RA(ASCII_LOGO_DEV))
        print(cl.BL(f"Location: {get_flambe_repo_location()}\n"))
    else:
        print(cl.RA(ASCII_LOGO))
        print(cl.BL(f"VERSION: {flambe.__version__}\n"))

    # Pass original module for ray / pickle
    make_component(torch.nn.Module, TORCH_TAG_PREFIX, only_module='torch.nn')
    # torch.optim.Optimizer exists, ignore mypy
    make_component(
        torch.optim.Optimizer,
        TORCH_TAG_PREFIX,  # type: ignore
        only_module='torch.optim')
    make_component(torch.optim.lr_scheduler._LRScheduler,
                   TORCH_TAG_PREFIX,
                   only_module='torch.optim.lr_scheduler')
    make_component(ray.tune.schedulers.TrialScheduler, TUNE_TAG_PREFIX)
    make_component(ray.tune.suggest.SearchAlgorithm, TUNE_TAG_PREFIX)

    # TODO check first if there is cluster as if there is there
    # is no need to install extensions
    check_system_reqs()
    with SafeExecutionContext(args.config) as ex:
        if args.cluster is None:
            runnable, _ = ex.preprocess(secrets=args.secrets,
                                        install_ext=args.install_extensions)
            runnable.run(force=args.force, verbose=args.verbose)
            return

        with SafeExecutionContext(args.cluster) as ex_cluster:
            cluster, _ = ex_cluster.preprocess(
                secrets=args.secrets, install_ext=args.install_extensions)
            runnable, extensions = ex.preprocess(import_ext=False,
                                                 secrets=args.secrets)

            # Fail fast: validate the runnable BEFORE running the
            # cluster, so that no instances are launched for a
            # runnable that can't be executed remotely anyway.
            if not isinstance(runnable, ClusterRunnable):
                raise ValueError(
                    "Only ClusterRunnables can be executed in a cluster.")

            cluster.run(force=args.force)
            cluster = cast(Cluster, cluster)

            # This is independent of the type of ClusterRunnable
            destiny = os.path.join(cluster.get_orch_home_path(),
                                   "extensions")

            # Before sending the extensions, they need to be
            # downloaded (locally).
            t = os.path.join(FLAMBE_GLOBAL_FOLDER, "extensions")
            extensions = download_extensions(extensions, t)

            # At this point, all remote extensions
            # (except pypi extensions)
            # have local paths.
            new_extensions = cluster.send_local_content(extensions,
                                                        destiny,
                                                        all_hosts=True)

            new_secrets = cluster.send_secrets()

            # Installing the extensions is crucial as flambe
            # will execute without '-i' flag and therefore
            # will assume that the extensions are installed
            # in the orchestrator.
            cluster.install_extensions_in_orchestrator(new_extensions)
            logger.info(cl.GR("Extensions installed in Orchestrator"))

            runnable.setup_inject_env(cluster=cluster,
                                      extensions=new_extensions,
                                      force=args.force)
            cluster.execute(runnable, new_extensions, new_secrets,
                            args.force)