Example #1
    def shutdown_compute(self, cloud, args, task_arn):
        """
        Close compute: terminate and then if running on AWS ECS, stop the task.
        """

        print("\n....... Shutdown System")

        self.terminate()

        # The container should be set up to terminate once compute has been
        # terminated. However, it doesn't hurt to clean up by making sure
        # the container is killed.

        # if ecs, then stop the task
        if args.remote_type == "aws" and (task_arn is not None):
            cloud.ecs_stop_task(task_arn)

        # if it's remote and docker, then kill container
        if cloud and self.remote():
            # ensure we have the container id
            if self.container_id:
                utils.remote_run(self.host_node,
                                 'docker stop ' + self.container_id)
            else:
                logging.warning("Docker did not shut down, could not " +
                                "locate container id")
        elif not args.no_docker:
            # stops local docker
            utils.docker_stop()
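The `utils` helpers are not shown in these examples. A minimal sketch of what a local `utils.docker_stop()` might look like, assuming it stops every running container via the Docker CLI (the real helper may differ):

import subprocess

def docker_stop():
    """Stop all running Docker containers on the local machine."""
    # `docker ps -q` lists the ids of running containers, one per line.
    result = subprocess.run(['docker', 'ps', '-q'],
                            capture_output=True, text=True, check=True)
    for container_id in result.stdout.split():
        subprocess.run(['docker', 'stop', container_id], check=True)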
Example #2
  def _exec_experiment(self, host_node, experiment_id, experiment_prefix,
                       config_json, param_sweeps=None):
    utils.remote_run(
        host_node,
        self._run_command(host_node, experiment_id, experiment_prefix,
                          config_json, param_sweeps))

    if self.export:
      utils.remote_run(
          host_node,
          self._upload_command(host_node, experiment_id, experiment_prefix))
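`utils.remote_run` appears throughout these examples but is not shown. A minimal SSH-based sketch, assuming `host_node` carries `user`, `host`, `ssh_port` and `keypath` attributes (names inferred from the HostNode constructor in Example #7 and hypothetical here):

import subprocess

def remote_run(host_node, command):
    """Run a shell command on the remote host over SSH; return output lines."""
    ssh_command = ['ssh', '-i', host_node.keypath,
                   '-p', str(host_node.ssh_port),
                   '{0}@{1}'.format(host_node.user, host_node.host), command]
    result = subprocess.run(ssh_command, capture_output=True, text=True,
                            check=True)
    return result.stdout.splitlines()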
Example #3
    def _create_experiment(self, host_node):
        """Creates new MLFlow experiment remotely."""
        experiment_prefix = datetime.datetime.now().strftime('%y%m%d-%H%M')

        if self.use_docker:
            # pylint: disable=anomalous-backslash-in-string
            command = '''
        docker exec -it {docker_id} bash -c "
          source activate {anaenv}

          export LC_ALL=C.UTF-8
          export LANG=C.UTF-8
          export RUN_DIR=$HOME/agief-remote-run

          pip install -e \$RUN_DIR/pagi --quiet
          pagi --help
          pip install -e \$RUN_DIR/rsm --quiet
          pip install -e \$RUN_DIR/{project} --quiet

          cd \$RUN_DIR/{project}
          mlflow experiments create {prefix}
        "
      '''.format(anaenv='tensorflow',
                 prefix=experiment_prefix,
                 docker_id=self.docker_id,
                 project=self.project)
        else:
            command = '''
        source {remote_env} {anaenv}

        export RUN_DIR=$HOME/agief-remote-run

        pip install -e $RUN_DIR/pagi --quiet
        pip install -e $RUN_DIR/{project} --quiet

        cd $RUN_DIR/{project}
        mlflow experiments create {prefix}
      '''.format(anaenv='tensorflow',
                 remote_env=host_node.remote_env_path,
                 prefix=experiment_prefix,
                 project=self.project)

        remote_output = utils.remote_run(host_node, command)
        command_output = [
            s for s in remote_output if 'Created experiment' in s
        ]
        command_output = command_output[0].strip().split(' ')
        experiment_id = int(command_output[-1])

        return experiment_id, experiment_prefix
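The experiment id is recovered by scraping mlflow's console output. The parsing step in isolation, with a hypothetical output line (the exact wording varies across MLflow versions, so treat the matched string as an assumption):

def parse_experiment_id(output_lines):
    """Extract the integer id from a 'Created experiment ...' line."""
    matches = [s for s in output_lines if 'Created experiment' in s]
    if not matches:
        raise RuntimeError('mlflow did not report a created experiment')
    # The id is the last whitespace-separated token on the line.
    return int(matches[0].strip().split(' ')[-1])

print(parse_experiment_id(['Created experiment with id 3']))  # -> 3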
Example #4
    def remote_docker_launch_compute(self, host_node):
        """
        Assumes there exists a private key for the given
        ec2 instance, at keypath
        """

        print("\n....... Launch compute node in a docker container "
              "on a remote host.")

        commands = '''
            export VARIABLES_FILE={0}
            source {0}
            cd $AGI_HOME/bin/node_coordinator
            ./run-in-docker.sh -d
        '''.format(host_node.remote_variables_file)

        return utils.remote_run(host_node, commands)
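The multi-line commands in these examples embed source indentation that is sent to the remote shell as-is. A sketch of the same command built with textwrap.dedent, which keeps the source readable without shipping the leading whitespace (behaviour is equivalent, since the shell ignores it either way):

import textwrap

def build_launch_command(remote_variables_file):
    """Build the shell snippet that launches compute inside Docker."""
    return textwrap.dedent('''
        export VARIABLES_FILE={0}
        source {0}
        cd $AGI_HOME/bin/node_coordinator
        ./run-in-docker.sh -d
    ''').format(remote_variables_file)

print(build_launch_command('/home/ec2-user/variables.sh'))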
Example #5
    def _launch_docker(self, host_node):
        """Launch the Docker container on the remote machine."""
        assert self.docker_image is not None, 'Docker image not provided.'

        command = '''
      export RUN_DIR=$HOME/agief-remote-run

      docker pull {docker_image}
      docker run -d -t --runtime=nvidia --ipc=host --mount type=bind,source=$RUN_DIR,target=$RUN_DIR \
        {docker_image} bash
    '''.format(docker_image=self.docker_image)

        remote_output = utils.remote_run(host_node, command)
        # `docker run -d` prints the new container's id on the last line.
        command_output = list(remote_output)[-1].strip()
        self.docker_id = command_output

        return command_output
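`docker run -d` prints the new container's id on its last output line, which is what `_launch_docker` captures above. A local sketch of the same capture without the remote hop (the GPU runtime and bind mount are omitted here):

import subprocess

def launch_container(docker_image):
    """Start a detached container and return its id."""
    result = subprocess.run(['docker', 'run', '-d', '-t', docker_image, 'bash'],
                            capture_output=True, text=True, check=True)
    # The id is the only thing `docker run -d` writes to stdout.
    return result.stdout.strip()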
Example #6
    def run_sweeps(self, compute_node, cloud, args):
        """
        Perform parameter sweep steps, and run experiment for each step.
        """

        print("\n........ Run Sweeps")

        exps_filename = self.experiment_utils.experiment_def_file()

        if not os.path.exists(exps_filename):
            msg = "Experiment file does not exist at: " + exps_filename
            raise Exception(msg)

        # Silently remove the older log file if it exists
        log_filepath = self.experiment_utils.runpath(self.LOG_FILENAME)
        if compute_node.remote():
            utils.remote_run(compute_node.host_node, 'rm ' + log_filepath)
        else:
            utils.remove_file(log_filepath, True)

        with open(exps_filename) as exps_file:
            filedata = json.load(exps_file)

        for exp_i in filedata['experiments']:
            import_files = exp_i['import-files']  # import files dictionary

            logging.debug("Import Files Dictionary = \n" +
                          json.dumps(import_files, indent=4))

            base_entity_filename = import_files['file-entities']
            base_data_filenames = import_files['file-data']

            exp_ll_data_filepaths = []
            if 'load-local-files' in exp_i:
                load_local_files = exp_i['load-local-files']
                if 'file-data' in load_local_files:
                    exp_ll_data_filepaths = list(
                        map(self.experiment_utils.runpath,
                            load_local_files['file-data'])
                    )

            run_parameterset_partial = (
                functools.partial(
                    self.run_parameterset,
                    compute_node=compute_node,
                    cloud=cloud,
                    args=args,
                    compute_data_filepaths=exp_ll_data_filepaths)
            )

            if 'parameter-sweeps' not in exp_i or (
                    len(exp_i['parameter-sweeps']) == 0):
                print("No parameters to sweep, just run once.")
                exp_entity_filepath, exp_data_filepaths = (
                    self.create_all_input_files(base_entity_filename,
                                                base_data_filenames)
                )
                run_parameterset_partial(entity_filepath=exp_entity_filepath,
                                         data_filepaths=exp_data_filepaths)
            else:
                # array of sweep definitions
                for param_sweep in exp_i['parameter-sweeps']:
                    counters = self.setup_parameter_sweepers(param_sweep)
                    while True:
                        exp_entity_filepath, exp_data_filepaths = (
                            self.create_all_input_files(
                                base_entity_filename,
                                base_data_filenames)
                        )
                        reset, sweep_param_vals = self.inc_parameter_set(
                            compute_node, args,
                            exp_entity_filepath,
                            counters
                        )
                        if reset:
                            break
                        run_parameterset_partial(
                            entity_filepath=exp_entity_filepath,
                            data_filepaths=exp_data_filepaths,
                            sweep_param_vals=sweep_param_vals
                        )
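`setup_parameter_sweepers` and `inc_parameter_set` are not shown; they step counters through every combination of sweep values until a reset signals exhaustion. A sketch of the same exhaustive stepping as a Cartesian product (parameter names and ranges are made up for illustration):

import itertools

def iterate_sweep(sweep):
    """Yield one dict of parameter values per sweep step."""
    names = sorted(sweep)
    for values in itertools.product(*(sweep[name] for name in names)):
        yield dict(zip(names, values))

for params in iterate_sweep({'learning-rate': [0.1, 0.01],
                             'batch-size': [32, 64]}):
    print(params)  # four combinations in total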
Example #7
def main():
    """
    The main scope of the run-framework containing the high level code
    """

    print("------------------------------------------")
    print("----          run-framework           ----")
    print("------------------------------------------")

    # Record experiment start time
    exp_start_time = datetime.now()

    args = setup_arg_parsing()

    # setup logging
    log_format = ("[%(filename)s:%(lineno)s - %(funcName)s() " +
                  "- %(levelname)s] %(message)s")
    logging.basicConfig(format=log_format,
                        level=utils.logger_level(args.logging))

    logging.debug("Python Version: " + sys.version)
    logging.debug("Arguments: %s", args)

    exps_file = args.exps_file if args.exps_file else ""
    experiment = Experiment(args.debug_no_run, LaunchMode.from_args(args),
                            exps_file, args.no_compress, args.csv_output)

    # 1) Generate input files
    if args.main_class:
        compute_node = Compute(host_node=HostNode(), port=args.port)
        compute_node.launch(experiment, main_class=args.main_class,
                            no_local_docker=args.no_docker)
        experiment.generate_input_files_locally(compute_node)
        compute_node.terminate()
        return

    # *) All other use cases (i.e. not generating input files)

    cloud = Cloud()

    if args.upload and not (args.export or args.export_compute):
        logging.warning("Uploading experiment to S3 is enabled, but " +
                        "'export experiment' is not, so the most important " +
                        "files (output entity.json and data.json) " +
                        "will be missing")

    if args.remote_type != "local":
        host_node = HostNode(args.host, args.user, args.ssh_keypath,
                             args.remote_variables_file, args.ssh_port)
    else:
        host_node = HostNode(args.host, args.user)

    compute_node = Compute(host_node, args.port)

    check_args(args, compute_node)

    # 2) Set up infrastructure (on AWS; nothing to do locally)
    ips = {'ip_public': args.host, 'ip_private': None}
    ips_pg = {'ip_public': None, 'ip_private': None}
    instance_id = None

    is_pg_ec2 = args.pg_instance and args.pg_instance[:2] == 'i-'
    if args.remote_type == "aws":
        # start Compute ec2 either from instanceid or amiid
        if args.instanceid:
            ips = cloud.ec2_start_from_instanceid(args.instanceid)
            instance_id = args.instanceid
        else:
            ips, instance_id = cloud.ec2_start_from_ami('run-fwk auto',
                                                        args.amiid,
                                                        int(args.ami_ram))

        # start DB ec2, from instanceid
        if args.pg_instance and is_pg_ec2:
            ips_pg = cloud.ec2_start_from_instanceid(args.pg_instance)
        else:
            ips_pg = {'ip_private': args.pg_instance}

    elif args.pg_instance:
        if is_pg_ec2:
            logging.error("The pg instance is set to an ec2 instance id,"
                          " but you are not running AWS.")
            sys.exit(1)

        ips_pg = {
            'ip_public': args.pg_instance,
            'ip_private': args.pg_instance
        }

    # Infrastructure has been started.
    # Try to run the experiment; if it fails with an exception,
    # still shut down the infrastructure.
    failed = False
    try:
        compute_node.host_node.host = ips['ip_public']
        compute_node.port = args.port

        # TEMPORARY HACK for ECS
        # Set the DB_HOST environment variable
        if args.pg_instance:
            os.putenv("DB_HOST", ips_pg['ip_private'])

        # 3) Sync code and run-home
        if args.sync:
            cloud.sync_experiment(compute_node.host_node)

        # 3.5) Prepare data and sync from S3 if necessary
        # This is typically used to download output files from
        # a previous experiment to be used as input
        if args.prepare_data_from_prefix:
            cloud.remote_download_output(args.prepare_data_from_prefix,
                                         compute_node.host_node)

        # 4) Launch Compute (remote or local)
        # *** IF Mode == 'Per Session' ***
        if ((LaunchMode.from_args(args) is LaunchMode.per_session) and
                args.launch_compute):
            compute_node.launch(experiment, cloud=cloud,
                                main_class=args.main_class,
                                no_local_docker=args.no_docker)

        # 5) Run experiments
        # This includes per experiment 'export results' and 'upload results'
        if args.exps_file:
            experiment.run_sweeps(compute_node, cloud, args)
            experiment.persist_prefix_history(cloud)

    except Exception as err:  # pylint: disable=W0703
        failed = True

        logging.error("Something failed running sweeps generally. If the "
                      "error occurred in a specific parameter set it should "
                      "have been caught there. Attempt to shut down "
                      "infrastructure if running, and exit.")
        logging.error(err)

        print('-'*60)
        traceback.print_exc(file=sys.stdout)
        print('-'*60)

        # Shutdown the Docker container
        print("Attempting to shutdown Docker container...")
        if host_node.remote() and compute_node.container_id:
            utils.remote_run(host_node,
                             'docker stop ' + compute_node.container_id)
        elif not host_node.remote() and not args.no_docker:
            utils.docker_stop()

    # 6) Shutdown framework
    if args.shutdown:
        if LaunchMode.from_args(args) is LaunchMode.per_session:
            compute_node.terminate()

        # Shutdown infrastructure
        if args.remote_type == "aws":
            cloud.ec2_stop(instance_id)

            if is_pg_ec2:
                cloud.ec2_stop(args.pg_instance)

    # Record experiment end time
    exp_end_time = datetime.now()

    # Log the experiment runtime in days:hours:minutes:seconds format
    exp_runtime = utils.format_timedelta(exp_end_time - exp_start_time)
    print("Experiment finished in %d days, %d hr, %d min, %d s" %
          tuple(exp_runtime))

    if failed:
        sys.exit(1)
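`utils.format_timedelta` is not shown. A minimal sketch that matches the four-field tuple consumed by the final print above (the field breakdown is an assumption based on that format string):

from datetime import timedelta

def format_timedelta(delta):
    """Return [days, hours, minutes, seconds] for a timedelta."""
    hours, remainder = divmod(delta.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return [delta.days, hours, minutes, seconds]

print("Experiment finished in %d days, %d hr, %d min, %d s" %
      tuple(format_timedelta(timedelta(hours=26, minutes=5, seconds=30))))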
Example #8
    def run_sweeps(self, config, config_json, args, host_node):
        """Run the sweeps"""
        if args.phase == 'train':
            prefixes = []
            print('........ Training\n')
            hparams_sweeps = self._parse_hparams_sweeps(
                config['parameter-sweeps'])

            for i, hparams in enumerate(hparams_sweeps):
                run_prefix = datetime.datetime.now().strftime('%y%m%d-%H%M')
                prefixes.append(run_prefix)

                summary_dir = os.path.join(
                    config['experiment-parameters']['summary_dir'], run_prefix)

                # Start experiment
                utils.remote_run(
                    host_node,
                    self._train_op(host_node.remote_variables_file,
                                   config['experiment-parameters'],
                                   config['train-parameters'], summary_dir,
                                   hparams))

            with open('prefixes.txt', 'w') as prefix_file:
                prefix_file.write(','.join(prefixes))

        if args.phase in ('eval', 'classify'):
            if args.prefixes is None:
                raise Exception('No prefixes provided.')

            prefixes = [x.strip() for x in args.prefixes.split(',')]

            for i, prefix in enumerate(prefixes):
                summary_dir = os.path.join(
                    config['experiment-parameters']['summary_dir'], prefix)

                if args.phase == 'eval':
                    # Export experiment for each prefix
                    print('........ Evaluating: {0}\n'.format(prefix))
                    for eval_sweep in config['eval-sweeps']:
                        utils.remote_run(
                            host_node,
                            self._eval_op(host_node.remote_variables_file,
                                          config['experiment-parameters'],
                                          config['train-parameters'],
                                          summary_dir, eval_sweep,
                                          hparams_sweeps[i]))

                if args.phase == 'classify':
                    # Classification
                    print('........ Classifying: {0}\n'.format(prefix))
                    for classify_sweep in config['classify-sweeps']:
                        for model in classify_sweep['model']:
                            utils.remote_run(
                                host_node,
                                self._classify_op(
                                    host_node.remote_variables_file,
                                    summary_dir, classify_sweep['dataset'],
                                    model,
                                    config['train-parameters']['max_steps'],
                                    config['experiment-parameters']['model']))
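The training phase records each run prefix in prefixes.txt, and the eval and classify phases expect the same comma-separated list back via `args.prefixes`. A minimal round-trip sketch:

# Write the prefixes out the way the training phase does...
with open('prefixes.txt', 'w') as prefix_file:
    prefix_file.write(','.join(['200101-1200', '200101-1230']))

# ...then read them back the way the eval/classify phases parse args.prefixes.
with open('prefixes.txt') as prefix_file:
    prefixes = [x.strip() for x in prefix_file.read().split(',')]
print(prefixes)  # ['200101-1200', '200101-1230']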