def shutdown_compute(self, cloud, args, task_arn):
    """
    Terminate compute, then stop whatever was hosting it.

    After ``self.terminate()``:
      * the ECS task is stopped when ``args.remote_type`` is "aws" and a
        ``task_arn`` was given;
      * for a remote node (``cloud`` truthy and ``self.remote()``), the
        container is stopped with ``docker stop`` if its id is known,
        otherwise a warning is logged;
      * in every other case local Docker is stopped, unless
        ``args.no_docker`` is set.
    """
    print("\n....... Shutdown System")
    self.terminate()

    # The container is expected to exit by itself once compute has been
    # terminated; everything below is a defensive clean up to make sure
    # nothing is left running.

    stop_ecs_task = args.remote_type == "aws" and task_arn is not None
    if stop_ecs_task:
        cloud.ecs_stop_task(task_arn)

    if not (cloud and self.remote()):
        # Not a remote node: only local Docker may need stopping.
        if not args.no_docker:
            utils.docker_stop()
        return

    if not self.container_id:
        # Without the container id there is nothing to pass to `docker stop`.
        logging.warning("Docker did not shut down, could not " +
                        "locate container id")
        return

    utils.remote_run(self.host_node, 'docker stop ' + self.container_id)
def _exec_experiment(self, host_node, experiment_id, experiment_prefix,
                     config_json, param_sweeps=None):
    """
    Run one experiment on ``host_node`` and, when ``self.export`` is set,
    run the upload command afterwards.

    The shell commands themselves are built by ``self._run_command`` and
    ``self._upload_command``; this method only executes them remotely.
    """
    run_cmd = self._run_command(host_node, experiment_id, experiment_prefix,
                                config_json, param_sweeps)
    utils.remote_run(host_node, run_cmd)

    if not self.export:
        return

    upload_cmd = self._upload_command(host_node, experiment_id,
                                      experiment_prefix)
    utils.remote_run(host_node, upload_cmd)
def _create_experiment(self, host_node):
    """
    Creates new MLFlow experiment remotely.

    The experiment is named after the current local time (``yymmdd-HHMM``).
    The command is run inside the container ``self.docker_id`` when
    ``self.use_docker`` is set, otherwise directly on the host after
    sourcing ``host_node.remote_env_path``.

    Returns:
        tuple: ``(experiment_id, experiment_prefix)`` where the id is the
        last space-separated token of the first output line containing
        'Created experiment', converted to ``int``.

    Raises:
        IndexError: if no output line contains 'Created experiment'.
        ValueError: if the last token of that line is not an integer.
    """
    experiment_prefix = datetime.datetime.now().strftime('%y%m%d-%H%M')

    if self.use_docker:
        # `\\$` produces a literal backslash-dollar in the command, so the
        # variable is expanded by the bash running inside the container
        # rather than by the outer shell. Spelling the backslash explicitly
        # avoids Python's invalid-escape-sequence warning.
        command = '''
        docker exec -it {docker_id} bash -c "
          source activate {anaenv}
          export LC_ALL=C.UTF-8
          export LANG=C.UTF-8
          export RUN_DIR=$HOME/agief-remote-run

          pip install -e \\$RUN_DIR/pagi --quiet
          pagi --help
          pip install -e \\$RUN_DIR/rsm --quiet
          pip install -e \\$RUN_DIR/{project} --quiet

          cd \\$RUN_DIR/{project}
          mlflow experiments create {prefix}
        "
        '''.format(anaenv='tensorflow',
                   prefix=experiment_prefix,
                   docker_id=self.docker_id,
                   project=self.project)
    else:
        command = '''
        source {remote_env} {anaenv}
        export RUN_DIR=$HOME/agief-remote-run

        pip install -e $RUN_DIR/pagi --quiet
        pip install -e $RUN_DIR/{project} --quiet

        cd $RUN_DIR/{project}
        mlflow experiments create {prefix}
        '''.format(anaenv='tensorflow',
                   remote_env=host_node.remote_env_path,
                   prefix=experiment_prefix,
                   project=self.project)

    remote_output = utils.remote_run(host_node, command)

    # The experiment id is the final token of the confirmation line.
    created_lines = [
        line for line in remote_output if 'Created experiment' in line
    ]
    tokens = created_lines[0].strip().split(' ')
    experiment_id = int(tokens[-1])

    return experiment_id, experiment_prefix
def remote_docker_launch_compute(self, host_node):
    """
    Start the compute node inside a Docker container on a remote host.

    Sources ``host_node.remote_variables_file`` on the host and runs
    ``run-in-docker.sh -d`` from ``$AGI_HOME/bin/node_coordinator``.
    Assumes there exists a private key for the given ec2 instance, at
    keypath.

    Returns whatever ``utils.remote_run`` returns for the command.
    """
    print("\n....... Launch compute node in a docker container "
          "on a remote host.")

    variables_file = host_node.remote_variables_file
    launch_script = '''
    export VARIABLES_FILE={0}
    source {0}
    cd $AGI_HOME/bin/node_coordinator
    ./run-in-docker.sh -d
    '''.format(variables_file)

    return utils.remote_run(host_node, launch_script)
def _launch_docker(self, host_node):
    """
    Pull ``self.docker_image`` on the remote machine and start a detached
    container from it, bind-mounting the run directory.

    The last line of the remote output, stripped, is stored in
    ``self.docker_id`` and returned (presumably the container id printed
    by ``docker run -d`` - confirm against ``utils.remote_run``).
    """
    assert self.docker_image is not None, 'Docker image not provided.'

    launch_script = '''
    export RUN_DIR=$HOME/agief-remote-run
    docker pull {docker_image}
    docker run -d -t --runtime=nvidia --ipc=host --mount type=bind,source=$RUN_DIR,target=$RUN_DIR \
      {docker_image} bash
    '''.format(docker_image=self.docker_image)

    output_lines = list(utils.remote_run(host_node, launch_script))
    container_id = output_lines[-1].strip()

    self.docker_id = container_id
    return container_id
def run_sweeps(self, compute_node, cloud, args):
    """
    Walk through every experiment in the definition file and run it once
    per parameter set.

    An experiment without a (non-empty) 'parameter-sweeps' entry is run
    exactly once. Otherwise, for each sweep definition, parameter sets are
    generated with ``inc_parameter_set`` and run until it reports a reset.

    Raises:
        Exception: if the experiment definition file does not exist.
    """
    print("\n........ Run Sweeps")

    definition_path = self.experiment_utils.experiment_def_file()
    if not os.path.exists(definition_path):
        raise Exception("Experiment file does not exist at: " +
                        definition_path)

    # Get rid of a log file left over from an earlier run, wherever it lives.
    stale_log = self.experiment_utils.runpath(self.LOG_FILENAME)
    if compute_node.remote():
        utils.remote_run(compute_node.host_node, 'rm ' + stale_log)
    else:
        utils.remove_file(stale_log, True)

    with open(definition_path) as definition_file:
        definitions = json.load(definition_file)

    for experiment in definitions['experiments']:
        import_files = experiment['import-files']  # import files dictionary
        logging.debug("Import Files Dictionary = \n" +
                      json.dumps(import_files, indent=4))

        entity_template = import_files['file-entities']
        data_templates = import_files['file-data']

        # Optional data files that are resolved against the run path.
        local_data_paths = []
        if 'load-local-files' in experiment:
            local_files = experiment['load-local-files']
            if 'file-data' in local_files:
                local_data_paths = [
                    self.experiment_utils.runpath(name)
                    for name in local_files['file-data']
                ]

        # Everything that stays the same for each parameter set.
        run_one = functools.partial(
            self.run_parameterset,
            compute_node=compute_node,
            cloud=cloud,
            args=args,
            compute_data_filepaths=local_data_paths)

        sweep_definitions = experiment.get('parameter-sweeps', [])
        if len(sweep_definitions) == 0:
            print("No parameters to sweep, just run once.")
            entity_path, data_paths = self.create_all_input_files(
                entity_template, data_templates)
            run_one(entity_filepath=entity_path, data_filepaths=data_paths)
            continue

        for sweep_definition in sweep_definitions:
            counters = self.setup_parameter_sweepers(sweep_definition)
            while True:
                # Fresh input files for each step; the entity file is then
                # handed to inc_parameter_set together with the counters.
                entity_path, data_paths = self.create_all_input_files(
                    entity_template, data_templates)
                reset, sweep_param_vals = self.inc_parameter_set(
                    compute_node, args, entity_path, counters)
                if reset:
                    break
                run_one(entity_filepath=entity_path,
                        data_filepaths=data_paths,
                        sweep_param_vals=sweep_param_vals)
def main():
    """
    The main scope of the run-framework containing the high level code.

    Steps, in order:
      1. If ``args.main_class`` is given: launch a compute node, generate
         the input files locally, terminate and return.
      2. Otherwise set up infrastructure (EC2 instances when
         ``args.remote_type`` is "aws").
      3. Optionally sync the experiment and download prepared data.
      4. Optionally launch compute (per-session launch mode only).
      5. Run the sweeps defined in ``args.exps_file``.
      6. Optionally shut down compute and the EC2 instances.

    Any exception raised in steps 3-5 is logged, an attempt is made to stop
    the Docker container, the shutdown step still runs, and the process
    exits with status 1.
    """
    print("------------------------------------------")
    print("---- run-framework ----")
    print("------------------------------------------")

    # Record experiment start time
    exp_start_time = datetime.now()

    args = setup_arg_parsing()

    # setup logging
    log_format = ("[%(filename)s:%(lineno)s - %(funcName)s() " +
                  "- %(levelname)s] %(message)s")
    logging.basicConfig(format=log_format,
                        level=utils.logger_level(args.logging))

    logging.debug("Python Version: " + sys.version)
    logging.debug("Arguments: %s", args)

    exps_file = args.exps_file if args.exps_file else ""
    experiment = Experiment(args.debug_no_run,
                            LaunchMode.from_args(args),
                            exps_file,
                            args.no_compress,
                            args.csv_output)

    # 1) Generate input files
    if args.main_class:
        compute_node = Compute(host_node=HostNode(), port=args.port)
        compute_node.launch(experiment,
                            main_class=args.main_class,
                            no_local_docker=args.no_docker)
        experiment.generate_input_files_locally(compute_node)
        compute_node.terminate()
        return

    # *) all other use cases (non Generate input files)
    cloud = Cloud()

    if args.upload and not (args.export or args.export_compute):
        logging.warning("Uploading experiment to S3 is enabled, but " +
                        "'export experiment' is not, so the most important " +
                        "files (output entity.json and data.json) " +
                        "will be missing")

    if args.remote_type != "local":
        host_node = HostNode(args.host, args.user, args.ssh_keypath,
                             args.remote_variables_file, args.ssh_port)
    else:
        host_node = HostNode(args.host, args.user)

    compute_node = Compute(host_node, args.port)

    check_args(args, compute_node)

    # 2) Setup infrastructure (on AWS or nothing to do locally)
    ips = {'ip_public': args.host, 'ip_private': None}
    ips_pg = {'ip_public': None, 'ip_private': None}
    instance_id = None

    # EC2 instance ids start with 'i-'; anything else is used as an address.
    is_pg_ec2 = args.pg_instance and args.pg_instance[:2] == 'i-'

    if args.remote_type == "aws":
        # start Compute ec2 either from instanceid or amiid
        if args.instanceid:
            ips = cloud.ec2_start_from_instanceid(args.instanceid)
            instance_id = args.instanceid
        else:
            ips, instance_id = cloud.ec2_start_from_ami('run-fwk auto',
                                                        args.amiid,
                                                        int(args.ami_ram))

        # start DB ec2, from instanceid
        if args.pg_instance and is_pg_ec2:
            ips_pg = cloud.ec2_start_from_instanceid(args.pg_instance)
        else:
            ips_pg = {'ip_private': args.pg_instance}
    elif args.pg_instance:
        if is_pg_ec2:
            logging.error("The pg instance is set to an ec2 instance id,"
                          " but you are not running AWS.")
            # sys.exit rather than the `exit` helper, which the site module
            # only installs for interactive use.
            sys.exit(1)

        ips_pg = {
            'ip_public': args.pg_instance,
            'ip_private': args.pg_instance
        }

    # Infrastructure has been started
    # Try to run experiment, and if fails with exception,
    # still shut down infrastructure
    failed = False
    try:
        compute_node.host_node.host = ips['ip_public']
        compute_node.port = args.port

        # TEMPORARY HACK for ECS
        # Set the DB_HOST environment variable. Assigning through os.environ
        # updates both the process environment and the os.environ mapping.
        if args.pg_instance:
            os.environ["DB_HOST"] = ips_pg['ip_private']

        # 3) Sync code and run-home
        if args.sync:
            cloud.sync_experiment(compute_node.host_node)

        # 3.5) Prepare data and sync from S3 if necessary
        # This is typically used to download output files from
        # a previous experiment to be used as input
        if args.prepare_data_from_prefix:
            cloud.remote_download_output(args.prepare_data_from_prefix,
                                         compute_node.host_node)

        # 4) Launch Compute (remote or local)
        # *** IF Mode == 'Per Session' ***
        if ((LaunchMode.from_args(args) is LaunchMode.per_session) and
                args.launch_compute):
            compute_node.launch(experiment,
                                cloud=cloud,
                                main_class=args.main_class,
                                no_local_docker=args.no_docker)

        # 5) Run experiments
        # This includes per experiment 'export results' and 'upload results'
        if args.exps_file:
            experiment.run_sweeps(compute_node, cloud, args)
            experiment.persist_prefix_history(cloud)

    except Exception as err:  # pylint: disable=W0703
        failed = True

        logging.error("Something failed running sweeps generally. If the "
                      "error occurred in a specific parameter set it should "
                      "have been caught there. Attempt to shut down "
                      "infrastructure if running, and exit.")
        logging.error(err)

        print('-'*60)
        traceback.print_exc(file=sys.stdout)
        print('-'*60)

        # Shutdown the Docker container
        print("Attempting to shutdown Docker container...")
        try:
            if host_node.remote() and compute_node.container_id:
                utils.remote_run(host_node,
                                 'docker stop ' + compute_node.container_id)
            elif not host_node.remote() and not args.no_docker:
                utils.docker_stop()
        except Exception:  # pylint: disable=W0703
            # A failed container stop must not prevent the infrastructure
            # shutdown below from running.
            logging.exception("Could not stop the Docker container; "
                              "continuing with shutdown.")

    # 6) Shutdown framework
    if args.shutdown:
        if LaunchMode.from_args(args) is LaunchMode.per_session:
            compute_node.terminate()

        # Shutdown infrastructure
        if args.remote_type == "aws":
            cloud.ec2_stop(instance_id)

            if is_pg_ec2:
                cloud.ec2_stop(args.pg_instance)

    # Record experiment end time
    exp_end_time = datetime.now()

    # Log the experiment runtime in d:h:m:s:ms format
    exp_runtime = utils.format_timedelta(exp_end_time - exp_start_time)
    print("Experiment finished in %d days, %d hr, %d min, %d s" %
          tuple(exp_runtime))

    if failed:
        sys.exit(1)
def run_sweeps(self, config, config_json, args, host_node):
    """
    Run the sweeps for the phase selected by ``args.phase``.

    * 'train': one remote training run per hyperparameter set parsed from
      ``config['parameter-sweeps']``. Each run gets a ``yymmdd-HHMM``
      prefix; all prefixes are written, comma-separated, to
      ``prefixes.txt`` in the current working directory.
    * 'eval': for every prefix in ``args.prefixes`` and every entry of
      ``config['eval-sweeps']``, one remote evaluation run. The i-th
      prefix is paired with the i-th hyperparameter set.
    * 'classify': for every prefix, every entry of
      ``config['classify-sweeps']`` and every model it lists, one remote
      classification run.

    Args:
        config: parsed experiment configuration (dict).
        config_json: not used by this method.
        args: parsed arguments; ``phase`` and ``prefixes`` are read.
        host_node: remote host; ``remote_variables_file`` is read.

    Raises:
        Exception: in the eval/classify phases when ``args.prefixes`` is
            None.
        IndexError: in the eval phase when there are more prefixes than
            hyperparameter sets.
    """
    if args.phase == 'train':
        prefixes = []
        print('........ Training\n')

        hparams_sweeps = self._parse_hparams_sweeps(
            config['parameter-sweeps'])

        for hparams in hparams_sweeps:
            run_prefix = datetime.datetime.now().strftime('%y%m%d-%H%M')
            prefixes.append(run_prefix)
            summary_dir = os.path.join(
                config['experiment-parameters']['summary_dir'], run_prefix)

            # Start experiment
            utils.remote_run(
                host_node,
                self._train_op(host_node.remote_variables_file,
                               config['experiment-parameters'],
                               config['train-parameters'],
                               summary_dir,
                               hparams))

        # Saved so that a later eval/classify run can be given the prefixes.
        with open('prefixes.txt', 'w') as prefix_file:
            prefix_file.write(','.join(prefixes))

    if args.phase == 'eval' or args.phase == 'classify':
        if args.prefixes is None:
            raise Exception('No prefixes provided.')

        prefixes = [x.strip() for x in args.prefixes.split(',')]

        # The hyperparameter sets must be parsed here as well: the train
        # branch above never runs in the same call as the eval phase, so
        # relying on its local would leave the name unbound.
        hparams_sweeps = None
        if args.phase == 'eval':
            hparams_sweeps = list(
                self._parse_hparams_sweeps(config['parameter-sweeps']))

        for i, prefix in enumerate(prefixes):
            summary_dir = os.path.join(
                config['experiment-parameters']['summary_dir'], prefix)

            if args.phase == 'eval':
                # Export experiment for each prefix
                print('........ Evaluating: {0}\n'.format(prefix))

                for eval_sweep in config['eval-sweeps']:
                    utils.remote_run(
                        host_node,
                        self._eval_op(host_node.remote_variables_file,
                                      config['experiment-parameters'],
                                      config['train-parameters'],
                                      summary_dir,
                                      eval_sweep,
                                      hparams_sweeps[i]))

            if args.phase == 'classify':
                # Classification
                print('........ Classifying: {0}\n'.format(prefix))

                for classify_sweep in config['classify-sweeps']:
                    for model in classify_sweep['model']:
                        utils.remote_run(
                            host_node,
                            self._classify_op(
                                host_node.remote_variables_file,
                                summary_dir,
                                classify_sweep['dataset'],
                                model,
                                config['train-parameters']['max_steps'],
                                config['experiment-parameters']['model']))