def _run_mnist_training_job(sagemaker_session, instance_type, instance_count, chainer_version, py_version):
    # Single-instance jobs run mnist.py; multi-instance jobs run the distributed
    # script. (The original compared instance_type, a string, to 1 — a bug.)
    script_path = (
        os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
        if instance_count == 1
        else os.path.join(DATA_DIR, "chainer_mnist", "distributed_mnist.py")
    )
    data_path = os.path.join(DATA_DIR, "chainer_mnist")

    chainer = Chainer(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=chainer_version,
        py_version=py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        hyperparameters={"epochs": 1},
        # test output_path without trailing slash
        output_path="s3://{}".format(sagemaker_session.default_bucket()),
    )

    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    job_name = unique_name_from_base("test-chainer-training")
    chainer.fit({"train": train_input, "test": test_input}, job_name=job_name)
    return chainer
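# A minimal sketch of how the _run_mnist_training_job helper above might be
# invoked from a test. The instance type, instance count, and pytest fixture
# names here are assumptions for illustration, not taken from the original suite.
def test_distributed_mnist_training(sagemaker_session, chainer_version, py_version):
    _run_mnist_training_job(
        sagemaker_session,
        instance_type="ml.c4.xlarge",  # assumed CPU instance type
        instance_count=2,  # any count > 1 selects distributed_mnist.py
        chainer_version=chainer_version,
        py_version=py_version,
    )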
def test_training_with_additional_hyperparameters(sagemaker_session):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path, role='SageMakerRole',
                          train_instance_count=1, train_instance_type="ml.c4.xlarge",
                          sagemaker_session=sagemaker_session, hyperparameters={'epochs': 1},
                          use_mpi=True, num_processes=2, process_slots_per_host=2,
                          additional_mpi_options="-x NCCL_DEBUG=INFO")

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        chainer.fit({'train': train_input, 'test': test_input})
        return chainer.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_full_version, wait=True):
    with timeout(minutes=15):
        # instance_count, not instance_type, decides which script runs
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py') if instance_count == 1 else \
            os.path.join(DATA_DIR, 'chainer_mnist', 'distributed_mnist.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path, role='SageMakerRole',
                          framework_version=chainer_full_version,
                          train_instance_count=instance_count, train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session, hyperparameters={'epochs': 1})

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        chainer.fit({'train': train_input, 'test': test_input}, wait=wait)
        return chainer.latest_training_job.name
def test_training_with_additional_hyperparameters(sagemaker_local_session, chainer_latest_version,
                                                  chainer_latest_py_version):
    script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
    data_path = os.path.join(DATA_DIR, "chainer_mnist")

    chainer = Chainer(
        entry_point=script_path,
        role="SageMakerRole",
        instance_count=1,
        instance_type="local",
        framework_version=chainer_latest_version,
        py_version=chainer_latest_py_version,
        sagemaker_session=sagemaker_local_session,
        hyperparameters={"epochs": 1},
        use_mpi=True,
        num_processes=2,
        process_slots_per_host=2,
        additional_mpi_options="-x NCCL_DEBUG=INFO",
    )

    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    chainer.fit({"train": train_input, "test": test_input})
def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path, role='SageMakerRole',
                          train_instance_count=1, train_instance_type="ml.c4.xlarge",
                          framework_version=chainer_full_version, py_version=PYTHON_VERSION,
                          sagemaker_session=sagemaker_session, hyperparameters={'epochs': 1},
                          use_mpi=True, num_processes=2, process_slots_per_host=2,
                          additional_mpi_options="-x NCCL_DEBUG=INFO")

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        job_name = unique_name_from_base('test-chainer-training')
        chainer.fit({'train': train_input, 'test': test_input}, job_name=job_name)
        return chainer.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_full_version, wait=True):
    # instance_count, not instance_type, decides which script runs
    script_path = (
        os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
        if instance_count == 1
        else os.path.join(DATA_DIR, "chainer_mnist", "distributed_mnist.py")
    )
    data_path = os.path.join(DATA_DIR, "chainer_mnist")

    chainer = Chainer(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=chainer_full_version,
        py_version=PYTHON_VERSION,
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        hyperparameters={"epochs": 1},
    )

    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    job_name = unique_name_from_base("test-chainer-training")
    chainer.fit({"train": train_input, "test": test_input}, wait=wait, job_name=job_name)
    return chainer
def main(parser=argparse.ArgumentParser()):
    import logging
    logging.basicConfig(level=logging.WARN)

    parser.add_argument('--profile', type=str)
    parser.add_argument('--local-mode', action='store_true')
    parser.add_argument('--instance-type', type=str, default=DEFAULT_INSTANCE_TYPE)
    parser.add_argument('--region', type=str, default=DEFAULT_REGION)
    parser.add_argument('--arn', type=str, default=None)
    parser.add_argument('--max-runtime', type=int, default=DEFAULT_RUNTIME, help='seconds')
    args = parser.parse_args()

    boto_session = Session(profile_name=args.profile, region_name=args.region)
    sagemaker_session = sagemaker.Session(boto_session=boto_session)
    role = args.arn if args.arn is not None else sagemaker.get_execution_role(sagemaker_session)

    hyperparameters = {
        'gpu': 0 if args.instance_type.startswith('ml.p') else -1,
        'mjcf': 'env/ant_simple.xml',
        'action-dim': 8,
        'obs-dim': 28,
        'skip-step': 10,
        'algorithm': 'TRPO',
        'foot-list': 'right_back_foot left_back_foot front_right_foot front_left_foot'
    }

    chainer_estimator = Chainer(
        entry_point='train.py',
        source_dir='../',
        role=role,
        image_name=IMAGE,
        framework_version='5.0.0',
        sagemaker_session=LocalSession(boto_session) if args.local_mode else sagemaker_session,
        train_instance_count=1,
        train_instance_type='local' if args.local_mode else args.instance_type,
        hyperparameters=hyperparameters,
        base_job_name='roboschool-TRPO-skipstep10-continue-1',
        train_max_run=args.max_runtime)

    chainer_estimator.fit(wait=args.local_mode)
def test_failed_training_job(sagemaker_session, chainer_full_version):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path, role='SageMakerRole',
                          framework_version=chainer_full_version,
                          train_instance_count=1, train_instance_type='ml.c4.xlarge',
                          sagemaker_session=sagemaker_session)

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')

        with pytest.raises(ValueError):
            chainer.fit(train_input)
def test_failed_training_job(sagemaker_session, chainer_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path, role='SageMakerRole',
                          framework_version=chainer_full_version,
                          train_instance_count=1, train_instance_type='ml.c4.xlarge',
                          sagemaker_session=sagemaker_session)

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')

        with pytest.raises(ValueError):
            chainer.fit(train_input)
def test_attach_deploy(chainer_training_job, sagemaker_session):
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Chainer.attach(chainer_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
def test_async_fit(sagemaker_session):
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        training_job_name = _run_mnist_training_job(
            sagemaker_session, "ml.c4.xlarge", 1,
            chainer_full_version=CHAINER_VERSION, wait=False)

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
        print("Re-attaching now to: %s" % training_job_name)
        estimator = Chainer.attach(training_job_name=training_job_name,
                                   sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
def test_failed_training_job(sagemaker_session, chainer_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')

        chainer = Chainer(entry_point=script_path, role='SageMakerRole',
                          framework_version=chainer_full_version, py_version=PYTHON_VERSION,
                          train_instance_count=1, train_instance_type='ml.c4.xlarge',
                          sagemaker_session=sagemaker_session)

        with pytest.raises(ValueError) as e:
            chainer.fit()
        assert 'This failure is expected' in str(e.value)
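# The failed-training tests above point the estimator at a failure_script.py
# entry point that is not shown here. A minimal sketch of such a script,
# assuming its only job is to raise with a message the tests can assert on
# (the actual script in DATA_DIR may differ):
if __name__ == "__main__":
    # The exception message propagates into the training job's failure reason,
    # which the SDK surfaces in the ValueError that chainer.fit() raises.
    raise Exception("This failure is expected.")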
def test_attach_deploy(chainer_training_job, sagemaker_session):
    endpoint_name = unique_name_from_base('test-chainer-attach-deploy')

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Chainer.attach(chainer_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
def test_attach_deploy(sagemaker_session, chainer_latest_version, chainer_latest_py_version,
                       cpu_instance_type):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
        data_path = os.path.join(DATA_DIR, "chainer_mnist")

        chainer = Chainer(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=chainer_latest_version,
            py_version=chainer_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        train_input = sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/chainer_mnist/train")
        test_input = sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/chainer_mnist/test")

        job_name = unique_name_from_base("test-chainer-training")
        chainer.fit({"train": train_input, "test": test_input}, wait=False, job_name=job_name)

    endpoint_name = unique_name_from_base("test-chainer-attach-deploy")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Chainer.attach(chainer.latest_training_job.name,
                                   sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, cpu_instance_type, endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
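# The deploy tests above all end in a shared _predict_and_assert helper that is
# not shown in this section. A minimal sketch, assuming MNIST-shaped float32
# input and a batch size of 50 (both assumptions):
import numpy


def _predict_and_assert(predictor):
    batch_size = 50
    # flattened 28x28 grayscale images, as the mnist.py serving code would expect
    data = numpy.zeros((batch_size, 784), dtype="float32")
    output = predictor.predict(data)
    assert len(output) == batch_size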
def main(parser=argparse.ArgumentParser()):
    import logging
    logging.basicConfig(level=logging.WARN)

    parser.add_argument('--profile', type=str, default='default')
    parser.add_argument('--local-mode', action='store_true')
    parser.add_argument('--instance-type', type=str, default=DEFAULT_INSTANCE_TYPE)
    parser.add_argument('--region', type=str, default=DEFAULT_REGION)
    parser.add_argument('--arn', type=str, default=None)
    parser.add_argument('--max-runtime', type=int, default=DEFAULT_RUNTIME, help='seconds')
    args = parser.parse_args()

    boto_session = Session(profile_name=args.profile, region_name=args.region)
    sagemaker_session = LocalSession(boto_session) if args.local_mode \
        else sagemaker.Session(boto_session)
    role = args.arn if args.arn is not None else sagemaker.get_execution_role(sagemaker_session)

    gpu = 0 if args.instance_type.startswith('ml.p') and not args.local_mode else -1
    hyperparameters = {
        'gpu': gpu,
    }

    chainer_estimator = Chainer(
        entry_point='train.py',
        source_dir='./',
        role=role,
        image_name=IMAGE,
        framework_version='5.0.0',
        sagemaker_session=sagemaker_session,
        train_instance_count=1,
        train_instance_type='local' if args.local_mode else args.instance_type,
        hyperparameters=hyperparameters,
        base_job_name='chainer-sagemaker-sample',
        train_max_run=args.max_runtime)

    chainer_estimator.fit(wait=args.local_mode)
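# Both main() scripts above rely on module-level constants that are not shown.
# Plausible placeholder definitions (assumptions for illustration, not the
# original values):
DEFAULT_INSTANCE_TYPE = 'ml.c4.xlarge'
DEFAULT_REGION = 'us-west-2'
DEFAULT_RUNTIME = 12 * 60 * 60  # train_max_run, in seconds
IMAGE = '<account-id>.dkr.ecr.us-west-2.amazonaws.com/custom-chainer:5.0.0'  # hypothetical custom image URI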
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_full_version, wait=True):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # instance_count, not instance_type, decides which script runs
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py') if instance_count == 1 else \
            os.path.join(DATA_DIR, 'chainer_mnist', 'distributed_mnist.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path, role='SageMakerRole',
                          framework_version=chainer_full_version,
                          train_instance_count=instance_count, train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session, hyperparameters={'epochs': 1})

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        chainer.fit({'train': train_input, 'test': test_input}, wait=wait)
        return chainer.latest_training_job.name
def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py') data_path = os.path.join(DATA_DIR, 'chainer_mnist') chainer = Chainer(entry_point=script_path, role='SageMakerRole', train_instance_count=1, train_instance_type="ml.c4.xlarge", framework_version=chainer_full_version, sagemaker_session=sagemaker_session, hyperparameters={'epochs': 1}, use_mpi=True, num_processes=2, process_slots_per_host=2, additional_mpi_options="-x NCCL_DEBUG=INFO") train_input = chainer.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'), key_prefix='integ-test-data/chainer_mnist/train') test_input = chainer.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'), key_prefix='integ-test-data/chainer_mnist/test') chainer.fit({'train': train_input, 'test': test_input}) return chainer.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count, chainer_full_version, wait=True): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): script_path = (os.path.join(DATA_DIR, "chainer_mnist", "mnist.py") if instance_type == 1 else os.path.join( DATA_DIR, "chainer_mnist", "distributed_mnist.py")) data_path = os.path.join(DATA_DIR, "chainer_mnist") chainer = Chainer( entry_point=script_path, role="SageMakerRole", framework_version=chainer_full_version, py_version=PYTHON_VERSION, train_instance_count=instance_count, train_instance_type=instance_type, sagemaker_session=sagemaker_session, hyperparameters={"epochs": 1}, ) train_input = chainer.sagemaker_session.upload_data( path=os.path.join(data_path, "train"), key_prefix="integ-test-data/chainer_mnist/train") test_input = chainer.sagemaker_session.upload_data( path=os.path.join(data_path, "test"), key_prefix="integ-test-data/chainer_mnist/test") job_name = unique_name_from_base("test-chainer-training") chainer.fit({ "train": train_input, "test": test_input }, wait=wait, job_name=job_name) return chainer.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count, chainer_full_version, wait=True): with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py') if instance_type == 1 else \ os.path.join(DATA_DIR, 'chainer_mnist', 'distributed_mnist.py') data_path = os.path.join(DATA_DIR, 'chainer_mnist') chainer = Chainer(entry_point=script_path, role='SageMakerRole', framework_version=chainer_full_version, py_version=PYTHON_VERSION, train_instance_count=instance_count, train_instance_type=instance_type, sagemaker_session=sagemaker_session, hyperparameters={'epochs': 1}) train_input = chainer.sagemaker_session.upload_data( path=os.path.join(data_path, 'train'), key_prefix='integ-test-data/chainer_mnist/train') test_input = chainer.sagemaker_session.upload_data( path=os.path.join(data_path, 'test'), key_prefix='integ-test-data/chainer_mnist/test') job_name = unique_name_from_base('test-chainer-training') chainer.fit({ 'train': train_input, 'test': test_input }, wait=wait, job_name=job_name) return chainer.latest_training_job.name
def test_async_fit(sagemaker_session):
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        training_job_name = _run_mnist_training_job(sagemaker_session, "ml.c4.xlarge", 1,
                                                    chainer_full_version=CHAINER_VERSION,
                                                    wait=False)

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        estimator = Chainer.attach(training_job_name=training_job_name,
                                   sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(predictor)
def exec_training(session, client, job_name, setting, pytorch, max_parallel_jobs):
    sagemaker_session = sagemaker.Session(boto_session=session, sagemaker_client=client)

    # safe_load: plain yaml.load without a Loader is unsafe and deprecated
    conf = yaml.safe_load(open(setting))

    # input data
    inputs = conf['inputs']
    if 'upload_data' in conf and isinstance(conf['upload_data'], list):
        for d in conf['upload_data']:
            s3_dir = sagemaker_session.upload_data(
                path=d['path'],
                key_prefix=os.path.join(job_name, d['key_prefix']))
            inputs[d['name']] = s3_dir

    estimator_args = conf['estimator']
    estimator_args['sagemaker_session'] = sagemaker_session

    # split hyperparameters into fixed values and tuning targets (given as dicts)
    hyperparameters = estimator_args.pop('hyperparameters')
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args['hyperparameters'] = fixed

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if len(targets) == 0:
        estimator.fit(inputs, job_name=job_name)
    else:
        if 'tuner' in conf:
            tuner_args = conf['tuner']
            hyperparameter_ranges = {}
            for k, v in targets.items():
                hyperparameter_ranges[k] = hp_type[v['type'].lower()](v['range'])
        else:
            # use default values
            tuner_args = {
                'objective_metric_name': 'metric_name',
                'metric_definitions': [{'Name': 'metric_name', 'Regex': 'ignore'}],
                'strategy': 'Random',
                'objective_type': 'Maximize',
                'early_stopping_type': 'Off'
            }
            max_jobs = 1
            hyperparameter_ranges = {}
            for k, v in targets.items():
                if v['type'].lower() != 'categorical':
                    raise ValueError('the default tuner only supports Categorical params.')
                max_jobs *= len(v['range'])
                hyperparameter_ranges[k] = hp_type[v['type'].lower()](v['range'])
            tuner_args['max_jobs'] = max_jobs

        tuner_args['estimator'] = estimator
        tuner_args['hyperparameter_ranges'] = hyperparameter_ranges
        tuner_args['max_parallel_jobs'] = max_parallel_jobs
        tuner_args['base_tuning_job_name'] = job_name
        tuner_args['warm_start_config'] = None  # not supported yet.

        tuner = HyperparameterTuner(**tuner_args)
        tuner.fit(inputs, job_name=job_name)
entry_point="entry_point.py", ), RLEstimator( entry_point="cartpole.py", toolkit=RLToolkit.RAY, framework=RLFramework.TENSORFLOW, toolkit_version="0.8.5", role=sagemaker.get_execution_role(), instance_type=INSTANCE_TYPE, instance_count=1, ), Chainer( role=sagemaker.get_execution_role(), entry_point="entry_point.py", use_mpi=True, num_processes=4, framework_version="5.0.0", instance_type=INSTANCE_TYPE, instance_count=1, py_version="py3", ), ], ) def test_training_step_with_framework_estimator(estimator, pipeline_session, training_input, hyperparameters): estimator.source_dir = DUMMY_S3_SOURCE_DIR estimator.set_hyperparameters(**hyperparameters) estimator.volume_kms_key = "volume-kms-key" estimator.output_kms_key = "output-kms-key" estimator.dependencies = ["dep-1", "dep-2"]
def exec_training(session, client, job_name, setting, pytorch, max_parallel_jobs, is_spot):
    sagemaker_session = sagemaker.Session(boto_session=session, sagemaker_client=client)

    # safe_load: plain yaml.load without a Loader is unsafe and deprecated
    conf = yaml.safe_load(open(setting))

    # input data
    inputs = conf["inputs"]
    if "upload_data" in conf and isinstance(conf["upload_data"], list):
        for d in conf["upload_data"]:
            s3_dir = sagemaker_session.upload_data(
                path=d["path"],
                key_prefix=os.path.join(job_name, d["key_prefix"]),
            )
            inputs[d["name"]] = s3_dir

    estimator_args = conf["estimator"]
    estimator_args["sagemaker_session"] = sagemaker_session

    # split hyperparameters into fixed values and tuning targets (given as dicts)
    hyperparameters = estimator_args.pop("hyperparameters")
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args["hyperparameters"] = fixed

    if is_spot:
        estimator_args["train_use_spot_instances"] = True
        if "checkpoint_s3_uri" not in estimator_args:
            bucket_name = sagemaker_session.default_bucket()
            uri = os.path.join("s3://", bucket_name, job_name, "checkpoints")
            estimator_args["checkpoint_s3_uri"] = uri

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if len(targets) == 0:
        estimator.fit(inputs, wait=False, job_name=job_name)
    else:
        if "tuner" in conf:
            tuner_args = conf["tuner"]
            hyperparameter_ranges = {}
            for k, v in targets.items():
                hyperparameter_ranges[k] = hp_type[v["type"].lower()](v["range"])
        else:
            # use default values
            tuner_args = {
                "objective_metric_name": "metric_name",
                "metric_definitions": [{"Name": "metric_name", "Regex": "ignore"}],
                "strategy": "Random",
                "objective_type": "Maximize",
                "early_stopping_type": "Off",
            }
            max_jobs = 1
            hyperparameter_ranges = {}
            for k, v in targets.items():
                if v["type"].lower() != "categorical":
                    raise ValueError("the default tuner only supports Categorical params.")
                max_jobs *= len(v["range"])
                hyperparameter_ranges[k] = hp_type[v["type"].lower()](v["range"])
            tuner_args["max_jobs"] = max_jobs

        tuner_args["estimator"] = estimator
        tuner_args["hyperparameter_ranges"] = hyperparameter_ranges
        tuner_args["max_parallel_jobs"] = max_parallel_jobs
        tuner_args["base_tuning_job_name"] = job_name
        tuner_args["warm_start_config"] = None  # not supported yet.

        tuner = HyperparameterTuner(**tuner_args)
        tuner.fit(inputs, job_name=job_name)
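# Both exec_training variants look hyperparameter range types up in an hp_type
# mapping that is not shown. A minimal sketch using the SageMaker tuner
# parameter classes; the key set and the [min, max] range convention are
# assumptions inferred from the call sites above, where each entry is invoked
# as hp_type[type_name](range_value):
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, IntegerParameter

hp_type = {
    'categorical': CategoricalParameter,  # takes the list of values directly
    'continuous': lambda r: ContinuousParameter(*r),  # range given as [min, max]
    'integer': lambda r: IntegerParameter(*r),  # range given as [min, max]
}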