def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version):
    """Fit the Chainer MNIST script with MPI settings and return the training job name.

    The ``train`` and ``test`` folders under ``DATA_DIR/chainer_mnist`` are uploaded
    through the estimator's session and handed to ``fit`` as the two input channels.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')
        entry_script = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')

        # The MPI keyword arguments are the extra settings this test passes.
        mpi_settings = dict(
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )
        estimator = Chainer(
            entry_point=entry_script,
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            sagemaker_session=sagemaker_session,
            hyperparameters={'epochs': 1},
            **mpi_settings
        )

        session = estimator.sagemaker_session
        channels = {
            'train': session.upload_data(
                path=os.path.join(mnist_dir, 'train'),
                key_prefix='integ-test-data/chainer_mnist/train'),
            'test': session.upload_data(
                path=os.path.join(mnist_dir, 'test'),
                key_prefix='integ-test-data/chainer_mnist/test'),
        }

        training_job_name = unique_name_from_base('test-chainer-training')
        estimator.fit(channels, job_name=training_job_name)
        return estimator.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_version, py_version):
    """Fit a Chainer estimator on the local MNIST sample data and return it.

    Args:
        sagemaker_session: Session handed to the estimator; its default bucket
            is used to build ``output_path``.
        instance_type: Forwarded to the estimator as ``instance_type``.
        instance_count: Forwarded to the estimator as ``instance_count``; also
            decides which entry-point script is used.
        chainer_version: Forwarded as ``framework_version``.
        py_version: Forwarded as ``py_version``.

    Returns:
        The ``Chainer`` estimator after ``fit`` has been called.
    """
    # Choose the script by instance *count*. The earlier check compared
    # ``instance_type`` with ``1``; when that argument is an instance-type
    # string the comparison never matches, so ``mnist.py`` was never selected.
    if instance_count == 1:
        script_name = "mnist.py"
    else:
        script_name = "distributed_mnist.py"
    script_path = os.path.join(DATA_DIR, "chainer_mnist", script_name)

    data_path = os.path.join(DATA_DIR, "chainer_mnist")

    chainer = Chainer(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=chainer_version,
        py_version=py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        hyperparameters={"epochs": 1},
        # test output_path without trailing slash
        output_path="s3://{}".format(sagemaker_session.default_bucket()),
    )

    # Inputs are given as file:// URIs pointing at the local sample folders.
    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    job_name = unique_name_from_base("test-chainer-training")
    chainer.fit({"train": train_input, "test": test_input}, job_name=job_name)
    return chainer
def test_training_with_additional_hyperparameters(sagemaker_local_session,
                                                  chainer_latest_version,
                                                  chainer_latest_py_version):
    """Fit the Chainer MNIST script on a ``local`` instance with MPI settings enabled."""
    mnist_dir = os.path.join(DATA_DIR, "chainer_mnist")
    entry_script = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")

    estimator_kwargs = {
        "entry_point": entry_script,
        "role": "SageMakerRole",
        "instance_count": 1,
        "instance_type": "local",
        "framework_version": chainer_latest_version,
        "py_version": chainer_latest_py_version,
        "sagemaker_session": sagemaker_local_session,
        "hyperparameters": {"epochs": 1},
        # MPI-related settings passed in addition to the basic configuration.
        "use_mpi": True,
        "num_processes": 2,
        "process_slots_per_host": 2,
        "additional_mpi_options": "-x NCCL_DEBUG=INFO",
    }
    estimator = Chainer(**estimator_kwargs)

    # Both channels are file:// URIs built from the sample data folder.
    channels = {
        "train": "file://" + os.path.join(mnist_dir, "train"),
        "test": "file://" + os.path.join(mnist_dir, "test"),
    }
    estimator.fit(channels)
def test_training_with_additional_hyperparameters(sagemaker_session):
    """Fit the Chainer MNIST script with MPI settings within 15 minutes.

    Returns the name of the estimator's latest training job.
    """
    with timeout(minutes=15):
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')
        entry_script = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')

        estimator = Chainer(
            entry_point=entry_script,
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
            hyperparameters={'epochs': 1},
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )

        # Upload through the estimator's own session, train channel first.
        uploader = estimator.sagemaker_session
        train_s3 = uploader.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_s3 = uploader.upload_data(
            path=os.path.join(mnist_dir, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        estimator.fit({'train': train_s3, 'test': test_s3})
        return estimator.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_full_version, wait=True):
    """Launch a Chainer MNIST training job and return its name.

    Args:
        sagemaker_session: Session handed to the estimator.
        instance_type: Forwarded to the estimator as ``train_instance_type``.
        instance_count: Forwarded as ``train_instance_count``; also decides
            which entry-point script is used.
        chainer_full_version: Forwarded as ``framework_version``.
        wait: Forwarded to ``fit``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=15):
        # Choose the script by instance *count*. The earlier check compared
        # ``instance_type`` with ``1``; when that argument is an instance-type
        # string the comparison never matches, so ``mnist.py`` was never used.
        if instance_count == 1:
            script_name = 'mnist.py'
        else:
            script_name = 'distributed_mnist.py'
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', script_name)

        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path,
                          role='SageMakerRole',
                          framework_version=chainer_full_version,
                          train_instance_count=instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          hyperparameters={'epochs': 1})

        # Upload the sample data through the estimator's session.
        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        chainer.fit({'train': train_input, 'test': test_input}, wait=wait)
        return chainer.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_full_version, wait=True):
    """Fit a Chainer estimator on the local MNIST sample data and return it.

    Args:
        sagemaker_session: Session handed to the estimator.
        instance_type: Forwarded to the estimator as ``train_instance_type``.
        instance_count: Forwarded as ``train_instance_count``; also decides
            which entry-point script is used.
        chainer_full_version: Forwarded as ``framework_version``.
        wait: Forwarded to ``fit``.

    Returns:
        The ``Chainer`` estimator after ``fit`` has been called.
    """
    # Choose the script by instance *count*. The earlier check compared
    # ``instance_type`` with ``1``; when that argument is an instance-type
    # string the comparison never matches, so ``mnist.py`` was never selected.
    if instance_count == 1:
        script_name = "mnist.py"
    else:
        script_name = "distributed_mnist.py"
    script_path = os.path.join(DATA_DIR, "chainer_mnist", script_name)

    data_path = os.path.join(DATA_DIR, "chainer_mnist")

    chainer = Chainer(
        entry_point=script_path,
        role="SageMakerRole",
        framework_version=chainer_full_version,
        py_version=PYTHON_VERSION,
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        hyperparameters={"epochs": 1},
    )

    # Inputs are given as file:// URIs pointing at the local sample folders.
    train_input = "file://" + os.path.join(data_path, "train")
    test_input = "file://" + os.path.join(data_path, "test")

    job_name = unique_name_from_base("test-chainer-training")
    chainer.fit({"train": train_input, "test": test_input}, wait=wait, job_name=job_name)
    return chainer
def main(parser=None):
    """Parse command-line options and start the roboschool TRPO Chainer training job.

    Args:
        parser: Optional ``argparse.ArgumentParser`` to register the options
            on. When omitted, a new parser is created for this call. (A parser
            built in the signature would be created once and shared, so a
            second call would try to register the same options again.)

    The estimator is fitted with ``wait=args.local_mode``.
    """
    import logging

    if parser is None:
        parser = argparse.ArgumentParser()

    logging.basicConfig(level=logging.WARN)

    parser.add_argument('--profile', type=str)
    parser.add_argument('--local-mode', action='store_true')
    parser.add_argument('--instance-type', type=str, default=DEFAULT_INSTANCE_TYPE)
    parser.add_argument('--region', type=str, default=DEFAULT_REGION)
    parser.add_argument('--arn', type=str, default=None)
    parser.add_argument('--max-runtime', type=int, default=DEFAULT_RUNTIME, help='seconds')
    args = parser.parse_args()

    boto_session = Session(profile_name=args.profile, region_name=args.region)
    sagemaker_session = sagemaker.Session(boto_session=boto_session)

    # An explicit --arn wins; otherwise the execution role is looked up.
    role = args.arn if args.arn is not None else sagemaker.get_execution_role(
        sagemaker_session)

    hyperparameters = {
        # 0 for instance types starting with 'ml.p', -1 otherwise
        # (presumably a device id read by train.py -- confirm there).
        'gpu': 0 if args.instance_type.startswith('ml.p') else -1,
        'mjcf': 'env/ant_simple.xml',
        'action-dim': 8,
        'obs-dim': 28,
        'skip-step': 10,
        'algorithm': 'TRPO',
        'foot-list': 'right_back_foot left_back_foot front_right_foot front_left_foot'
    }

    chainer_estimator = Chainer(
        entry_point='train.py',
        source_dir='../',
        role=role,
        image_name=IMAGE,
        framework_version='5.0.0',
        # Local mode gets a LocalSession built from the same boto session.
        sagemaker_session=LocalSession(boto_session) if args.local_mode else sagemaker_session,
        train_instance_count=1,
        train_instance_type='local' if args.local_mode else args.instance_type,
        hyperparameters=hyperparameters,
        base_job_name='roboschool-TRPO-skipstep10-continue-1',
        train_max_run=args.max_runtime)

    chainer_estimator.fit(wait=args.local_mode)
def test_failed_training_job(sagemaker_session, chainer_full_version):
    """Expect ``fit`` to raise ``ValueError`` when run with ``failure_script.py``."""
    with timeout(minutes=15):
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')
        failing_script = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')

        estimator = Chainer(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=chainer_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        # Only the train folder is uploaded and used as the input.
        uploaded_train = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')

        with pytest.raises(ValueError):
            estimator.fit(uploaded_train)
def test_failed_training_job(sagemaker_session, chainer_full_version):
    """Expect ``fit`` to raise ``ValueError`` for ``failure_script.py``.

    Runs under the default training timeout.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        failing_script = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')
        train_dir = os.path.join(os.path.join(DATA_DIR, 'chainer_mnist'), 'train')

        estimator_kwargs = dict(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=chainer_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )
        estimator = Chainer(**estimator_kwargs)

        # Only the train folder is uploaded, via the estimator's session.
        uploaded_train = estimator.sagemaker_session.upload_data(
            path=train_dir,
            key_prefix='integ-test-data/chainer_mnist/train')

        with pytest.raises(ValueError):
            estimator.fit(uploaded_train)
def test_failed_training_job(sagemaker_session, chainer_full_version):
    """Expect ``fit`` with ``failure_script.py`` to raise a specific ``ValueError``.

    The error text must contain 'This failure is expected'.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        failing_script = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')

        estimator = Chainer(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        with pytest.raises(ValueError) as excinfo:
            estimator.fit()

        # Checked after the raises block so the assertion actually executes.
        raised_message = str(excinfo.value)
        assert 'This failure is expected' in raised_message
def test_attach_deploy(sagemaker_session, chainer_latest_version, chainer_latest_py_version,
                       cpu_instance_type):
    """Start a training job without waiting, attach to it by name, deploy and predict."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, "chainer_mnist")
        entry_script = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")

        trainer = Chainer(
            entry_point=entry_script,
            role="SageMakerRole",
            framework_version=chainer_latest_version,
            py_version=chainer_latest_py_version,
            instance_count=1,
            instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        # Uploads go through the session fixture directly, train channel first.
        channels = {
            "train": sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "train"),
                key_prefix="integ-test-data/chainer_mnist/train"),
            "test": sagemaker_session.upload_data(
                path=os.path.join(mnist_dir, "test"),
                key_prefix="integ-test-data/chainer_mnist/test"),
        }

        training_job_name = unique_name_from_base("test-chainer-training")
        trainer.fit(channels, wait=False, job_name=training_job_name)

    # NOTE(review): the deploy phase is laid out after the training timeout
    # block rather than nested in it -- confirm against the intended layout.
    endpoint = unique_name_from_base("test-chainer-attach-deploy")

    with timeout_and_delete_endpoint_by_name(endpoint, sagemaker_session):
        attached = Chainer.attach(
            trainer.latest_training_job.name,
            sagemaker_session=sagemaker_session,
        )
        predictor = attached.deploy(1, cpu_instance_type, endpoint_name=endpoint)
        _predict_and_assert(predictor)
def main(parser=None):
    """Parse command-line options and start the sample Chainer training job.

    Args:
        parser: Optional ``argparse.ArgumentParser`` to register the options
            on. When omitted, a new parser is created for this call. (A parser
            built in the signature would be created once and shared, so a
            second call would try to register the same options again.)

    The estimator is fitted with ``wait=args.local_mode``.
    """
    import logging

    if parser is None:
        parser = argparse.ArgumentParser()

    logging.basicConfig(level=logging.WARN)

    parser.add_argument('--profile', type=str, default='default')
    parser.add_argument('--local-mode', action='store_true')
    parser.add_argument('--instance-type', type=str, default=DEFAULT_INSTANCE_TYPE)
    parser.add_argument('--region', type=str, default=DEFAULT_REGION)
    parser.add_argument('--arn', type=str, default=None)
    parser.add_argument('--max-runtime', type=int, default=DEFAULT_RUNTIME, help='seconds')
    args = parser.parse_args()

    boto_session = Session(profile_name=args.profile, region_name=args.region)

    # Local mode uses a LocalSession; otherwise a regular SageMaker session.
    sagemaker_session = LocalSession(
        boto_session) if args.local_mode else sagemaker.Session(boto_session)

    # An explicit --arn wins; otherwise the execution role is looked up.
    role = args.arn if args.arn is not None else sagemaker.get_execution_role(
        sagemaker_session)

    # 0 only for 'ml.p*' instance types outside local mode, -1 otherwise
    # (presumably a device id read by train.py -- confirm there).
    gpu = 0 if args.instance_type.startswith(
        'ml.p') and not args.local_mode else -1

    hyperparameters = {
        'gpu': gpu,
    }

    chainer_estimator = Chainer(
        entry_point='train.py',
        source_dir='./',
        role=role,
        image_name=IMAGE,
        framework_version='5.0.0',
        sagemaker_session=sagemaker_session,
        train_instance_count=1,
        train_instance_type='local' if args.local_mode else args.instance_type,
        hyperparameters=hyperparameters,
        base_job_name='chainer-sagemaker-sample',
        train_max_run=args.max_runtime)

    chainer_estimator.fit(wait=args.local_mode)
def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version):
    """Fit the Chainer MNIST script with MPI settings under the default training timeout.

    Returns the name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')
        entry_script = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')

        estimator_kwargs = {
            'entry_point': entry_script,
            'role': 'SageMakerRole',
            'train_instance_count': 1,
            'train_instance_type': "ml.c4.xlarge",
            'framework_version': chainer_full_version,
            'sagemaker_session': sagemaker_session,
            'hyperparameters': {'epochs': 1},
            # MPI-related settings passed in addition to the basic configuration.
            'use_mpi': True,
            'num_processes': 2,
            'process_slots_per_host': 2,
            'additional_mpi_options': "-x NCCL_DEBUG=INFO",
        }
        estimator = Chainer(**estimator_kwargs)

        # Upload through the estimator's own session, train channel first.
        train_s3 = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_s3 = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        estimator.fit({'train': train_s3, 'test': test_s3})
        return estimator.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_full_version, wait=True):
    """Launch a Chainer MNIST training job and return its name.

    Args:
        sagemaker_session: Session handed to the estimator.
        instance_type: Forwarded to the estimator as ``train_instance_type``.
        instance_count: Forwarded as ``train_instance_count``; also decides
            which entry-point script is used.
        chainer_full_version: Forwarded as ``framework_version``.
        wait: Forwarded to ``fit``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Choose the script by instance *count*. The earlier check compared
        # ``instance_type`` with ``1``; when that argument is an instance-type
        # string the comparison never matches, so ``mnist.py`` was never used.
        if instance_count == 1:
            script_name = 'mnist.py'
        else:
            script_name = 'distributed_mnist.py'
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', script_name)

        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path,
                          role='SageMakerRole',
                          framework_version=chainer_full_version,
                          train_instance_count=instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          hyperparameters={'epochs': 1})

        # Upload the sample data through the estimator's session.
        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        chainer.fit({'train': train_input, 'test': test_input}, wait=wait)
        return chainer.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_full_version, wait=True):
    """Launch a uniquely named Chainer MNIST training job and return its name.

    Args:
        sagemaker_session: Session handed to the estimator.
        instance_type: Forwarded to the estimator as ``train_instance_type``.
        instance_count: Forwarded as ``train_instance_count``; also decides
            which entry-point script is used.
        chainer_full_version: Forwarded as ``framework_version``.
        wait: Forwarded to ``fit``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Choose the script by instance *count*. The earlier check compared
        # ``instance_type`` with ``1``; when that argument is an instance-type
        # string the comparison never matches, so ``mnist.py`` was never used.
        if instance_count == 1:
            script_name = "mnist.py"
        else:
            script_name = "distributed_mnist.py"
        script_path = os.path.join(DATA_DIR, "chainer_mnist", script_name)

        data_path = os.path.join(DATA_DIR, "chainer_mnist")

        chainer = Chainer(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=instance_count,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        # Upload the sample data through the estimator's session.
        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/chainer_mnist/train")
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/chainer_mnist/test")

        job_name = unique_name_from_base("test-chainer-training")
        chainer.fit({"train": train_input, "test": test_input}, wait=wait, job_name=job_name)
        return chainer.latest_training_job.name
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
                            chainer_full_version, wait=True):
    """Launch a uniquely named Chainer MNIST training job and return its name.

    Args:
        sagemaker_session: Session handed to the estimator.
        instance_type: Forwarded to the estimator as ``train_instance_type``.
        instance_count: Forwarded as ``train_instance_count``; also decides
            which entry-point script is used.
        chainer_full_version: Forwarded as ``framework_version``.
        wait: Forwarded to ``fit``.

    Returns:
        The name of the estimator's latest training job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # Choose the script by instance *count*. The earlier check compared
        # ``instance_type`` with ``1``; when that argument is an instance-type
        # string the comparison never matches, so ``mnist.py`` was never used.
        if instance_count == 1:
            script_name = 'mnist.py'
        else:
            script_name = 'distributed_mnist.py'
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', script_name)

        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        chainer = Chainer(entry_point=script_path,
                          role='SageMakerRole',
                          framework_version=chainer_full_version,
                          py_version=PYTHON_VERSION,
                          train_instance_count=instance_count,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          hyperparameters={'epochs': 1})

        # Upload the sample data through the estimator's session.
        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train')
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/chainer_mnist/test')

        job_name = unique_name_from_base('test-chainer-training')
        chainer.fit({'train': train_input, 'test': test_input}, wait=wait, job_name=job_name)
        return chainer.latest_training_job.name