def test_github(
    sagemaker_local_session, pytorch_inference_latest_version, pytorch_inference_latest_py_version
):
    script_path = "mnist.py"
    git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT}

    pytorch = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir="pytorch",
        framework_version=pytorch_inference_latest_version,
        py_version=pytorch_inference_latest_py_version,
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )

    data_path = os.path.join(DATA_DIR, "pytorch_mnist")
    pytorch.fit({"training": "file://" + os.path.join(data_path, "training")})

    with lock.lock(LOCK_PATH):
        try:
            predictor = pytorch.deploy(initial_instance_count=1, instance_type="local")
            data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32)
            result = predictor.predict(data)
            assert 10 == len(result[0])  # check that there is a probability for each label
        finally:
            predictor.delete_endpoint()
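# The tests in this listing reference names defined elsewhere in their test modules.
# A hypothetical preamble sketching what those names would look like (all values are
# placeholders, not from the original source):
import os
import tempfile

import numpy
from sagemaker.pytorch import PyTorch

from tests.integ import DATA_DIR, lock  # assumed helpers: test data root and a file-lock utility

GIT_REPO = "https://github.com/aws/sagemaker-python-sdk.git"  # placeholder repo URL
BRANCH = "master"                                             # placeholder branch name
COMMIT = "0123abc"                                            # placeholder commit hash
LOCK_PATH = os.path.join(tempfile.gettempdir(), "sagemaker_test_local_mode_lock")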
def test_source_dirs(tmpdir, sagemaker_local_session):
    source_dir = os.path.join(DATA_DIR, "pytorch_source_dirs")
    lib = os.path.join(str(tmpdir), "alexa.py")

    with open(lib, "w") as f:
        f.write("def question(to_anything): return 42")

    estimator = PyTorch(
        entry_point="train.py",
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=[lib],
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )
    estimator.fit()

    # endpoint tests all use the same port, so we use this lock to prevent concurrent execution
    with lock.lock():
        try:
            predictor = estimator.deploy(initial_instance_count=1, instance_type="local")
            predict_response = predictor.predict([7])
            assert predict_response == [49]
        finally:
            estimator.delete_endpoint()
def test_github(sagemaker_local_session):
    script_path = "mnist.py"
    data_path = os.path.join(DATA_DIR, "pytorch_mnist")
    git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT}

    pytorch = PyTorch(
        entry_point=script_path,
        role="SageMakerRole",
        source_dir="pytorch",
        framework_version=PYTORCH_VERSION,
        py_version=PYTHON_VERSION,
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        git_config=git_config,
    )
    pytorch.fit({"training": "file://" + os.path.join(data_path, "training", MNIST_FOLDER_NAME)})

    with lock.lock(LOCK_PATH):
        try:
            predictor = pytorch.deploy(initial_instance_count=1, instance_type="local")
            data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32)
            result = predictor.predict(data)
            assert result is not None
        finally:
            predictor.delete_endpoint()
def test_source_dirs(tmpdir, sagemaker_local_session):
    source_dir = os.path.join(DATA_DIR, 'pytorch_source_dirs')
    lib = os.path.join(str(tmpdir), 'alexa.py')
    with open(lib, 'w') as f:
        f.write('def question(to_anything): return 42')

    estimator = PyTorch(entry_point='train.py', role='SageMakerRole',
                        source_dir=source_dir, dependencies=[lib],
                        py_version=PYTHON_VERSION, train_instance_count=1,
                        train_instance_type='local',
                        sagemaker_session=sagemaker_local_session)
    try:
        estimator.fit()
        predictor = estimator.deploy(initial_instance_count=1, instance_type='local')
        predict_response = predictor.predict([7])
        assert predict_response == [49]
    finally:
        estimator.delete_endpoint()
def test_source_dirs(tmpdir, sagemaker_local_session):
    source_dir = os.path.join(DATA_DIR, "pytorch_source_dirs")
    lib = os.path.join(str(tmpdir), "alexa.py")

    with open(lib, "w") as f:
        f.write("def question(to_anything): return 42")

    # TODO: fails on newer versions of pytorch in call to np.load(BytesIO(stream.read()))
    # "ValueError: Cannot load file containing pickled data when allow_pickle=False"
    estimator = PyTorch(
        entry_point="train.py",
        role="SageMakerRole",
        source_dir=source_dir,
        dependencies=[lib],
        framework_version="0.4",  # hard-code to last known good pytorch for now (see TODO above)
        py_version="py3",
        instance_count=1,
        instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )
    estimator.fit()

    # endpoint tests all use the same port, so we use this lock to prevent concurrent execution
    with lock.lock():
        try:
            predictor = estimator.deploy(initial_instance_count=1, instance_type="local")
            predict_response = predictor.predict([7])
            assert predict_response == [49]
        finally:
            predictor.delete_endpoint()
def test_fit_deploy(sagemaker_local_session, pytorch_full_version):
    pytorch = PyTorch(
        entry_point=MNIST_SCRIPT,
        role="SageMakerRole",
        framework_version=pytorch_full_version,
        py_version="py3",
        train_instance_count=1,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
    )
    pytorch.fit({"training": "file://" + os.path.join(MNIST_DIR, "training")})

    predictor = pytorch.deploy(1, "local")
    try:
        batch_size = 100
        data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        output = predictor.predict(data)
        assert output.shape == (batch_size, 10)
    finally:
        predictor.delete_endpoint()
testpath = sess.upload_data(
    path='boston_test.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')

pytorch_estimator = PyTorch(
    entry_point='train.py',
    source_dir=os.path.abspath(os.path.dirname(__file__)),
    role=sm_role,
    train_instance_count=1,
    train_instance_type='ml.c5.xlarge',
    framework_version='1.0.0',
    base_job_name='dense-pytorch',
    metric_definitions=[
        {'Name': 'median-AE', 'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"}
    ],
    hyperparameters={
        'n-epochs': 1500,
        'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
        'target': 'target',
    },
    tags=[{"Key": "CostCentre", "Value": "SageMaker"}])

# launch training job, with asynchronous call
pytorch_estimator.fit({'train': trainpath, 'test': testpath}, wait=False)
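# Because fit() is called with wait=False, it returns as soon as the job is created.
# A minimal sketch of how one might block on or re-attach to the job later, using
# standard SageMaker SDK calls (assumes the same `sess` session as above):
job_name = pytorch_estimator.latest_training_job.name
pytorch_estimator.latest_training_job.wait()  # block until the training job finishes

# ...or, from a fresh process, rebuild an estimator bound to the running job:
attached_estimator = PyTorch.attach(job_name, sagemaker_session=sess)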
def exec_training(session, client, job_name, setting, pytorch, max_parallel_jobs, is_spot):
    sagemaker_session = sagemaker.Session(boto_session=session, sagemaker_client=client)

    # safe_load avoids executing arbitrary YAML tags; the context manager closes the file
    with open(setting) as f:
        conf = yaml.safe_load(f)

    # input data
    inputs = conf["inputs"]
    if "upload_data" in conf and isinstance(conf["upload_data"], list):
        for d in conf["upload_data"]:
            s3_dir = sagemaker_session.upload_data(
                path=d["path"], key_prefix=os.path.join(job_name, d["key_prefix"])
            )
            inputs[d["name"]] = s3_dir

    estimator_args = conf["estimator"]
    estimator_args["sagemaker_session"] = sagemaker_session

    # split hyperparameters: scalar values are fixed, dict values describe tuning ranges
    hyperparameters = estimator_args.pop("hyperparameters")
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args["hyperparameters"] = fixed

    if is_spot:
        estimator_args["train_use_spot_instances"] = True
        if "checkpoint_s3_uri" not in estimator_args:
            bucket_name = sagemaker_session.default_bucket()
            uri = os.path.join("s3://", bucket_name, job_name, "checkpoints")
            estimator_args["checkpoint_s3_uri"] = uri

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if len(targets) == 0:
        estimator.fit(inputs, wait=False, job_name=job_name)
    else:
        if "tuner" in conf:
            tuner_args = conf["tuner"]
            hyperparameter_ranges = {}
            for k, v in targets.items():
                hyperparameter_ranges[k] = hp_type[v["type"].lower()](v["range"])
        else:
            # use default values
            tuner_args = {
                "objective_metric_name": "metric_name",
                "metric_definitions": [{"Name": "metric_name", "Regex": "ignore"}],
                "strategy": "Random",
                "objective_type": "Maximize",
                "early_stopping_type": "Off",
            }
            max_jobs = 1
            hyperparameter_ranges = {}
            for k, v in targets.items():
                if v["type"].lower() != "categorical":
                    raise ValueError("the default tuner only supports Categorical params.")
                max_jobs *= len(v["range"])
                hyperparameter_ranges[k] = hp_type[v["type"].lower()](v["range"])
            tuner_args["max_jobs"] = max_jobs

        tuner_args["estimator"] = estimator
        tuner_args["hyperparameter_ranges"] = hyperparameter_ranges
        tuner_args["max_parallel_jobs"] = max_parallel_jobs
        tuner_args["base_tuning_job_name"] = job_name
        tuner_args["warm_start_config"] = None  # not supported yet

        tuner = HyperparameterTuner(**tuner_args)
        tuner.fit(inputs, job_name=job_name)
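# `hp_type` is not defined in this snippet. A plausible sketch, assuming it maps the
# YAML "type" strings to the SageMaker tuner parameter classes; note the single-argument
# call style above, so min/max pairs are unpacked from the "range" list:
from sagemaker.tuner import CategoricalParameter, ContinuousParameter, IntegerParameter

hp_type = {
    "continuous": lambda r: ContinuousParameter(r[0], r[1]),  # range: [min, max]
    "integer": lambda r: IntegerParameter(r[0], r[1]),        # range: [min, max]
    "categorical": lambda r: CategoricalParameter(r),         # range: list of values
}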
def my_aws_app(cfg: DictConfig) -> None:
    # hydra overrides the working directory, so recover the original script folder
    script_folder = hydra.utils.get_original_cwd()
    as_dict = OmegaConf.to_container(cfg, resolve=False)

    # Override s3 datapath
    aws_bucket = cfg.aws.bucket_prefix
    try:
        aws_root_path = aws_bucket + cfg.aws.root_path
    except errors.ConfigAttributeError:
        aws_root_path = aws_bucket + cfg.root_path

    # Get the s3 location to load from / save to
    aws_out_path = aws_root_path + "/" + as_dict["output_subdir"]
    aws_data_path = aws_root_path + "/" + as_dict["data_subdir"]

    # Override the job json file with sagemaker local dirs
    as_dict["root_path"] = "/opt/ml/"
    as_dict["data_subdir"] = "input/data/train"
    as_dict["output_subdir"] = "output/data"

    # Set the local dir for tensorboard
    tb_log_dir = "/opt/ml/output/tensorboard/"
    as_dict["tb_log_dir"] = tb_log_dir
    tensorboard_output_config = TensorBoardOutputConfig(
        s3_output_path=aws_out_path,
        container_local_output_path=tb_log_dir,
    )

    print(OmegaConf.to_yaml(cfg))
    print("Overridden Root Path: " + aws_root_path)

    # Save json file to tmp location to be uploaded with the script
    tmp_relative_path = "tmp/tmp_job.json"
    tmp_path = script_folder + "/" + tmp_relative_path
    with open(tmp_path, "w") as json_file:
        json.dump(as_dict, json_file)

    wait = cfg.aws.wait
    role = cfg.aws.role
    instance_count = cfg.aws.instance_count
    instance_type = cfg.aws.instance_type
    env = {
        "SAGEMAKER_REQUIREMENTS": "requirements.txt",  # path relative to `source_dir` below
    }

    # Use the SageMaker prebuilt PyTorch container
    pytorch_estimator = PyTorch(
        entry_point="run.py",
        source_dir=script_folder,
        hyperparameters={"config_file": tmp_relative_path},
        role=role,
        env=env,
        instance_count=instance_count,
        py_version="py3",
        framework_version="1.5.0",
        output_path=aws_out_path,
        base_job_name=cfg.experiment_name,
        instance_type=instance_type,
        tensorboard_output_config=tensorboard_output_config,
    )

    pytorch_estimator.fit({"train": aws_data_path}, wait=wait)
    os.remove(tmp_path)
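# A minimal sketch of how the function above would typically be wired up as a Hydra
# entry point; the config_path/config_name values are assumptions, not from the
# original source:
main = hydra.main(config_path="conf", config_name="config")(my_aws_app)

if __name__ == "__main__":
    main()  # hydra parses CLI overrides, builds `cfg`, and calls my_aws_app(cfg)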
def exec_training(session, client, job_name, setting, pytorch, max_parallel_jobs):
    sagemaker_session = sagemaker.Session(boto_session=session, sagemaker_client=client)

    # safe_load avoids executing arbitrary YAML tags; the context manager closes the file
    with open(setting) as f:
        conf = yaml.safe_load(f)

    # input data
    inputs = conf['inputs']
    if 'upload_data' in conf and isinstance(conf['upload_data'], list):
        for d in conf['upload_data']:
            s3_dir = sagemaker_session.upload_data(
                path=d['path'], key_prefix=os.path.join(job_name, d['key_prefix']))
            inputs[d['name']] = s3_dir

    estimator_args = conf['estimator']
    estimator_args['sagemaker_session'] = sagemaker_session

    # split hyperparameters: scalar values are fixed, dict values describe tuning ranges
    hyperparameters = estimator_args.pop('hyperparameters')
    fixed, targets = {}, {}
    for k, v in hyperparameters.items():
        if isinstance(v, dict):
            targets[k] = v
        else:
            fixed[k] = v
    estimator_args['hyperparameters'] = fixed

    if pytorch:
        estimator = PyTorch(**estimator_args)
    else:
        estimator = Chainer(**estimator_args)

    if len(targets) == 0:
        estimator.fit(inputs, job_name=job_name)
    else:
        if 'tuner' in conf:
            tuner_args = conf['tuner']
            hyperparameter_ranges = {}
            for k, v in targets.items():
                hyperparameter_ranges[k] = hp_type[v['type'].lower()](v['range'])
        else:
            # use default values
            tuner_args = {
                'objective_metric_name': 'metric_name',
                'metric_definitions': [{'Name': 'metric_name', 'Regex': 'ignore'}],
                'strategy': 'Random',
                'objective_type': 'Maximize',
                'early_stopping_type': 'Off',
            }
            max_jobs = 1
            hyperparameter_ranges = {}
            for k, v in targets.items():
                if v['type'].lower() != 'categorical':
                    raise ValueError('the default tuner only supports Categorical params.')
                max_jobs *= len(v['range'])
                hyperparameter_ranges[k] = hp_type[v['type'].lower()](v['range'])
            tuner_args['max_jobs'] = max_jobs

        tuner_args['estimator'] = estimator
        tuner_args['hyperparameter_ranges'] = hyperparameter_ranges
        tuner_args['max_parallel_jobs'] = max_parallel_jobs
        tuner_args['base_tuning_job_name'] = job_name
        tuner_args['warm_start_config'] = None  # not supported yet

        tuner = HyperparameterTuner(**tuner_args)
        tuner.fit(inputs, job_name=job_name)
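# A hypothetical `setting` YAML for the exec_training variants above, inferred from
# the keys the functions read; all names, paths, and values are placeholders:
EXAMPLE_SETTING = """
inputs:
  train: s3://my-bucket/train          # passed straight through to fit()
upload_data:                           # optional: local files to push to S3 first
  - name: test
    path: data/test.csv
    key_prefix: inputs
estimator:                             # kwargs for PyTorch(...) / Chainer(...)
  entry_point: train.py
  role: SageMakerRole
  train_instance_count: 1
  train_instance_type: ml.m5.xlarge
  hyperparameters:
    lr: 0.01                           # scalar -> fixed hyperparameter
    batch-size:                        # dict   -> tuning target
      type: categorical
      range: [32, 64, 128]
"""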