def cleanup_experiment(Experiment_name):
    """Delete an experiment together with its trials and trial components.

    Loads the experiment called ``Experiment_name``; for every trial it
    lists, each trial component is detached from the trial and then
    deleted on a best-effort basis, after which the trial is deleted.
    Finally the experiment itself is deleted.

    Args:
        Experiment_name: Name of the experiment to remove.

    Returns:
        None. Failures are reported on stdout rather than raised: an error
        whose text contains ``ResourceNotFound`` is reported as "nothing to
        delete", any other error is printed with its message.
    """
    try:
        experiment = Experiment.load(experiment_name=Experiment_name)
        for trial_summary in experiment.list_trials():
            trial = Trial.load(trial_name=trial_summary.trial_name)
            for tc_summary in trial.list_trial_components():
                tc = TrialComponent.load(
                    trial_component_name=tc_summary.trial_component_name)
                trial.remove_trial_component(tc)
                try:
                    # comment out to keep trial components
                    tc.delete()
                except Exception:
                    # Presumably tc is still associated with another trial;
                    # leave it in place and move on. ``Exception`` (not a
                    # bare except) so Ctrl-C / SystemExit still propagate.
                    continue
                # to prevent throttling
                time.sleep(.5)
            trial.delete()
        experiment.delete()
    except Exception as ex:
        if 'ResourceNotFound' in str(ex):
            print('%s is a new experiment. Nothing to delete' % Experiment_name)
        else:
            # Keep the no-raise contract, but do not hide a partial cleanup.
            print('Failed to clean up experiment %s: %s' % (Experiment_name, ex))
def cleanup_trial(Experiment_name, Trial_name):
    """Delete one trial of an experiment along with its trial components.

    Walks the trials listed by the experiment ``Experiment_name`` and, for
    each one whose name equals ``Trial_name``, prints and detaches every
    trial component, deletes the component on a best-effort basis, and then
    deletes the trial.

    Args:
        Experiment_name: Name of the experiment that owns the trial.
        Trial_name: Name of the trial to remove.

    Raises:
        Whatever ``Experiment.load``, ``Trial.load``, ``TrialComponent.load``,
        ``remove_trial_component`` or ``trial.delete`` raise; only failures
        of the individual component deletion are tolerated.
    """
    experiment = Experiment.load(experiment_name=Experiment_name)
    for trial_summary in experiment.list_trials():
        # Check the name first so only the targeted trial costs a load call.
        if trial_summary.trial_name != Trial_name:
            continue

        trial = Trial.load(trial_name=trial_summary.trial_name)
        for tc_summary in trial.list_trial_components():
            tc = TrialComponent.load(
                trial_component_name=tc_summary.trial_component_name)
            print(tc_summary.trial_component_name)
            trial.remove_trial_component(tc)
            try:
                # comment out to keep trial components
                tc.delete()
            except Exception:
                # Presumably tc is still associated with another trial;
                # leave it in place. ``Exception`` (not a bare except) so
                # Ctrl-C / SystemExit still propagate.
                continue
            # to prevent throttling
            time.sleep(.5)
        trial.delete()
def _test_training_function(ecr_image, sagemaker_session, instance_type,
                            framework_version, py_version):
    """Run a TensorFlow training job and check its trial component.

    Creates an experiment and a trial, runs the MNIST script as a training
    job on ``ecr_image``, verifies that a trial component exists whose
    source ARN is the training job, associates it with the trial, and then
    removes every resource it created.

    Args:
        ecr_image: Image URI passed to the estimator as ``image_uri``.
        sagemaker_session: Session whose ``sagemaker_client`` is used for all
            experiment and training-job calls.
        instance_type: Instance type for the training job.
        framework_version: Framework version handed to the estimator.
        py_version: Python version label of the image; the test is skipped
            when it is ``None`` or denotes Python 2.
    """
    # Look at the leading major-version digit only, so labels such as
    # "py312" or "py32" are not mistaken for Python 2.
    bare_version = py_version
    if py_version and py_version.startswith("py"):
        bare_version = py_version[2:]
    if bare_version is None or bare_version.startswith("2"):
        # pytest.skip raises, so nothing after it runs.
        pytest.skip('Skipping python2 {}'.format(py_version))

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 6000)

    experiment_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )

    # Track what has been created so the finally block undoes exactly that,
    # even when the test fails half way through.
    trial = None
    trial_component = None
    attached_component_name = None
    try:
        trial_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"
        trial = Trial.create(
            experiment_name=experiment_name,
            trial_name=trial_name,
            sagemaker_boto_client=sm_client,
        )

        training_job_name = utils.unique_name_from_base(
            "test-tf-experiments-mnist")

        # create a training job and wait for it to complete
        with timeout(minutes=15):
            resource_path = os.path.join(
                os.path.dirname(__file__), "..", "..", "resources")
            script = os.path.join(resource_path, "mnist", "mnist.py")
            estimator = TensorFlow(
                entry_point=script,
                role="SageMakerRole",
                instance_type=instance_type,
                instance_count=1,
                sagemaker_session=sagemaker_session,
                image_uri=ecr_image,
                framework_version=framework_version,
                script_mode=True,
            )
            inputs = estimator.sagemaker_session.upload_data(
                path=os.path.join(resource_path, "mnist", "data"),
                key_prefix="scriptmode/mnist",
            )
            estimator.fit(inputs, job_name=training_job_name)

        training_job = sm_client.describe_training_job(
            TrainingJobName=training_job_name)
        training_job_arn = training_job["TrainingJobArn"]

        # verify trial component auto created from the training job
        trial_components = list(
            TrialComponent.list(source_arn=training_job_arn,
                                sagemaker_boto_client=sm_client))
        assert trial_components, (
            "no trial component found for training job "
            "{}".format(training_job_arn))

        trial_component_summary = trial_components[0]
        trial_component = TrialComponent.load(
            trial_component_name=trial_component_summary.trial_component_name,
            sagemaker_boto_client=sm_client,
        )

        # associate the trial component with the trial
        trial.add_trial_component(trial_component)
        attached_component_name = trial_component_summary.trial_component_name
    finally:
        # cleanup
        if attached_component_name is not None:
            trial.remove_trial_component(attached_component_name)
            trial_component.delete()
        if trial is not None:
            trial.delete()
        experiment.delete()
def test_training(sagemaker_session, ecr_image, instance_type, instance_count):
    """Start an MXNet training job and check its trial component.

    Creates an experiment and a trial, starts an MXNet training job on
    ``ecr_image``, polls until a trial component whose source ARN is the
    training job shows up, associates it with the trial, and then removes
    every resource it created.

    Args:
        sagemaker_session: Session whose ``sagemaker_client`` is used for all
            experiment and training-job calls.
        ecr_image: Image passed to the estimator as ``image_name``.
        instance_type: Instance type for the training job.
        instance_count: Number of training instances.
    """
    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "mxnet-container-integ-test-{}".format(int(time.time()))
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-mxnet-container",
        sagemaker_boto_client=sm_client,
    )

    # Track what has been created so the finally block undoes exactly that,
    # even when the test fails half way through.
    trial = None
    trial_component = None
    attached_component_name = None
    try:
        trial_name = "mxnet-container-integ-test-{}".format(int(time.time()))
        trial = Trial.create(
            experiment_name=experiment_name,
            trial_name=trial_name,
            sagemaker_boto_client=sm_client,
        )

        hyperparameters = {
            "random_seed": True,
            "num_steps": 50,
            "smdebug_path": "/opt/ml/output/tensors",
            "epochs": 1,
        }

        mx = MXNet(
            entry_point=SCRIPT_PATH,
            role="SageMakerRole",
            train_instance_count=instance_count,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            hyperparameters=hyperparameters,
        )

        training_job_name = utils.unique_name_from_base("test-mxnet-image")

        # Upload the data and start the training job. fit() is called with
        # wait=False, so the trial component is polled for below.
        with timeout(minutes=15):
            prefix = "mxnet_mnist_gluon_basic_hook_demo/{}".format(
                utils.sagemaker_timestamp())
            train_input = mx.sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, "train"),
                key_prefix=prefix + "/train")
            test_input = mx.sagemaker_session.upload_data(
                path=os.path.join(DATA_PATH, "test"),
                key_prefix=prefix + "/test")
            mx.fit({"train": train_input, "test": test_input},
                   job_name=training_job_name,
                   wait=False)

        training_job = sm_client.describe_training_job(
            TrainingJobName=training_job_name)
        training_job_arn = training_job["TrainingJobArn"]

        # verify trial component auto created from the training job
        trial_component_summary = None
        max_retries = 10
        # One initial lookup plus up to ``max_retries`` retries; the loop
        # always ends so the assert below can report a missing component.
        for attempt in range(max_retries + 1):
            trial_components = list(
                TrialComponent.list(source_arn=training_job_arn,
                                    sagemaker_boto_client=sm_client))
            if trial_components:
                trial_component_summary = trial_components[0]
                break
            if attempt < max_retries:
                # NOTE(review): if ``sleep`` is time.sleep this pauses 500
                # seconds per retry - confirm the unit is intended.
                sleep(500)

        assert trial_component_summary is not None

        trial_component = TrialComponent.load(
            trial_component_name=trial_component_summary.trial_component_name,
            sagemaker_boto_client=sm_client,
        )

        # associate the trial component with the trial
        trial.add_trial_component(trial_component)
        attached_component_name = trial_component_summary.trial_component_name
    finally:
        # cleanup
        if attached_component_name is not None:
            trial.remove_trial_component(attached_component_name)
            trial_component.delete()
        if trial is not None:
            trial.delete()
        experiment.delete()
def test_training(sagemaker_session, ecr_image, instance_type, framework_version):
    """Run a TensorFlow training job and check its trial component.

    Creates an experiment and a trial, runs the MNIST script as a training
    job on ``ecr_image``, verifies that a trial component exists whose
    source ARN is the training job, associates it with the trial, and then
    removes every resource it created.

    Args:
        sagemaker_session: Session whose ``sagemaker_client`` is used for all
            experiment and training-job calls.
        ecr_image: Image passed to the estimator as ``image_name``.
        instance_type: Instance type for the training job.
        framework_version: Framework version handed to the estimator.
    """
    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "tf-container-integ-test-{}".format(int(time.time()))
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )

    # Track what has been created so the finally block undoes exactly that,
    # even when the test fails half way through.
    trial = None
    trial_component = None
    attached_component_name = None
    try:
        trial_name = "tf-container-integ-test-{}".format(int(time.time()))
        trial = Trial.create(
            experiment_name=experiment_name,
            trial_name=trial_name,
            sagemaker_boto_client=sm_client,
        )

        training_job_name = utils.unique_name_from_base(
            "test-tf-experiments-mnist")

        # create a training job and wait for it to complete
        with timeout(minutes=DEFAULT_TIMEOUT):
            resource_path = os.path.join(
                os.path.dirname(__file__), "..", "..", "resources")
            script = os.path.join(resource_path, "mnist", "mnist.py")
            estimator = TensorFlow(
                entry_point=script,
                role="SageMakerRole",
                train_instance_type=instance_type,
                train_instance_count=1,
                sagemaker_session=sagemaker_session,
                image_name=ecr_image,
                framework_version=framework_version,
                script_mode=True,
            )
            inputs = estimator.sagemaker_session.upload_data(
                path=os.path.join(resource_path, "mnist", "data"),
                key_prefix="scriptmode/mnist",
            )
            estimator.fit(inputs, job_name=training_job_name)

        training_job = sm_client.describe_training_job(
            TrainingJobName=training_job_name)
        training_job_arn = training_job["TrainingJobArn"]

        # verify trial component auto created from the training job
        trial_components = list(
            TrialComponent.list(source_arn=training_job_arn,
                                sagemaker_boto_client=sm_client))
        assert trial_components, (
            "no trial component found for training job "
            "{}".format(training_job_arn))

        trial_component_summary = trial_components[0]
        trial_component = TrialComponent.load(
            trial_component_name=trial_component_summary.trial_component_name,
            sagemaker_boto_client=sm_client,
        )

        # associate the trial component with the trial
        trial.add_trial_component(trial_component)
        attached_component_name = trial_component_summary.trial_component_name
    finally:
        # cleanup
        if attached_component_name is not None:
            trial.remove_trial_component(attached_component_name)
            trial_component.delete()
        if trial is not None:
            trial.delete()
        experiment.delete()
def _test_training_function(ecr_image, sagemaker_session, instance_type,
                            framework_version):
    """Run a TensorFlow training job and check its trial component.

    Creates an experiment and a trial, runs the MNIST script as a training
    job on ``ecr_image``, verifies that a trial component exists whose
    source ARN is the training job, associates it with the trial, and then
    removes every resource it created.

    Args:
        ecr_image: Image URI passed to the estimator as ``image_uri``.
        sagemaker_session: Session whose ``sagemaker_client`` is used for all
            experiment and training-job calls.
        instance_type: Instance type for the training job.
        framework_version: Framework version handed to the estimator.
    """
    sm_client = sagemaker_session.sagemaker_client
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 6000)

    experiment_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )

    # Track what has been created so the finally block undoes exactly that,
    # even when the test fails half way through.
    trial = None
    trial_component = None
    attached_component_name = None
    try:
        trial_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"
        trial = Trial.create(
            experiment_name=experiment_name,
            trial_name=trial_name,
            sagemaker_boto_client=sm_client,
        )

        training_job_name = utils.unique_name_from_base(
            "test-tf-experiments-mnist")

        # create a training job and wait for it to complete
        with timeout(minutes=DEFAULT_TIMEOUT):
            resource_path = os.path.join(
                os.path.dirname(__file__), "..", "..", "resources")
            script = os.path.join(resource_path, "mnist", "mnist.py")
            estimator = TensorFlow(
                model_dir=False,
                entry_point=script,
                role="SageMakerRole",
                instance_type=instance_type,
                instance_count=1,
                sagemaker_session=sagemaker_session,
                image_uri=ecr_image,
                framework_version=framework_version,
            )
            inputs = estimator.sagemaker_session.upload_data(
                path=os.path.join(resource_path, "mnist", "data"),
                key_prefix="scriptmode/mnist",
            )
            estimator.fit(inputs, job_name=training_job_name)

        training_job = sm_client.describe_training_job(
            TrainingJobName=training_job_name)
        training_job_arn = training_job["TrainingJobArn"]

        # verify trial component auto created from the training job
        trial_components = list(
            TrialComponent.list(source_arn=training_job_arn,
                                sagemaker_boto_client=sm_client))
        assert trial_components, (
            "no trial component found for training job "
            "{}".format(training_job_arn))

        trial_component_summary = trial_components[0]
        trial_component = TrialComponent.load(
            trial_component_name=trial_component_summary.trial_component_name,
            sagemaker_boto_client=sm_client,
        )

        # associate the trial component with the trial
        trial.add_trial_component(trial_component)
        attached_component_name = trial_component_summary.trial_component_name
    finally:
        # cleanup
        if attached_component_name is not None:
            trial.remove_trial_component(attached_component_name)
            trial_component.delete()
        if trial is not None:
            trial.delete()
            # Prevent throttling to avoid deleting experiment before it's
            # updated with trial deletion
            time.sleep(1.2)
        experiment.delete()
def test_training(sagemaker_session, ecr_image, instance_type):
    """Run a PyTorch training job and check its trial component.

    Creates an experiment and a trial, runs the smdebug MNIST script as a
    training job on ``ecr_image``, verifies that a trial component exists
    whose source ARN is the training job, associates it with the trial, and
    then removes every resource it created.

    Args:
        sagemaker_session: Session whose ``sagemaker_client`` is used for all
            experiment and training-job calls.
        ecr_image: Image passed to the estimator as ``image_name``.
        instance_type: Instance type for the training job.
    """
    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "pytorch-container-integ-test-{}".format(
        int(time.time()))
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test full customer e2e from sagemaker-pytorch-container",
        sagemaker_boto_client=sm_client,
    )

    # Track what has been created so the finally block undoes exactly that,
    # even when the test fails half way through.
    trial = None
    trial_component = None
    attached_component_name = None
    try:
        trial_name = "pytorch-container-integ-test-{}".format(int(time.time()))
        trial = Trial.create(
            experiment_name=experiment_name,
            trial_name=trial_name,
            sagemaker_boto_client=sm_client,
        )

        hyperparameters = {
            "random_seed": True,
            "num_steps": 50,
            "smdebug_path": "/opt/ml/output/tensors",
            "epochs": 1,
            "data_dir": training_dir,
        }

        training_job_name = utils.unique_name_from_base(
            "test-pytorch-experiments-image")

        # create a training job and wait for it to complete
        with timeout(minutes=DEFAULT_TIMEOUT):
            pytorch = PyTorch(
                entry_point=smdebug_mnist_script,
                role="SageMakerRole",
                train_instance_count=1,
                train_instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                image_name=ecr_image,
                hyperparameters=hyperparameters,
            )
            training_input = pytorch.sagemaker_session.upload_data(
                path=training_dir, key_prefix="pytorch/mnist")
            pytorch.fit({"training": training_input},
                        job_name=training_job_name)

        training_job = sm_client.describe_training_job(
            TrainingJobName=training_job_name)
        training_job_arn = training_job["TrainingJobArn"]

        # verify trial component auto created from the training job
        trial_components = list(
            TrialComponent.list(source_arn=training_job_arn,
                                sagemaker_boto_client=sm_client))
        assert trial_components, (
            "no trial component found for training job "
            "{}".format(training_job_arn))

        trial_component_summary = trial_components[0]
        trial_component = TrialComponent.load(
            trial_component_name=trial_component_summary.trial_component_name,
            sagemaker_boto_client=sm_client,
        )

        # associate the trial component with the trial
        trial.add_trial_component(trial_component)
        attached_component_name = trial_component_summary.trial_component_name
    finally:
        # cleanup
        if attached_component_name is not None:
            trial.remove_trial_component(attached_component_name)
            trial_component.delete()
        if trial is not None:
            trial.delete()
        experiment.delete()