def cleanup_experiment(Experiment_name):
    """Delete an experiment together with its trials and trial components.

    Each trial of the experiment is loaded, every trial component is
    disassociated from it and then deleted on a best-effort basis, the trial
    is deleted, and finally the experiment itself is deleted.

    Failures are reported with ``print`` and are never propagated to the
    caller.

    Args:
        Experiment_name: Name of the experiment to remove.
    """
    try:
        experiment = Experiment.load(experiment_name=Experiment_name)
        for trial_summary in experiment.list_trials():
            trial = Trial.load(trial_name=trial_summary.trial_name)
            for trial_component_summary in trial.list_trial_components():
                tc = TrialComponent.load(
                    trial_component_name=trial_component_summary.
                    trial_component_name)
                trial.remove_trial_component(tc)
                try:
                    # comment out to keep trial components
                    tc.delete()
                except Exception:
                    # tc is associated with another trial; `Exception` (not a
                    # bare except) so Ctrl-C / SystemExit still interrupt.
                    continue
                # to prevent throttling
                time.sleep(.5)
            trial.delete()
        experiment.delete()
    except Exception as ex:
        if 'ResourceNotFound' in str(ex):
            print('%s is a new experiment. Nothing to delete' %
                  Experiment_name)
        else:
            # Keep the best-effort contract (no exception escapes) but do not
            # hide the failure from the user.
            print('Failed to clean up experiment %s: %s' %
                  (Experiment_name, ex))
def cleanup_trial(Experiment_name, Trial_name):
    """Delete one trial of an experiment along with its trial components.

    The experiment's trials are listed and every trial whose name equals
    ``Trial_name`` is processed: each trial component is printed,
    disassociated from the trial and deleted on a best-effort basis, then
    the trial is deleted.

    Args:
        Experiment_name: Name of the experiment that owns the trial.
        Trial_name: Name of the trial to delete.
    """
    experiment = Experiment.load(experiment_name=Experiment_name)
    for trial_summary in experiment.list_trials():
        if trial_summary.trial_name != Trial_name:
            continue
        # Load only the matching trial; loading every listed trial costs one
        # extra API call per trial for no benefit.
        trial = Trial.load(trial_name=trial_summary.trial_name)
        for trial_component_summary in trial.list_trial_components():
            tc = TrialComponent.load(
                trial_component_name=trial_component_summary.
                trial_component_name)
            print(trial_component_summary.trial_component_name)
            trial.remove_trial_component(tc)
            try:
                # comment out to keep trial components
                tc.delete()
            except Exception:
                # tc is associated with another trial; `Exception` (not a
                # bare except) so Ctrl-C / SystemExit still interrupt.
                continue
            # to prevent throttling
            time.sleep(.5)
        trial.delete()
# Example #3
0
def _test_training_function(ecr_image, sagemaker_session, instance_type,
                            framework_version, py_version):
    """Run a TensorFlow MNIST training job and check its trial component.

    Creates an experiment and a trial, trains with the given image, checks
    that a trial component whose source is the training job exists,
    associates it with the trial, and finally deletes everything it created.

    Args:
        ecr_image: Image URI passed to the estimator as ``image_uri``.
        sagemaker_session: Session providing the SageMaker client and upload.
        instance_type: Instance type used for the training job.
        framework_version: Framework version passed to the estimator.
        py_version: Python version label; the test is skipped when it is
            ``None`` or contains ``'2'``.
    """
    if py_version is None or '2' in py_version:
        # pytest.skip raises, so nothing after it in this branch would run.
        pytest.skip('Skipping python2 {}'.format(py_version))

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 6000)

    experiment_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"
    trial = Trial.create(experiment_name=experiment_name,
                         trial_name=trial_name,
                         sagemaker_boto_client=sm_client)

    training_job_name = utils.unique_name_from_base(
        "test-tf-experiments-mnist")

    # create a training job and wait for it to complete
    with timeout(minutes=15):
        resource_path = os.path.join(os.path.dirname(__file__), "..", "..",
                                     "resources")
        script = os.path.join(resource_path, "mnist", "mnist.py")
        estimator = TensorFlow(
            entry_point=script,
            role="SageMakerRole",
            instance_type=instance_type,
            instance_count=1,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
            script_mode=True,
        )
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "mnist", "data"),
            key_prefix="scriptmode/mnist")
        estimator.fit(inputs, job_name=training_job_name)

    training_job = sm_client.describe_training_job(
        TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_components = list(
        TrialComponent.list(source_arn=training_job_arn,
                            sagemaker_boto_client=sm_client))
    # Fail with a clear message instead of an IndexError on an empty list.
    assert trial_components, (
        "no trial component found for training job {}".format(
            training_job_arn))

    trial_component_summary = trial_components[0]
    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    # Pause briefly so the experiment is not deleted before the trial
    # deletion has been registered.
    time.sleep(1.2)
    experiment.delete()
# Example #4
0
def test_training(sagemaker_session, ecr_image, instance_type, instance_count):
    """Start an MXNet training job and check its trial component.

    Creates an experiment and a trial, starts the training job without
    waiting for it, polls until a trial component whose source is the
    training job is listed, associates it with the trial, and finally
    deletes everything it created.

    Args:
        sagemaker_session: Session providing the SageMaker client and upload.
        ecr_image: Image passed to the estimator as ``image_name``.
        instance_type: Instance type used for the training job.
        instance_count: Number of training instances.
    """

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "mxnet-container-integ-test-{}".format(int(time.time()))

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description=
        "Integration test experiment from sagemaker-mxnet-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = "mxnet-container-integ-test-{}".format(int(time.time()))
    trial = Trial.create(experiment_name=experiment_name,
                         trial_name=trial_name,
                         sagemaker_boto_client=sm_client)

    hyperparameters = {
        "random_seed": True,
        "num_steps": 50,
        "smdebug_path": "/opt/ml/output/tensors",
        "epochs": 1,
    }

    mx = MXNet(
        entry_point=SCRIPT_PATH,
        role="SageMakerRole",
        train_instance_count=instance_count,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        image_name=ecr_image,
        hyperparameters=hyperparameters,
    )

    training_job_name = utils.unique_name_from_base("test-mxnet-image")

    # create a training job (wait=False, so fit returns without waiting)
    with timeout(minutes=15):
        prefix = "mxnet_mnist_gluon_basic_hook_demo/{}".format(
            utils.sagemaker_timestamp())
        train_input = mx.sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, "train"),
            key_prefix=prefix + "/train")
        test_input = mx.sagemaker_session.upload_data(
            path=os.path.join(DATA_PATH, "test"), key_prefix=prefix + "/test")

        mx.fit({
            "train": train_input,
            "test": test_input
        },
               job_name=training_job_name,
               wait=False)

    training_job = sm_client.describe_training_job(
        TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_component_summary = None
    max_retries = 10
    # One initial query plus `max_retries` retries. The loop is bounded so
    # that a missing trial component fails the assertion below instead of
    # polling the API forever.
    for attempt in range(max_retries + 1):
        trial_components = list(
            TrialComponent.list(source_arn=training_job_arn,
                                sagemaker_boto_client=sm_client))

        if trial_components:
            trial_component_summary = trial_components[0]
            break

        if attempt < max_retries:
            # NOTE(review): if `sleep` is time.sleep this waits 500 seconds
            # per retry; possibly milliseconds were intended - confirm.
            sleep(500)

    assert trial_component_summary is not None, (
        "no trial component found for training job {}".format(
            training_job_arn))

    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    # Pause briefly so the experiment is not deleted before the trial
    # deletion has been registered.
    time.sleep(1.2)
    experiment.delete()
# Example #5
0
def test_training(sagemaker_session, ecr_image, instance_type,
                  framework_version):
    """Run a TensorFlow MNIST training job and check its trial component.

    Creates an experiment and a trial, trains with the given image, checks
    that a trial component whose source is the training job exists,
    associates it with the trial, and finally deletes everything it created.

    Args:
        sagemaker_session: Session providing the SageMaker client and upload.
        ecr_image: Image passed to the estimator as ``image_name``.
        instance_type: Instance type used for the training job.
        framework_version: Framework version passed to the estimator.
    """

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "tf-container-integ-test-{}".format(int(time.time()))

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = "tf-container-integ-test-{}".format(int(time.time()))

    trial = Trial.create(experiment_name=experiment_name,
                         trial_name=trial_name,
                         sagemaker_boto_client=sm_client)

    training_job_name = utils.unique_name_from_base(
        "test-tf-experiments-mnist")

    # create a training job and wait for it to complete
    with timeout(minutes=DEFAULT_TIMEOUT):
        resource_path = os.path.join(os.path.dirname(__file__), "..", "..",
                                     "resources")
        script = os.path.join(resource_path, "mnist", "mnist.py")
        estimator = TensorFlow(
            entry_point=script,
            role="SageMakerRole",
            train_instance_type=instance_type,
            train_instance_count=1,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            framework_version=framework_version,
            script_mode=True,
        )
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "mnist", "data"),
            key_prefix="scriptmode/mnist")
        estimator.fit(inputs, job_name=training_job_name)

    training_job = sm_client.describe_training_job(
        TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_components = list(
        TrialComponent.list(source_arn=training_job_arn,
                            sagemaker_boto_client=sm_client))
    # Fail with a clear message instead of an IndexError on an empty list.
    assert trial_components, (
        "no trial component found for training job {}".format(
            training_job_arn))

    trial_component_summary = trial_components[0]
    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    # Pause briefly so the experiment is not deleted before the trial
    # deletion has been registered.
    time.sleep(1.2)
    experiment.delete()
def _test_training_function(ecr_image, sagemaker_session, instance_type,
                            framework_version):
    """Run a TensorFlow MNIST training job and check its trial component.

    Creates an experiment and a trial, trains with the given image, checks
    that a trial component whose source is the training job exists,
    associates it with the trial, and finally deletes everything it created.

    Args:
        ecr_image: Image URI passed to the estimator as ``image_uri``.
        sagemaker_session: Session providing the SageMaker client and upload.
        instance_type: Instance type used for the training job.
        framework_version: Framework version passed to the estimator.
    """
    sm_client = sagemaker_session.sagemaker_client
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 6000)

    experiment_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"

    trial = Trial.create(experiment_name=experiment_name,
                         trial_name=trial_name,
                         sagemaker_boto_client=sm_client)

    training_job_name = utils.unique_name_from_base(
        "test-tf-experiments-mnist")

    # create a training job and wait for it to complete
    with timeout(minutes=DEFAULT_TIMEOUT):
        resource_path = os.path.join(os.path.dirname(__file__), "..", "..",
                                     "resources")
        script = os.path.join(resource_path, "mnist", "mnist.py")
        estimator = TensorFlow(
            model_dir=False,
            entry_point=script,
            role="SageMakerRole",
            instance_type=instance_type,
            instance_count=1,
            sagemaker_session=sagemaker_session,
            image_uri=ecr_image,
            framework_version=framework_version,
        )
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, "mnist", "data"),
            key_prefix="scriptmode/mnist")
        estimator.fit(inputs, job_name=training_job_name)

    training_job = sm_client.describe_training_job(
        TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_components = list(
        TrialComponent.list(source_arn=training_job_arn,
                            sagemaker_boto_client=sm_client))
    # Fail with a clear message instead of an IndexError on an empty list.
    assert trial_components, (
        "no trial component found for training job {}".format(
            training_job_arn))

    trial_component_summary = trial_components[0]
    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    # Prevent throttling to avoid deleting experiment before it's updated with trial deletion
    time.sleep(1.2)
    experiment.delete()
def test_training(sagemaker_session, ecr_image, instance_type):
    """Run a PyTorch MNIST training job and check its trial component.

    Creates an experiment and a trial, trains with the given image, checks
    that a trial component whose source is the training job exists,
    associates it with the trial, and finally deletes everything it created.

    Args:
        sagemaker_session: Session providing the SageMaker client and upload.
        ecr_image: Image passed to the estimator as ``image_name``.
        instance_type: Instance type used for the training job.
    """

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "pytorch-container-integ-test-{}".format(int(
        time.time()))

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description=
        "Integration test full customer e2e from sagemaker-pytorch-container",
        sagemaker_boto_client=sm_client,
    )

    trial_name = "pytorch-container-integ-test-{}".format(int(time.time()))
    trial = Trial.create(experiment_name=experiment_name,
                         trial_name=trial_name,
                         sagemaker_boto_client=sm_client)

    hyperparameters = {
        "random_seed": True,
        "num_steps": 50,
        "smdebug_path": "/opt/ml/output/tensors",
        "epochs": 1,
        "data_dir": training_dir,
    }

    training_job_name = utils.unique_name_from_base(
        "test-pytorch-experiments-image")

    # create a training job and wait for it to complete
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(
            entry_point=smdebug_mnist_script,
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            image_name=ecr_image,
            hyperparameters=hyperparameters,
        )
        training_input = pytorch.sagemaker_session.upload_data(
            path=training_dir, key_prefix="pytorch/mnist")
        pytorch.fit({"training": training_input}, job_name=training_job_name)

    training_job = sm_client.describe_training_job(
        TrainingJobName=training_job_name)
    training_job_arn = training_job["TrainingJobArn"]

    # verify trial component auto created from the training job
    trial_components = list(
        TrialComponent.list(source_arn=training_job_arn,
                            sagemaker_boto_client=sm_client))
    # Fail with a clear message instead of an IndexError on an empty list.
    assert trial_components, (
        "no trial component found for training job {}".format(
            training_job_arn))

    trial_component_summary = trial_components[0]
    trial_component = TrialComponent.load(
        trial_component_name=trial_component_summary.trial_component_name,
        sagemaker_boto_client=sm_client,
    )

    # associate the trial component with the trial
    trial.add_trial_component(trial_component)

    # cleanup
    trial.remove_trial_component(trial_component_summary.trial_component_name)
    trial_component.delete()
    trial.delete()
    # Pause briefly so the experiment is not deleted before the trial
    # deletion has been registered.
    time.sleep(1.2)
    experiment.delete()