Exemple #1
0
def create_trial(experiment_name, trial_base_name=None):
    '''
    Create a timestamped trial inside a pre-created experiment.

    The trial is named ``autogluon-<trial_base_name>-<epoch seconds>`` when a
    base name is supplied and ``autogluon-candidate-<epoch seconds>``
    otherwise.

    Args:
        experiment_name: Name of the experiment the trial is attached to.
        trial_base_name: Optional middle part of the generated trial name.

    Returns:
        The created ``Trial`` on success, or the empty string ``''`` when
        either the SageMaker client or the trial could not be created (a
        message is printed in both cases).
    '''
    now = int(time.time())

    if trial_base_name:
        trial_name = "autogluon-{}-{}".format(trial_base_name, now)
    else:
        trial_name = "autogluon-candidate-{}".format(now)

    # ``except Exception`` rather than a bare ``except`` so that Ctrl-C and
    # interpreter shutdown are not mistaken for a missing dependency.
    try:
        client = boto3.Session().client('sagemaker')
    except Exception:
        print('You need to install boto3. Try pip install --upgrade boto3')
        return ''

    # Only the service call is guarded; the success message is printed once
    # the trial really exists.
    try:
        trial = Trial.create(trial_name=trial_name,
                             experiment_name=experiment_name,
                             sagemaker_boto_client=client)
    except Exception:
        print(
            'Could not create a trial, was that a valid experiment or trial base name?'
        )
        return ''

    print('Created a trial named {}'.format(trial_name))
    return trial
def create_trial(Experiment_name, Trial_name):
    """Return the trial named ``Trial_name``, creating it when it is missing.

    Args:
        Experiment_name: Experiment the trial is created under when it does
            not exist yet.
        Trial_name: Name of the trial to load or create.

    Returns:
        The loaded or newly created ``Trial``.

    Raises:
        Exception: Any error raised by ``Trial.load`` whose text does not
            contain ``"ResourceNotFound"`` is re-raised unchanged, as is any
            error raised by ``Trial.create``.
    """
    try:
        return Trial.load(trial_name=Trial_name)
    except Exception as ex:
        # Only a missing trial is an expected condition; anything else
        # must reach the caller instead of being dropped.
        if "ResourceNotFound" not in str(ex):
            raise

    return Trial.create(experiment_name=Experiment_name,
                        trial_name=Trial_name)
Exemple #3
0
def _test_training_function(ecr_image, sagemaker_session, instance_type,
                            framework_version, py_version):
    """Train with a TensorFlow estimator and round-trip its trial component.

    Creates an experiment and a trial, runs a training job from the
    ``mnist/mnist.py`` resource script, looks up the trial component listed
    for the training job ARN, associates it with the trial, and finally
    detaches and deletes everything that was created.

    The test is skipped when ``py_version`` is ``None`` or contains ``'2'``.
    The trial and the experiment are deleted even when an intermediate step
    fails.
    """
    if py_version is None or '2' in py_version:
        # pytest.skip() raises, so nothing below runs for skipped versions.
        pytest.skip('Skipping python2 {}'.format(py_version))

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 6000)

    experiment_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )
    try:
        trial_name = (
            f"tf-container-integ-test-{unique_id}-{int(time.time())}")
        trial = Trial.create(experiment_name=experiment_name,
                             trial_name=trial_name,
                             sagemaker_boto_client=sm_client)
        try:
            training_job_name = utils.unique_name_from_base(
                "test-tf-experiments-mnist")

            # create a training job and wait for it to complete
            with timeout(minutes=15):
                resource_path = os.path.join(os.path.dirname(__file__), "..",
                                             "..", "resources")
                script = os.path.join(resource_path, "mnist", "mnist.py")
                estimator = TensorFlow(
                    entry_point=script,
                    role="SageMakerRole",
                    instance_type=instance_type,
                    instance_count=1,
                    sagemaker_session=sagemaker_session,
                    image_uri=ecr_image,
                    framework_version=framework_version,
                    script_mode=True,
                )
                inputs = estimator.sagemaker_session.upload_data(
                    path=os.path.join(resource_path, "mnist", "data"),
                    key_prefix="scriptmode/mnist")
                estimator.fit(inputs, job_name=training_job_name)

            training_job = sm_client.describe_training_job(
                TrainingJobName=training_job_name)
            training_job_arn = training_job["TrainingJobArn"]

            # verify trial component auto created from the training job
            trial_components = list(
                TrialComponent.list(source_arn=training_job_arn,
                                    sagemaker_boto_client=sm_client))
            assert trial_components, (
                "no trial component found for training job {}".format(
                    training_job_arn))

            trial_component_summary = trial_components[0]
            trial_component = TrialComponent.load(
                trial_component_name=trial_component_summary.
                trial_component_name,
                sagemaker_boto_client=sm_client,
            )

            # associate the trial component with the trial
            trial.add_trial_component(trial_component)

            # detach and remove the component again
            trial.remove_trial_component(
                trial_component_summary.trial_component_name)
            trial_component.delete()
        finally:
            # runs on failure too, so a broken run does not leak the trial
            trial.delete()
    finally:
        experiment.delete()
Exemple #4
0
def test_training(sagemaker_session, ecr_image, instance_type, instance_count):
    """Start an MXNet training job and round-trip its trial component.

    Creates an experiment and a trial, uploads the train/test data, starts a
    training job without waiting for it, polls until a trial component is
    listed for the training job ARN, associates that component with the
    trial, and finally detaches and deletes everything that was created.

    Polling is limited to one initial lookup plus ten retries; the test
    fails if no trial component shows up within that budget. The trial and
    the experiment are deleted even when an intermediate step fails.
    """

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "mxnet-container-integ-test-{}".format(int(time.time()))

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description=
        "Integration test experiment from sagemaker-mxnet-container",
        sagemaker_boto_client=sm_client,
    )
    try:
        trial_name = "mxnet-container-integ-test-{}".format(int(time.time()))
        trial = Trial.create(experiment_name=experiment_name,
                             trial_name=trial_name,
                             sagemaker_boto_client=sm_client)
        try:
            hyperparameters = {
                "random_seed": True,
                "num_steps": 50,
                "smdebug_path": "/opt/ml/output/tensors",
                "epochs": 1,
            }

            mx = MXNet(
                entry_point=SCRIPT_PATH,
                role="SageMakerRole",
                train_instance_count=instance_count,
                train_instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                image_name=ecr_image,
                hyperparameters=hyperparameters,
            )

            training_job_name = utils.unique_name_from_base(
                "test-mxnet-image")

            # upload the data and start the training job; fit() is called
            # with wait=False, so the job is only started here
            with timeout(minutes=15):
                prefix = "mxnet_mnist_gluon_basic_hook_demo/{}".format(
                    utils.sagemaker_timestamp())
                train_input = mx.sagemaker_session.upload_data(
                    path=os.path.join(DATA_PATH, "train"),
                    key_prefix=prefix + "/train")
                test_input = mx.sagemaker_session.upload_data(
                    path=os.path.join(DATA_PATH, "test"),
                    key_prefix=prefix + "/test")

                mx.fit({
                    "train": train_input,
                    "test": test_input
                },
                       job_name=training_job_name,
                       wait=False)

            training_job = sm_client.describe_training_job(
                TrainingJobName=training_job_name)
            training_job_arn = training_job["TrainingJobArn"]

            # verify trial component auto created from the training job
            max_retries = 10
            trial_component_summary = None
            attempts = 0
            while True:
                trial_components = list(
                    TrialComponent.list(source_arn=training_job_arn,
                                        sagemaker_boto_client=sm_client))

                if trial_components:
                    trial_component_summary = trial_components[0]
                    break

                # Stop once the retry budget is spent so that the assertion
                # below reports the failure instead of looping forever.
                if attempts >= max_retries:
                    break

                attempts += 1
                # NOTE(review): if `sleep` is time.sleep this waits 500
                # seconds per retry - confirm the intended unit.
                sleep(500)

            assert trial_component_summary is not None, (
                "no trial component found for training job {}".format(
                    training_job_arn))

            trial_component = TrialComponent.load(
                trial_component_name=trial_component_summary.
                trial_component_name,
                sagemaker_boto_client=sm_client,
            )

            # associate the trial component with the trial
            trial.add_trial_component(trial_component)

            # detach and remove the component again
            trial.remove_trial_component(
                trial_component_summary.trial_component_name)
            trial_component.delete()
        finally:
            # runs on failure too, so a broken run does not leak the trial
            trial.delete()
    finally:
        experiment.delete()
Exemple #5
0
def test_training(sagemaker_session, ecr_image, instance_type,
                  framework_version):
    """Train with a TensorFlow estimator and round-trip its trial component.

    Creates an experiment and a trial, runs a training job from the
    ``mnist/mnist.py`` resource script, looks up the trial component listed
    for the training job ARN, associates it with the trial, and finally
    detaches and deletes everything that was created.

    The trial and the experiment are deleted even when an intermediate step
    fails.
    """

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "tf-container-integ-test-{}".format(int(time.time()))

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )
    try:
        trial_name = "tf-container-integ-test-{}".format(int(time.time()))

        trial = Trial.create(experiment_name=experiment_name,
                             trial_name=trial_name,
                             sagemaker_boto_client=sm_client)
        try:
            training_job_name = utils.unique_name_from_base(
                "test-tf-experiments-mnist")

            # create a training job and wait for it to complete
            with timeout(minutes=DEFAULT_TIMEOUT):
                resource_path = os.path.join(os.path.dirname(__file__), "..",
                                             "..", "resources")
                script = os.path.join(resource_path, "mnist", "mnist.py")
                estimator = TensorFlow(
                    entry_point=script,
                    role="SageMakerRole",
                    train_instance_type=instance_type,
                    train_instance_count=1,
                    sagemaker_session=sagemaker_session,
                    image_name=ecr_image,
                    framework_version=framework_version,
                    script_mode=True,
                )
                inputs = estimator.sagemaker_session.upload_data(
                    path=os.path.join(resource_path, "mnist", "data"),
                    key_prefix="scriptmode/mnist")
                estimator.fit(inputs, job_name=training_job_name)

            training_job = sm_client.describe_training_job(
                TrainingJobName=training_job_name)
            training_job_arn = training_job["TrainingJobArn"]

            # verify trial component auto created from the training job
            trial_components = list(
                TrialComponent.list(source_arn=training_job_arn,
                                    sagemaker_boto_client=sm_client))
            assert trial_components, (
                "no trial component found for training job {}".format(
                    training_job_arn))

            trial_component_summary = trial_components[0]
            trial_component = TrialComponent.load(
                trial_component_name=trial_component_summary.
                trial_component_name,
                sagemaker_boto_client=sm_client,
            )

            # associate the trial component with the trial
            trial.add_trial_component(trial_component)

            # detach and remove the component again
            trial.remove_trial_component(
                trial_component_summary.trial_component_name)
            trial_component.delete()
        finally:
            # runs on failure too, so a broken run does not leak the trial
            trial.delete()
    finally:
        experiment.delete()
Exemple #6
0
def main():  # pragma: no cover
    """The main harness that creates or updates and runs the pipeline.

    Steps, in order:

    1. Parse the command line; print the help text and exit with status 2
       when the module name or the role ARN is missing.
    2. Build the pipeline, print its definition, upsert it and start an
       execution.
    3. Create (or load) an experiment named after the pipeline and create a
       trial named after the execution's display name.
    4. Collect the container arguments of the ``Processing`` step from the
       pipeline definition.
    5. Wait for the execution, then associate the processing and training
       job trial components with the trial.
    6. Log the collected processing arguments as parameters of the
       processing job's trial component.

    Any exception raised from step 2 onwards is printed and turned into
    exit status 1.

    NOTE(review): ``sm`` is not defined in this function; it is presumably a
    module-level SageMaker client - confirm.
    """
    parser = argparse.ArgumentParser(
        "Creates or updates and runs the pipeline for the pipeline script.")

    parser.add_argument(
        "-n",
        "--module-name",
        dest="module_name",
        type=str,
        help="The module name of the pipeline to import.",
    )
    parser.add_argument(
        "-kwargs",
        "--kwargs",
        dest="kwargs",
        default=None,
        help=
        "Dict string of keyword arguments for the pipeline generation (if supported)",
    )
    parser.add_argument(
        "-role-arn",
        "--role-arn",
        dest="role_arn",
        type=str,
        help="The role arn for the pipeline service execution role.",
    )
    parser.add_argument(
        "-description",
        "--description",
        dest="description",
        type=str,
        default=None,
        help="The description of the pipeline.",
    )
    parser.add_argument(
        "-tags",
        "--tags",
        dest="tags",
        default=None,
        help=
        """List of dict strings of '[{"Key": "string", "Value": "string"}, ..]'""",
    )
    args = parser.parse_args()

    if args.module_name is None or args.role_arn is None:
        parser.print_help()
        sys.exit(2)
    tags = convert_struct(args.tags)

    try:
        pipeline = get_pipeline_driver(args.module_name, args.kwargs)
        print(
            "###### Creating/updating a SageMaker Pipeline with the following definition:"
        )
        parsed = json.loads(pipeline.definition())
        print(json.dumps(parsed, indent=2, sort_keys=True))

        upsert_response = pipeline.upsert(role_arn=args.role_arn,
                                          description=args.description,
                                          tags=tags)
        print(
            "\n###### Created/Updated SageMaker Pipeline: Response received:")
        print(upsert_response)

        execution = pipeline.start()
        print(
            f"\n###### Execution started with PipelineExecutionArn: {execution.arn}"
        )

        # Now we describe execution instance and list the steps in the execution to find out more about the execution.
        execution_run = execution.describe()
        print(execution_run)

        # Create or Load the 'Experiment'
        try:
            experiment = Experiment.create(
                experiment_name=pipeline.name,
                description='Amazon Customer Reviews BERT Pipeline Experiment')
        except Exception:
            # A failed create is taken to mean the experiment already
            # exists; a bare ``except`` would also have swallowed Ctrl-C.
            experiment = Experiment.load(experiment_name=pipeline.name)

        print('Experiment name: {}'.format(experiment.experiment_name))

        # Add Execution Run as Trial to Experiments
        execution_run_name = execution_run['PipelineExecutionDisplayName']
        print(execution_run_name)

        # Create the `Trial`
        trial = Trial.create(trial_name=execution_run_name,
                             experiment_name=experiment.experiment_name,
                             sagemaker_boto_client=sm)

        trial_name = trial.trial_name
        print('Trial name: {}'.format(trial_name))

        ######################################################
        ## Parse Pipeline Definition For Processing Job Args
        ######################################################

        processing_param_dict = {}

        for step in parsed['Steps']:
            print('step: {}'.format(step))
            if step['Name'] == 'Processing':
                print('Step Name is Processing...')
                arg_list = step['Arguments']['AppSpecification'][
                    'ContainerArguments']
                print(arg_list)
                print(len(arg_list))

                # Arguments alternate "--key", "value". Pairing even and odd
                # positions with zip ignores a trailing flag that has no
                # value instead of failing with an IndexError.
                for raw_key, value in zip(arg_list[::2], arg_list[1::2]):
                    key = raw_key.replace('--', '')
                    print('arg key: {}'.format(key))
                    print('arg value: {}'.format(value))
                    processing_param_dict[key] = value

        ##############################
        ## Wait For Execution To Finish
        ##############################

        print("Waiting for the execution to finish...")
        execution.wait()
        print("\n#####Execution completed. Execution step details:")

        # List Execution Steps (fetched once and reused below)
        execution_steps = execution.list_steps()
        print(execution_steps)

        # List All Artifacts Generated By The Pipeline
        processing_job_name = None
        training_job_name = None

        from sagemaker.lineage.visualizer import LineageTableVisualizer

        viz = LineageTableVisualizer(sagemaker.session.Session())
        for execution_step in reversed(execution_steps):
            print(execution_step)
            # We are doing this because there appears to be a bug of this LineageTableVisualizer handling the Processing Step
            if execution_step['StepName'] == 'Processing':
                # the job name is the last path segment of the ARN
                processing_job_name = execution_step['Metadata'][
                    'ProcessingJob']['Arn'].split('/')[-1]
                print(processing_job_name)
                #display(viz.show(processing_job_name=processing_job_name))
            elif execution_step['StepName'] == 'Train':
                training_job_name = execution_step['Metadata']['TrainingJob'][
                    'Arn'].split('/')[-1]
                print(training_job_name)
                #display(viz.show(training_job_name=training_job_name))
            else:
                #display(viz.show(pipeline_execution_step=execution_step))
                time.sleep(5)

        # Add Trial Components To Experiment Trial
        processing_job_tc = '{}-aws-processing-job'.format(processing_job_name)
        print(processing_job_tc)

        # -aws-processing-job is the default name assigned by ProcessingJob
        sm.associate_trial_component(TrialComponentName=processing_job_tc,
                                     TrialName=trial_name)

        # -aws-training-job is the default name assigned by TrainingJob
        training_job_tc = '{}-aws-training-job'.format(training_job_name)
        print(training_job_tc)

        sm.associate_trial_component(TrialComponentName=training_job_tc,
                                     TrialName=trial_name)

        ##############
        # Log Additional Parameters within Trial
        ##############
        print('Logging Processing Job Parameters within Experiment Trial...')
        processing_job_tracker = tracker.Tracker.load(
            trial_component_name=processing_job_tc)

        for key, value in processing_param_dict.items():
            print('key: {}, value: {}'.format(key, value))
            processing_job_tracker.log_parameters({key: str(value)})
            # must save after logging
            processing_job_tracker.trial_component.save()

    except Exception as e:  # pylint: disable=W0703
        print(f"Exception: {e}")
        sys.exit(1)
def _test_training_function(ecr_image, sagemaker_session, instance_type,
                            framework_version):
    """Train with a TensorFlow estimator and round-trip its trial component.

    Creates an experiment and a trial, runs a training job from the
    ``mnist/mnist.py`` resource script, looks up the trial component listed
    for the training job ARN, associates it with the trial, and finally
    detaches and deletes everything that was created.

    The trial and the experiment are deleted even when an intermediate step
    fails.
    """
    sm_client = sagemaker_session.sagemaker_client
    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 6000)

    experiment_name = f"tf-container-integ-test-{unique_id}-{int(time.time())}"

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Integration test experiment from sagemaker-tf-container",
        sagemaker_boto_client=sm_client,
    )
    try:
        trial_name = (
            f"tf-container-integ-test-{unique_id}-{int(time.time())}")

        trial = Trial.create(experiment_name=experiment_name,
                             trial_name=trial_name,
                             sagemaker_boto_client=sm_client)
        try:
            training_job_name = utils.unique_name_from_base(
                "test-tf-experiments-mnist")

            # create a training job and wait for it to complete
            with timeout(minutes=DEFAULT_TIMEOUT):
                resource_path = os.path.join(os.path.dirname(__file__), "..",
                                             "..", "resources")
                script = os.path.join(resource_path, "mnist", "mnist.py")
                estimator = TensorFlow(
                    model_dir=False,
                    entry_point=script,
                    role="SageMakerRole",
                    instance_type=instance_type,
                    instance_count=1,
                    sagemaker_session=sagemaker_session,
                    image_uri=ecr_image,
                    framework_version=framework_version,
                )
                inputs = estimator.sagemaker_session.upload_data(
                    path=os.path.join(resource_path, "mnist", "data"),
                    key_prefix="scriptmode/mnist")
                estimator.fit(inputs, job_name=training_job_name)

            training_job = sm_client.describe_training_job(
                TrainingJobName=training_job_name)
            training_job_arn = training_job["TrainingJobArn"]

            # verify trial component auto created from the training job
            trial_components = list(
                TrialComponent.list(source_arn=training_job_arn,
                                    sagemaker_boto_client=sm_client))
            assert trial_components, (
                "no trial component found for training job {}".format(
                    training_job_arn))

            trial_component_summary = trial_components[0]
            trial_component = TrialComponent.load(
                trial_component_name=trial_component_summary.
                trial_component_name,
                sagemaker_boto_client=sm_client,
            )

            # associate the trial component with the trial
            trial.add_trial_component(trial_component)

            # detach and remove the component again
            trial.remove_trial_component(
                trial_component_summary.trial_component_name)
            trial_component.delete()
        finally:
            # runs on failure too, so a broken run does not leak the trial
            trial.delete()
    finally:
        # Prevent throttling to avoid deleting experiment before it's updated with trial deletion
        time.sleep(1.2)
        experiment.delete()
def test_training(sagemaker_session, ecr_image, instance_type):
    """Train with a PyTorch estimator and round-trip its trial component.

    Creates an experiment and a trial, uploads ``training_dir``, runs a
    training job with ``smdebug_mnist_script`` as entry point, looks up the
    trial component listed for the training job ARN, associates it with the
    trial, and finally detaches and deletes everything that was created.

    The trial and the experiment are deleted even when an intermediate step
    fails.
    """

    from smexperiments.experiment import Experiment
    from smexperiments.trial import Trial
    from smexperiments.trial_component import TrialComponent

    sm_client = sagemaker_session.sagemaker_client

    experiment_name = "pytorch-container-integ-test-{}".format(int(
        time.time()))

    experiment = Experiment.create(
        experiment_name=experiment_name,
        description=
        "Integration test full customer e2e from sagemaker-pytorch-container",
        sagemaker_boto_client=sm_client,
    )
    try:
        trial_name = "pytorch-container-integ-test-{}".format(
            int(time.time()))
        trial = Trial.create(experiment_name=experiment_name,
                             trial_name=trial_name,
                             sagemaker_boto_client=sm_client)
        try:
            hyperparameters = {
                "random_seed": True,
                "num_steps": 50,
                "smdebug_path": "/opt/ml/output/tensors",
                "epochs": 1,
                "data_dir": training_dir,
            }

            training_job_name = utils.unique_name_from_base(
                "test-pytorch-experiments-image")

            # create a training job and wait for it to complete
            with timeout(minutes=DEFAULT_TIMEOUT):
                pytorch = PyTorch(
                    entry_point=smdebug_mnist_script,
                    role="SageMakerRole",
                    train_instance_count=1,
                    train_instance_type=instance_type,
                    sagemaker_session=sagemaker_session,
                    image_name=ecr_image,
                    hyperparameters=hyperparameters,
                )
                training_input = pytorch.sagemaker_session.upload_data(
                    path=training_dir, key_prefix="pytorch/mnist")
                pytorch.fit({"training": training_input},
                            job_name=training_job_name)

            training_job = sm_client.describe_training_job(
                TrainingJobName=training_job_name)
            training_job_arn = training_job["TrainingJobArn"]

            # verify trial component auto created from the training job
            trial_components = list(
                TrialComponent.list(source_arn=training_job_arn,
                                    sagemaker_boto_client=sm_client))
            assert trial_components, (
                "no trial component found for training job {}".format(
                    training_job_arn))

            trial_component_summary = trial_components[0]
            trial_component = TrialComponent.load(
                trial_component_name=trial_component_summary.
                trial_component_name,
                sagemaker_boto_client=sm_client,
            )

            # associate the trial component with the trial
            trial.add_trial_component(trial_component)

            # detach and remove the component again
            trial.remove_trial_component(
                trial_component_summary.trial_component_name)
            trial_component.delete()
        finally:
            # runs on failure too, so a broken run does not leak the trial
            trial.delete()
    finally:
        experiment.delete()