Example 1
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)
    client = _utils.get_sagemaker_client(args.region)
    logging.info(
        'Submitting HyperParameter Tuning Job request to SageMaker...')
    hpo_job_name = _utils.create_hyperparameter_tuning_job(client, vars(args))
    logging.info(
        'HyperParameter Tuning Job request submitted. Waiting for completion...'
    )
    _utils.wait_for_hyperparameter_training_job(client, hpo_job_name)
    best_job, best_hyperparameters = _utils.get_best_training_job_and_hyperparameters(
        client, hpo_job_name)
    model_artifact_url = _utils.get_model_artifacts_from_job(client, best_job)
    image = _utils.get_image_from_job(client, best_job)

    logging.info('HyperParameter Tuning Job completed.')

    _utils.write_output(args.hpo_job_name_output_path, hpo_job_name)
    _utils.write_output(args.model_artifact_url_output_path,
                        model_artifact_url)
    _utils.write_output(args.best_job_name_output_path, best_job)
    _utils.write_output(args.best_hyperparameters_output_path,
                        best_hyperparameters,
                        json_encode=True)
    _utils.write_output(args.training_image_output_path, image)
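
Example 1 hands all of its output writing to `_utils.write_output`, which is not reproduced on this page. A minimal sketch of such a helper, assuming it does nothing more than write the value to the given path and JSON-encode it when json_encode=True:

import json
from pathlib import Path


def write_output(output_path, output_value, json_encode=False):
    """Hypothetical sketch of the write_output helper used above.

    Writes output_value to output_path, JSON-encoding it first when
    json_encode=True (as done for best_hyperparameters in Example 1).
    """
    write_value = json.dumps(output_value) if json_encode else output_value
    # The pipeline runtime supplies the path; its parent may not exist yet.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    Path(output_path).write_text(write_value)
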
Example 2
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)
    client = _utils.get_sagemaker_client(args.region)
    logging.info(
        'Submitting HyperParameter Tuning Job request to SageMaker...')
    hpo_job_name = _utils.create_hyperparameter_tuning_job(client, vars(args))
    logging.info(
        'HyperParameter Tuning Job request submitted. Waiting for completion...'
    )
    _utils.wait_for_hyperparameter_training_job(client, hpo_job_name)
    best_job, best_hyperparameters = _utils.get_best_training_job_and_hyperparameters(
        client, hpo_job_name)
    model_artifact_url = _utils.get_model_artifacts_from_job(client, best_job)
    image = _utils.get_image_from_job(client, best_job)

    logging.info('HyperParameter Tuning Job completed.')

    with open('/tmp/hpo_job_name.txt', 'w') as f:
        f.write(hpo_job_name)
    with open('/tmp/best_job_name.txt', 'w') as f:
        f.write(best_job)
    with open('/tmp/best_hyperparameters.txt', 'w') as f:
        f.write(json.dumps(best_hyperparameters))
    with open('/tmp/model_artifact_url.txt', 'w') as f:
        f.write(model_artifact_url)
    with open('/tmp/training_image.txt', 'w') as f:
        f.write(image)
Example 3
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)
    client = _utils.get_sagemaker_client(args.region, args.endpoint_url)

    logging.info('Submitting Training Job to SageMaker...')
    job_name = _utils.create_training_job(client, vars(args))
    logging.info('Job request submitted. Waiting for completion...')
    _utils.wait_for_training_job(client, job_name)

    image = _utils.get_image_from_job(client, job_name)
    model_artifact_url = _utils.get_model_artifacts_from_job(client, job_name)
    logging.info('Got model artifacts %s from training job %s.',
                 model_artifact_url, job_name)

    with open('/tmp/model_artifact_url.txt', 'w') as f:
        f.write(model_artifact_url)
    with open('/tmp/job_name.txt', 'w') as f:
        f.write(job_name)
    with open('/tmp/training_image.txt', 'w') as f:
        f.write(image)

    logging.info('Job completed.')
Example 4
def main(argv=None):
  parser = create_parser()
  args = parser.parse_args(argv)

  logging.getLogger().setLevel(logging.INFO)
  client = _utils.get_sagemaker_client(args.region, args.endpoint_url)

  logging.info('Submitting Training Job to SageMaker...')
  job_name = _utils.create_training_job(client, vars(args))

  def signal_term_handler(signalNumber, frame):
    _utils.stop_training_job(client, job_name)
    logging.info(f"Training Job: {job_name} request submitted to Stop")
  signal.signal(signal.SIGTERM, signal_term_handler)

  logging.info('Job request submitted. Waiting for completion...')
  try:
    _utils.wait_for_training_job(client, job_name)
    _utils.wait_for_debug_rules(client, job_name)
  finally:
    # Always surface the job's CloudWatch logs, whether it succeeded or failed.
    cw_client = _utils.get_cloudwatch_client(args.region)
    _utils.print_logs_for_job(cw_client, '/aws/sagemaker/TrainingJobs', job_name)

  image = _utils.get_image_from_job(client, job_name)
  model_artifact_url = _utils.get_model_artifacts_from_job(client, job_name)
  logging.info('Got model artifacts %s from training job %s.', model_artifact_url, job_name)

  _utils.write_output(args.model_artifact_url_output_path, model_artifact_url)
  _utils.write_output(args.job_name_output_path, job_name)
  _utils.write_output(args.training_image_output_path, image)

  logging.info('Job completed.')
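
Example 4's finally block calls `_utils.print_logs_for_job` so the SageMaker training logs are shown whether the job succeeds or fails. That helper is not shown here; a rough sketch, assuming it streams events from the job's CloudWatch Logs streams with the standard boto3 logs client:

import logging


def print_logs_for_job(cw_client, log_grp, job_name):
    """Hypothetical sketch: print CloudWatch log events for a SageMaker job.

    cw_client is assumed to be a boto3 CloudWatch Logs client and log_grp a
    log group name such as '/aws/sagemaker/TrainingJobs'.
    """
    # SageMaker prefixes each instance's log stream with '<job name>/'.
    streams = cw_client.describe_log_streams(
        logGroupName=log_grp, logStreamNamePrefix=job_name + '/')
    for stream in streams.get('logStreams', []):
        events = cw_client.get_log_events(
            logGroupName=log_grp,
            logStreamName=stream['logStreamName'],
            startFromHead=True)
        for event in events['events']:
            logging.info(event['message'])
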
Example 5
    def test_get_image_from_defined_job(self):
        mock_client = MagicMock()
        mock_client.describe_training_job.return_value = {
            "AlgorithmSpecification": {
                "TrainingImage": "training-image-url"
            }
        }

        self.assertEqual(
            _utils.get_image_from_job(mock_client, 'training-job'),
            "training-image-url")
Example 6
    def test_get_image_from_algorithm_job(self):
        mock_client = MagicMock()
        mock_client.describe_hyper_parameter_tuning_job.return_value = {
            "TrainingJobDefinition": {
                "AlgorithmSpecification": {
                    "AlgorithmName": "my-algorithm"
                }
            }
        }
        mock_client.describe_algorithm.return_value = {
            "TrainingSpecification": {
                "TrainingImage": "training-image-url"
            }
        }

        self.assertEqual(
            _utils.get_image_from_job(mock_client, 'training-job'),
            "training-image-url")
Example 7
def main(argv=None):
    parser = create_parser()
    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)
    client = _utils.get_sagemaker_client(args.region,
                                         args.endpoint_url,
                                         assume_role_arn=args.assume_role)
    logging.info(
        'Submitting HyperParameter Tuning Job request to SageMaker...')
    hpo_job_name = _utils.create_hyperparameter_tuning_job(client, vars(args))

    def signal_term_handler(signalNumber, frame):
        _utils.stop_hyperparameter_tuning_job(client, hpo_job_name)
        logging.info(
            f"Stop request submitted for hyperparameter tuning job: {hpo_job_name}"
        )

    signal.signal(signal.SIGTERM, signal_term_handler)

    logging.info(
        'HyperParameter Tuning Job request submitted. Waiting for completion...'
    )
    _utils.wait_for_hyperparameter_training_job(client, hpo_job_name)
    best_job, best_hyperparameters = _utils.get_best_training_job_and_hyperparameters(
        client, hpo_job_name)
    model_artifact_url = _utils.get_model_artifacts_from_job(client, best_job)
    image = _utils.get_image_from_job(client, best_job)

    logging.info('HyperParameter Tuning Job completed.')

    _utils.write_output(args.hpo_job_name_output_path, hpo_job_name)
    _utils.write_output(args.model_artifact_url_output_path,
                        model_artifact_url)
    _utils.write_output(args.best_job_name_output_path, best_job)
    _utils.write_output(args.best_hyperparameters_output_path,
                        best_hyperparameters,
                        json_encode=True)
    _utils.write_output(args.training_image_output_path, image)
Example 8
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='SageMaker Hyperparameter Tuning Job')
    parser.add_argument('--region',
                        type=str.strip,
                        required=True,
                        help='The region where the cluster launches.')
    parser.add_argument(
        '--job_name',
        type=str.strip,
        required=False,
        help=
        'The name of the tuning job. Must be unique within the same AWS account and AWS region.'
    )
    parser.add_argument(
        '--role',
        type=str.strip,
        required=True,
        help=
        'The Amazon Resource Name (ARN) that Amazon SageMaker assumes to perform tasks on your behalf.'
    )
    parser.add_argument(
        '--image',
        type=str.strip,
        required=False,
        help=
        'The registry path of the Docker image that contains the training algorithm.',
        default='')
    parser.add_argument(
        '--algorithm_name',
        type=str.strip,
        required=False,
        help=
        'The name of the resource algorithm to use for the hyperparameter tuning job.',
        default='')
    parser.add_argument(
        '--training_input_mode',
        choices=['File', 'Pipe'],
        type=str.strip,
        required=False,
        help='The input mode that the algorithm supports. File or Pipe.',
        default='File')
    parser.add_argument(
        '--metric_definitions',
        type=_utils.str_to_json_dict,
        required=False,
        help=
        'A dictionary of name-regex pairs that specify the metrics the algorithm emits.',
        default='{}')
    parser.add_argument(
        '--strategy',
        choices=['Bayesian', 'Random'],
        type=str.strip,
        required=False,
        help=
        'How hyperparameter tuning chooses the combinations of hyperparameter values to use for the training job it launches.',
        default='Bayesian')
    parser.add_argument(
        '--metric_name',
        type=str.strip,
        required=True,
        help='The name of the metric to use for the objective metric.')
    parser.add_argument(
        '--metric_type',
        choices=['Maximize', 'Minimize'],
        type=str.strip,
        required=True,
        help='Whether to minimize or maximize the objective metric.')
    parser.add_argument(
        '--early_stopping_type',
        choices=['Off', 'Auto'],
        type=str.strip,
        required=False,
        help='Whether to use early stopping for the training jobs launched by the tuning job (Off or Auto).',
        default='Off')
    parser.add_argument(
        '--static_parameters',
        type=_utils.str_to_json_dict,
        required=False,
        help=
        'The values of hyperparameters that do not change for the tuning job.',
        default='{}')
    parser.add_argument(
        '--integer_parameters',
        type=_utils.str_to_json_list,
        required=False,
        help=
        'The array of IntegerParameterRange objects that specify ranges of integer hyperparameters that you want to search.',
        default='[]')
    parser.add_argument(
        '--continuous_parameters',
        type=_utils.str_to_json_list,
        required=False,
        help=
        'The array of ContinuousParameterRange objects that specify ranges of continuous hyperparameters that you want to search.',
        default='[]')
    parser.add_argument(
        '--categorical_parameters',
        type=_utils.str_to_json_list,
        required=False,
        help=
        'The array of CategoricalParameterRange objects that specify ranges of categorical hyperparameters that you want to search.',
        default='[]')
    parser.add_argument(
        '--channels',
        type=_utils.str_to_json_list,
        required=True,
        help=
        'A list of dicts specifying the input channels. Must have at least one.'
    )
    parser.add_argument(
        '--data_location_1',
        type=str.strip,
        required=False,
        help='The S3 URI of the input data source for channel 1.',
        default='')
    parser.add_argument(
        '--data_location_2',
        type=str.strip,
        required=False,
        help='The S3 URI of the input data source for channel 2.',
        default='')
    parser.add_argument(
        '--data_location_3',
        type=str.strip,
        required=False,
        help='The S3 URI of the input data source for channel 3.',
        default='')
    parser.add_argument(
        '--data_location_4',
        type=str.strip,
        required=False,
        help='The S3 URI of the input data source for channel 4.',
        default='')
    parser.add_argument(
        '--data_location_5',
        type=str.strip,
        required=False,
        help='The S3 URI of the input data source for channel 5.',
        default='')
    parser.add_argument(
        '--data_location_6',
        type=str.strip,
        required=False,
        help='The S3 URI of the input data source for channel 6.',
        default='')
    parser.add_argument(
        '--data_location_7',
        type=str.strip,
        required=False,
        help='The S3 URI of the input data source for channel 7.',
        default='')
    parser.add_argument(
        '--data_location_8',
        type=str.strip,
        required=False,
        help='The S3 URI of the input data source for channel 8.',
        default='')
    parser.add_argument(
        '--output_location',
        type=str.strip,
        required=True,
        help=
        'The Amazon S3 path where you want Amazon SageMaker to store the model artifacts of the training jobs launched by the tuning job.'
    )
    parser.add_argument(
        '--output_encryption_key',
        type=str.strip,
        required=False,
        help=
        'The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts.',
        default='')
    parser.add_argument(
        '--instance_type',
        choices=[
            'ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge',
            'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge',
            'ml.m5.4xlarge', 'ml.m5.12xlarge', 'ml.m5.24xlarge',
            'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge',
            'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge',
            'ml.p3.8xlarge', 'ml.p3.16xlarge', 'ml.c5.xlarge', 'ml.c5.2xlarge',
            'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'
        ],
        type=str.strip,
        required=False,
        help='The ML compute instance type.',
        default='ml.m4.xlarge')
    parser.add_argument(
        '--instance_count',
        type=_utils.str_to_int,
        required=False,
        help='The number of ML compute instances to use in each training job.',
        default=1)
    parser.add_argument(
        '--volume_size',
        type=_utils.str_to_int,
        required=False,
        help='The size of the ML storage volume that you want to provision.',
        default=1)
    parser.add_argument(
        '--max_num_jobs',
        type=_utils.str_to_int,
        required=True,
        help=
        'The maximum number of training jobs that a hyperparameter tuning job can launch.'
    )
    parser.add_argument(
        '--max_parallel_jobs',
        type=_utils.str_to_int,
        required=True,
        help=
        'The maximum number of concurrent training jobs that a hyperparameter tuning job can launch.'
    )
    parser.add_argument(
        '--max_run_time',
        type=_utils.str_to_int,
        required=False,
        help='The maximum run time in seconds per training job.',
        default=86400)
    parser.add_argument(
        '--resource_encryption_key',
        type=str.strip,
        required=False,
        help=
        'The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).',
        default='')
    parser.add_argument(
        '--vpc_security_group_ids',
        type=str.strip,
        required=False,
        help='The VPC security group IDs, in the form sg-xxxxxxxx.')
    parser.add_argument(
        '--vpc_subnets',
        type=str.strip,
        required=False,
        help=
        'The IDs of the subnets in the VPC to which you want to connect your hyperparameter tuning job.'
    )
    parser.add_argument('--network_isolation',
                        type=_utils.str_to_bool,
                        required=False,
                        help='Isolates the training container.',
                        default=True)
    parser.add_argument(
        '--traffic_encryption',
        type=_utils.str_to_bool,
        required=False,
        help=
        'Encrypts all communications between ML compute instances in distributed training.',
        default=False)
    parser.add_argument(
        '--warm_start_type',
        choices=['IdenticalDataAndAlgorithm', 'TransferLearning', ''],
        type=str.strip,
        required=False,
        help=
        'Specifies either "IdenticalDataAndAlgorithm" or "TransferLearning"')
    parser.add_argument(
        '--parent_hpo_jobs',
        type=str.strip,
        required=False,
        help=
        'List of previously completed or stopped hyperparameter tuning jobs to be used as a starting point.',
        default='')
    parser.add_argument(
        '--tags',
        type=_utils.str_to_json_dict,
        required=False,
        help='An array of key-value pairs, to categorize AWS resources.',
        default='{}')

    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)
    client = _utils.get_client(args.region)
    logging.info(
        'Submitting HyperParameter Tuning Job request to SageMaker...')
    hpo_job_name = _utils.create_hyperparameter_tuning_job(client, vars(args))
    logging.info(
        'HyperParameter Tuning Job request submitted. Waiting for completion...'
    )
    _utils.wait_for_hyperparameter_training_job(client, hpo_job_name)
    best_job, best_hyperparameters = _utils.get_best_training_job_and_hyperparameters(
        client, hpo_job_name)
    model_artifact_url = _utils.get_model_artifacts_from_job(client, best_job)
    image = _utils.get_image_from_job(client, best_job)

    logging.info('HyperParameter Tuning Job completed.')

    with open('/tmp/best_job_name.txt', 'w') as f:
        f.write(best_job)
    with open('/tmp/best_hyperparameters.txt', 'w') as f:
        f.write(json.dumps(best_hyperparameters))
    with open('/tmp/model_artifact_url.txt', 'w') as f:
        f.write(model_artifact_url)
    with open('/tmp/training_image.txt', 'w') as f:
        f.write(image)
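
Example 8's parser (like the ones in the other examples) relies on helpers such as `_utils.str_to_json_dict`, `_utils.str_to_json_list`, `_utils.str_to_int`, and `_utils.str_to_bool` as argparse type callables, so JSON and boolean flags arrive as Python objects rather than raw strings. Their bodies are not shown on this page; plausible minimal versions, assuming they do nothing beyond parsing the string form:

import json


def str_to_json_dict(s):
    """Sketch: parse a JSON object passed on the command line, e.g. '{}'."""
    return json.loads(s)


def str_to_json_list(s):
    """Sketch: parse a JSON array passed on the command line, e.g. '[]'."""
    return json.loads(s)


def str_to_int(s):
    """Sketch: parse an integer flag, tolerating surrounding whitespace."""
    return int(str(s).strip())


def str_to_bool(s):
    """Sketch: treat 'True'/'true' as True and anything else as False."""
    return str(s).strip().lower() == 'true'
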
Example 9
def main(argv=None):
    parser = argparse.ArgumentParser(description='SageMaker Training Job')
    parser.add_argument('--region',
                        type=str.strip,
                        required=True,
                        help='The region where the training job launches.')
    parser.add_argument('--job_name',
                        type=str.strip,
                        required=False,
                        help='The name of the training job.',
                        default='')
    parser.add_argument(
        '--role',
        type=str.strip,
        required=True,
        help=
        'The Amazon Resource Name (ARN) that Amazon SageMaker assumes to perform tasks on your behalf.'
    )
    parser.add_argument(
        '--image',
        type=str.strip,
        required=True,
        help=
        'The registry path of the Docker image that contains the training algorithm.',
        default='')
    parser.add_argument(
        '--algorithm_name',
        type=str.strip,
        required=False,
        help='The name of the resource algorithm to use for the training job.',
        default='')
    parser.add_argument(
        '--metric_definitions',
        type=_utils.str_to_json_dict,
        required=False,
        help=
        'A dictionary of name-regex pairs that specify the metrics the algorithm emits.',
        default='{}')
    parser.add_argument(
        '--training_input_mode',
        choices=['File', 'Pipe'],
        type=str.strip,
        help='The input mode that the algorithm supports. File or Pipe.',
        default='File')
    parser.add_argument(
        '--hyperparameters',
        type=_utils.str_to_json_dict,
        help='Dictionary of hyperparameters for the algorithm.',
        default='{}')
    parser.add_argument(
        '--channels',
        type=_utils.str_to_json_list,
        required=True,
        help=
        'A list of dicts specifying the input channels. Must have at least one.'
    )
    parser.add_argument(
        '--instance_type',
        required=True,
        choices=[
            'ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge',
            'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge',
            'ml.m5.4xlarge', 'ml.m5.12xlarge', 'ml.m5.24xlarge',
            'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge',
            'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge',
            'ml.p3.8xlarge', 'ml.p3.16xlarge', 'ml.c5.xlarge', 'ml.c5.2xlarge',
            'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'
        ],
        type=str.strip,
        help='The ML compute instance type.',
        default='ml.m4.xlarge')
    parser.add_argument(
        '--instance_count',
        required=True,
        type=_utils.str_to_int,
        help=
        'The number of ML compute instances to use in the training job.',
        default=1)
    parser.add_argument(
        '--volume_size',
        type=_utils.str_to_int,
        required=True,
        help='The size of the ML storage volume that you want to provision.',
        default=1)
    parser.add_argument(
        '--resource_encryption_key',
        type=str.strip,
        required=False,
        help=
        'The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).',
        default='')
    parser.add_argument(
        '--max_run_time',
        type=_utils.str_to_int,
        required=True,
        help='The maximum run time in seconds for the training job.',
        default=86400)
    parser.add_argument(
        '--model_artifact_path',
        type=str.strip,
        required=True,
        help=
        'Identifies the S3 path where you want Amazon SageMaker to store the model artifacts.'
    )
    parser.add_argument(
        '--output_encryption_key',
        type=str.strip,
        required=False,
        help=
        'The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts.',
        default='')
    parser.add_argument(
        '--vpc_security_group_ids',
        type=str.strip,
        required=False,
        help='The VPC security group IDs, in the form sg-xxxxxxxx.')
    parser.add_argument(
        '--vpc_subnets',
        type=str.strip,
        required=False,
        help=
        'The IDs of the subnets in the VPC to which you want to connect your training job.'
    )
    parser.add_argument('--network_isolation',
                        type=_utils.str_to_bool,
                        required=False,
                        help='Isolates the training container.',
                        default=True)
    parser.add_argument(
        '--traffic_encryption',
        type=_utils.str_to_bool,
        required=False,
        help=
        'Encrypts all communications between ML compute instances in distributed training.',
        default=False)
    parser.add_argument(
        '--tags',
        type=_utils.str_to_json_dict,
        required=False,
        help='An array of key-value pairs, to categorize AWS resources.',
        default='{}')
    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)
    client = _utils.get_client(args.region)

    logging.info('Submitting Training Job to SageMaker...')
    job_name = _utils.create_training_job(client, vars(args))
    logging.info('Job request submitted. Waiting for completion...')
    _utils.wait_for_training_job(client, job_name)

    image = _utils.get_image_from_job(client, job_name)
    model_artifact_url = _utils.get_model_artifacts_from_job(client, job_name)
    logging.info('Got model artifacts %s from training job %s.',
                 model_artifact_url, job_name)

    with open('/tmp/model_artifact_url.txt', 'w') as f:
        f.write(model_artifact_url)
    with open('/tmp/job_name.txt', 'w') as f:
        f.write(job_name)
    with open('/tmp/training_image.txt', 'w') as f:
        f.write(image)

    logging.info('Job completed.')
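
Since each main accepts an optional argv list, these components can also be exercised directly without a pipeline runner. A hypothetical local invocation of Example 9's entry point, with placeholder values for every flag (running it would submit a real training job if valid AWS credentials are configured):

import json

# All values below are placeholders for illustration only.
channels = json.dumps([{
    'ChannelName': 'train',
    'DataSource': {'S3DataSource': {
        'S3Uri': 's3://my-bucket/train',
        'S3DataType': 'S3Prefix',
        'S3DataDistributionType': 'FullyReplicated'}},
    'ContentType': 'text/csv',
    'InputMode': 'File',
}])

main([
    '--region', 'us-east-1',
    '--role', 'arn:aws:iam::123456789012:role/SageMakerRole',
    '--image', '123456789012.dkr.ecr.us-east-1.amazonaws.com/my-algo:latest',
    '--channels', channels,
    '--instance_type', 'ml.m4.xlarge',
    '--instance_count', '1',
    '--volume_size', '50',
    '--max_run_time', '3600',
    '--model_artifact_path', 's3://my-bucket/output',
])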