def _fit_first_model(self, input_data_s3_prefix=None, manifest_file_path=None, wait=False, logs=True):
    """Kick off the experiment's very first model via an Estimator fit() call.

    Args:
        input_data_s3_prefix (str): S3 prefix of the training data; used
            when no manifest file is supplied.
        manifest_file_path (str): Manifest file providing the training data;
            takes precedence over input_data_s3_prefix when set.
        wait (bool): Whether to block until the job completes.
        logs (bool): Whether to show job logs (meaningful when wait is True).
    """
    self.rl_estimator = RLEstimator(**self._get_rl_estimator_args())
    # A manifest file, when provided, wins over the plain S3 prefix.
    if manifest_file_path:
        training_input = sagemaker.session.s3_input(
            s3_data=manifest_file_path,
            input_mode="File",
            s3_data_type="ManifestFile",
        )
    else:
        training_input = input_data_s3_prefix
    self.rl_estimator.fit(
        job_name=self.model_id,
        inputs=training_input,
        wait=wait,
        logs=logs,
    )
def fit(self, input_model_id=None, input_data_s3_prefix=None, manifest_file_path=None, wait=False, logs=True):
    """Start a model training job via an Estimator fit() call.

    Args:
        input_model_id (str): Model id of the model to use as a pre-trained
            model for this training job; None means train from scratch.
        input_data_s3_prefix (str): Location of the S3 data to train on.
        manifest_file_path (str): Manifest file used to provide training data.
        wait (bool): Whether to block until the job completes. Only
            meaningful when running in SageMaker mode.
        logs (bool): Whether to show the logs produced by the job. Only
            meaningful when wait is True (default: True).
    """
    # Persist the new training-job metadata to the DDb record first.
    self.model_record.add_new_training_job_info(
        input_model_id=input_model_id,
        input_data_s3_prefix=input_data_s3_prefix,
        manifest_file_path=manifest_file_path,
    )
    self.model_db_client.update_model_record(self._jsonify())

    if input_model_id is None:
        # No pre-trained model: this is the experiment's first model.
        self._fit_first_model(
            input_data_s3_prefix=input_data_s3_prefix,
            manifest_file_path=manifest_file_path,
            wait=wait,
            logs=logs,
        )
        return

    # Warm-start from 'input_model_id': look up its artifact and feed it
    # to the estimator as the pre-trained model channel.
    pretrained_record = self.model_db_client.get_model_record(
        self.experiment_id, input_model_id
    )
    estimator_args = self._get_rl_estimator_args()
    estimator_args["model_channel_name"] = "pretrained_model"
    estimator_args["model_uri"] = pretrained_record.get("s3_model_output_path")
    self.rl_estimator = RLEstimator(**estimator_args)

    training_input = (
        sagemaker.session.s3_input(
            s3_data=manifest_file_path, s3_data_type="ManifestFile"
        )
        if manifest_file_path
        else input_data_s3_prefix
    )
    self.rl_estimator.fit(
        job_name=self.model_id,
        inputs=training_input,
        wait=wait,
        logs=logs,
    )
class ModelManager:
    """A model entity within the given experiment.

    This class handles model creation, model training, model evaluation
    and model metadata management, persisting state to the model table
    (DynamoDB) via the supplied ModelDbClient.
    """

    def __init__(
        self,
        model_db_client: ModelDbClient,
        experiment_id,
        model_id,
        image=None,
        role=None,
        instance_config=None,
        boto_session=None,
        algor_config=None,
        train_state=None,
        evaluation_job_name=None,
        eval_state=None,
        eval_scores=None,
        input_model_id=None,
        rl_estimator=None,
        input_data_s3_prefix=None,
        manifest_file_path=None,
        eval_data_s3_path=None,
        s3_model_output_path=None,
        training_start_time=None,
        training_end_time=None,
    ):
        """Initialize a model entity in the current experiment.

        Args:
            model_db_client (ModelDbClient): A DynamoDB client to query the
                model table. The 'Model' entity uses this client to
                read/update the model state.
            experiment_id (str): A unique id for the experiment. The
                created/loaded model will be associated with it.
            model_id (str): A unique id for the model. The model table uses
                model id to manage associated model metadata.
            image (str): The container image to use for training/evaluation.
            role (str): An AWS IAM role (either name or full ARN). The
                Amazon SageMaker training jobs will use this role to access
                AWS resources.
            instance_config (dict): Resource configuration for the model
                training/evaluation job. Defaults to an empty dict.
            boto_session (boto3.session.Session): A session that stores
                configuration state and allows creating service clients
                and resources.
            algor_config (dict): Algorithm type and hyper parameters of the
                training/evaluation job. Defaults to an empty dict.
            train_state (str): State of the model training job.
            evaluation_job_name (str): Job name of the latest evaluation
                job for this model.
            eval_state (str): State of the model evaluation job.
            eval_scores (dict): Evaluation scores of this model. Defaults
                to an empty dict.
            input_model_id (str): A unique model id specifying which model
                to use as a pre-trained model for the training job.
            rl_estimator (sagemaker.rl.estimator.RLEstimator): Accepted for
                backward compatibility; not stored (estimators are created
                on demand by fit()/evaluate()).
            input_data_s3_prefix (str): Input data path for the data source
                of the model training job.
            manifest_file_path (str): Manifest file used to provide
                training data.
            eval_data_s3_path (str): Input data path for the data source of
                the model evaluation job.
            s3_model_output_path (str): Output data path of the model
                artifact for the model training job.
            training_start_time (str): Start timestamp of the training job.
            training_end_time (str): Finish timestamp of the training job.

        Raises:
            UnhandledWorkflowException: If creating a new model record
                fails for a reason other than the record already existing.
        """
        # BUGFIX: the dict defaults used to be mutable ({} in the
        # signature) and therefore shared across instances; normalize
        # None -> fresh dict instead. Behavior for callers is unchanged.
        instance_config = {} if instance_config is None else instance_config
        algor_config = {} if algor_config is None else algor_config
        eval_scores = {} if eval_scores is None else eval_scores

        self.model_db_client = model_db_client
        self.experiment_id = experiment_id
        self.model_id = model_id

        # Currently we are not storing image/role and other model params in ModelDb
        self.image = image
        self.role = role
        self.instance_config = instance_config
        self.algor_config = algor_config

        # load configs
        self.instance_type = self.instance_config.get("instance_type", "local")
        self.instance_count = self.instance_config.get("instance_count", 1)
        self.algor_params = self.algor_config.get("algorithms_parameters", {})

        # create a local ModelRecord object.
        self.model_record = ModelRecord(
            experiment_id,
            model_id,
            train_state,
            evaluation_job_name,
            eval_state,
            eval_scores,
            input_model_id,
            input_data_s3_prefix,
            manifest_file_path,
            eval_data_s3_path,
            s3_model_output_path,
            training_start_time,
            training_end_time,
        )

        # Try to save this record file. If it throws
        # RecordAlreadyExistsException, reload the record from ModelDb
        # and recreate the local ModelRecord from it.
        try:
            self.model_db_client.create_new_model_record(
                self.model_record.to_ddb_record())
        except RecordAlreadyExistsException:
            logger.debug("Model already exists. Reloading from model record.")
            model_record = self.model_db_client.get_model_record(
                experiment_id, model_id)
            self.model_record = ModelRecord.load_from_ddb_record(model_record)
        except Exception as e:
            logger.error("Unhandled Exception! " + str(e))
            # Chain the original cause for easier debugging.
            raise UnhandledWorkflowException(
                "Something went wrong while creating a new model") from e

        if boto_session is None:
            boto_session = boto3.Session()
        self.boto_session = boto_session

        # "local" instance type runs against a local SageMaker session;
        # anything else talks to the real SageMaker service.
        if self.instance_type == "local":
            self.sagemaker_session = LocalSession()
        else:
            self.sagemaker_session = sagemaker.session.Session(
                self.boto_session)
        self.sagemaker_client = self.sagemaker_session.sagemaker_client

    def _jsonify(self):
        """Return a JSON dict with the ModelManager metadata stored in
        self.model_record."""
        return self.model_record.to_ddb_record()

    @classmethod
    def name_next_model(cls, experiment_id):
        """Generate a unique model id for a new model in the experiment.

        Args:
            experiment_id (str): A unique id for the experiment. The
                created/loaded model will be associated with it.

        Returns:
            str: A unique id for a new model.
        """
        return experiment_id + "-model-id-" + str(int(time.time()))

    def _get_rl_estimator_args(self, eval=False):
        """Get the required args to be used by the RLEstimator class.

        Args:
            eval (bool): Whether the estimator runs an evaluation job
                (True) or a training job (False). NOTE: shadows the
                builtin; kept for backward compatibility with callers.

        Returns:
            dict: RLEstimator args used to trigger a SageMaker training job.
        """
        entry_point = "eval-cfa-vw.py" if eval else "train-vw.py"
        estimator_type = "Evaluation" if eval else "Training"
        job_types = "evaluation_jobs" if eval else "training_jobs"

        sagemaker_bucket = self.sagemaker_session.default_bucket()
        output_path = f"s3://{sagemaker_bucket}/{self.experiment_id}/{job_types}/"

        metric_definitions = [{
            "Name": "average_loss",
            "Regex": "average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$"
        }]

        args = dict(
            entry_point=entry_point,
            source_dir="src",
            dependencies=["common/sagemaker_rl"],
            image_uri=self.image,
            role=self.role,
            sagemaker_session=self.sagemaker_session,
            instance_type=self.instance_type,
            instance_count=self.instance_count,
            metric_definitions=metric_definitions,
            hyperparameters=self.algor_params,
            output_path=output_path,
            code_location=output_path.strip("/"),
        )

        if self.instance_type == "local":
            logger.info(
                f"{estimator_type} job will be executed in 'local' mode")
        else:
            logger.info(
                f"{estimator_type} job will be executed in 'SageMaker' mode")
        return args

    def _fit_first_model(self, input_data_s3_prefix=None, manifest_file_path=None, wait=False, logs=True):
        """An Estimator fit() call to initiate the first model of the
        experiment.

        Args:
            input_data_s3_prefix (str): S3 prefix of the training data.
            manifest_file_path (str): Manifest file providing the training
                data; takes precedence over input_data_s3_prefix.
            wait (bool): Whether to block until the job completes.
            logs (bool): Whether to show job logs when waiting.
        """
        rl_estimator_args = self._get_rl_estimator_args()
        self.rl_estimator = RLEstimator(**rl_estimator_args)

        if manifest_file_path:
            input_data = sagemaker.session.s3_input(
                s3_data=manifest_file_path,
                input_mode="File",
                s3_data_type="ManifestFile")
            self.rl_estimator.fit(
                job_name=self.model_id,
                inputs=input_data,
                wait=wait,
                logs=logs)
        else:
            self.rl_estimator.fit(
                job_name=self.model_id,
                inputs=input_data_s3_prefix,
                wait=wait,
                logs=logs)

    def fit(self, input_model_id=None, input_data_s3_prefix=None, manifest_file_path=None, wait=False, logs=True):
        """An Estimator fit() call to start a model training job.

        Args:
            input_model_id (str): Model id of the model to use as a
                pre-trained model for the training job; None means train
                the experiment's first model from scratch.
            input_data_s3_prefix (str): Location of the S3 data to train on.
            manifest_file_path (str): Manifest file used to provide
                training data.
            wait (bool): Whether the call should wait until the job
                completes. Only meaningful when running in SageMaker mode.
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when wait is True (default: True).
        """
        # Update object var, to be reflected in the DDb record as well.
        self.model_record.add_new_training_job_info(
            input_model_id=input_model_id,
            input_data_s3_prefix=input_data_s3_prefix,
            manifest_file_path=manifest_file_path,
        )
        self.model_db_client.update_model_record(self._jsonify())

        if input_model_id is None:
            self._fit_first_model(
                input_data_s3_prefix=input_data_s3_prefix,
                manifest_file_path=manifest_file_path,
                wait=wait,
                logs=logs)
        else:
            # Use 'input_model_id' as the pretrained model for training.
            input_model_record = self.model_db_client.get_model_record(
                self.experiment_id, input_model_id)
            model_artifact_path = input_model_record.get(
                "s3_model_output_path")
            rl_estimator_args = self._get_rl_estimator_args()
            rl_estimator_args["model_channel_name"] = "pretrained_model"
            rl_estimator_args["model_uri"] = model_artifact_path
            self.rl_estimator = RLEstimator(**rl_estimator_args)

            if manifest_file_path:
                inputs = sagemaker.session.s3_input(
                    s3_data=manifest_file_path, s3_data_type="ManifestFile")
            else:
                inputs = input_data_s3_prefix
            self.rl_estimator.fit(
                job_name=self.model_id,
                inputs=inputs,
                wait=wait,
                logs=logs)

    def evaluate(
        self,
        input_data_s3_prefix=None,
        manifest_file_path=None,
        evaluation_job_name=None,
        local_mode=True,
        wait=False,
        logs=True,
    ):
        """An Estimator fit() call to start a model evaluation job.

        Args:
            input_data_s3_prefix (str): Location of the S3 data used for
                evaluation.
            manifest_file_path (str): Manifest file used to provide
                evaluation data.
            evaluation_job_name (str): Unique SageMaker job name to
                identify the evaluation job.
            local_mode (bool): Whether the evaluation job runs in local
                mode.
            wait (bool): Whether the call should wait until the job
                completes. Only meaningful when running in SageMaker mode.
            logs (bool): Whether to show the logs produced by the job.
                Only meaningful when wait is True.
        """
        # Use self.model_id / s3_model_output_path as the model to evaluate.
        # Model object has already been initialized with an up-to-date DDb
        # record.
        model_artifact_path = self.model_record.get_model_artifact_path()
        rl_estimator_args = self._get_rl_estimator_args(eval=True)
        rl_estimator_args["model_channel_name"] = "pretrained_model"
        rl_estimator_args["model_uri"] = model_artifact_path

        if manifest_file_path:
            inputs = sagemaker.session.s3_input(
                s3_data=manifest_file_path, s3_data_type="ManifestFile")
            if local_mode:
                rl_estimator_args["hyperparameters"].update(
                    {"local_mode_manifest": True})
        else:
            inputs = input_data_s3_prefix

        # (dict[str, str] or dict[str, sagemaker.session.s3_input]) for the
        # evaluation channel.
        eval_channel_inputs = {EVAL_CHANNEL: inputs}
        self.rl_estimator = RLEstimator(**rl_estimator_args)

        # TODO(review): consider reading eval_data_s3_path back from the SM
        # describe call instead; that may not work in local mode though.
        eval_data_s3_path = manifest_file_path if (
            manifest_file_path is not None) else input_data_s3_prefix

        # We keep the eval job state as pending before the SM job has been
        # submitted. The syncer function updates this state based on the SM
        # job status.
        self.model_record.add_new_evaluation_job_info(
            evaluation_job_name=evaluation_job_name,
            eval_data_s3_path=eval_data_s3_path)
        self.model_db_client.update_model_record(self._jsonify())

        # The following local variables (unsaved to DDb) make the
        # evaluation job non-resumable.
        self.log_output = None
        self.local_mode = local_mode

        if local_mode:
            # Capture the eval score by regex; the log should contain only
            # one "average loss = <number>" pattern.
            with CaptureStdout() as log_output:
                self.rl_estimator.fit(
                    job_name=evaluation_job_name,
                    inputs=eval_channel_inputs,
                    wait=wait,
                    logs=logs)
            self.log_output = "\n".join(log_output)
            logger.debug(self.log_output)
        else:
            self.rl_estimator.fit(
                job_name=evaluation_job_name,
                inputs=eval_channel_inputs,
                wait=wait,
                logs=logs)

    def update_model_training_state(self):
        """Sync the model's training-job state from SageMaker into DDb."""
        self._update_model_table_training_states()

    def update_model_evaluation_state(self):
        """Sync the model's evaluation-job state from SageMaker into DDb."""
        self._update_model_table_evaluation_states()

    def _update_model_table_training_states(self):
        """Update the training states in the model table.

        Polls the SageMaker training job and updates the training-job
        metadata of the model: train_state, s3_model_output_path,
        training_start_time, training_end_time.
        """
        if self.model_record.model_in_terminal_state():
            # Model already in one of the final states; nothing to poll.
            self.model_db_client.update_model_record(self._jsonify())
            return self._jsonify()

        # Else, try and fetch an updated SageMaker TrainingJob status.
        sm_job_info = {}
        max_describe_retries = 100
        sleep_between_describe_retries = 10

        for i in range(max_describe_retries):
            try:
                sm_job_info = self.sagemaker_client.describe_training_job(
                    TrainingJobName=self.model_id)
                # BUGFIX: stop polling on success (previously the loop kept
                # re-describing the job for all remaining iterations).
                break
            except Exception as e:
                if "ValidationException" in str(e):
                    # BUGFIX: the old check `i > max_describe_retries` was
                    # unreachable inside range(max_describe_retries); fail
                    # on the final attempt instead.
                    if i >= max_describe_retries - 1:
                        # Max attempts for DescribeTrainingJob reached.
                        logger.warning(
                            f"Looks like SageMaker Job was not submitted successfully."
                            f" Failing Training Job with ModelId {self.model_id}"
                        )
                        self.model_record.update_model_as_failed()
                        self.model_db_client.update_model_as_failed(
                            self._jsonify())
                        return
                    time.sleep(sleep_between_describe_retries)
                    continue
                # Do not raise; most probably throttling. A later sync
                # attempt will retry.
                logger.warning(
                    f"Failed to check SageMaker Training Job state for ModelId {self.model_id}."
                    " This exception will be ignored, and retried.")
                logger.debug(e)
                time.sleep(sleep_between_describe_retries)
                return self._jsonify()

        train_state = sm_job_info.get("TrainingJobStatus", "Pending")
        training_start_time = sm_job_info.get("TrainingStartTime", None)
        training_end_time = sm_job_info.get("TrainingEndTime", None)
        if training_start_time is not None:
            training_start_time = training_start_time.strftime(
                "%Y-%m-%d %H:%M:%S")
        if training_end_time is not None:
            training_end_time = training_end_time.strftime("%Y-%m-%d %H:%M:%S")

        model_artifacts = sm_job_info.get("ModelArtifacts", None)
        if model_artifacts is not None:
            s3_model_output_path = model_artifacts.get("S3ModelArtifacts", None)
        else:
            s3_model_output_path = None

        self.model_record.update_model_job_status(
            training_start_time,
            training_end_time,
            train_state,
            s3_model_output_path)
        self.model_db_client.update_model_job_state(self._jsonify())

    def _update_model_table_evaluation_states(self):
        """Update the evaluation states in the model table.

        Polls the SageMaker evaluation job and updates the evaluation-job
        metadata of the model: eval_state, eval_scores.
        """
        if self.model_record.eval_in_terminal_state():
            self.model_db_client.update_model_record(self._jsonify())
            return self._jsonify()

        # Try and fetch an updated SageMaker job status for the evaluation.
        sm_eval_job_info = {}
        max_describe_retries = 100
        sleep_between_describe_retries = 10

        for i in range(max_describe_retries):
            try:
                sm_eval_job_info = self.sagemaker_client.describe_training_job(
                    TrainingJobName=self.model_record._evaluation_job_name)
                # BUGFIX: stop polling once the describe call succeeds.
                break
            except Exception as e:
                if "ValidationException" in str(e):
                    logger.debug(e)  # was a stray print(e)
                    # BUGFIX: `i > max_describe_retries` was unreachable;
                    # fail on the final attempt instead.
                    if i >= max_describe_retries - 1:
                        logger.warning(
                            "Looks like SageMaker Job was not submitted successfully."
                            f" Failing EvaluationJob {self.model_record._evaluation_job_name}"
                        )
                        self.model_record.update_eval_job_as_failed()
                        self.model_db_client.update_model_eval_as_failed(
                            self._jsonify())
                        return
                    time.sleep(sleep_between_describe_retries)
                    continue
                # Do not raise; most probably throttling. A later sync
                # attempt will retry.
                logger.warning(
                    "Failed to check SageMaker Training Job state for EvaluationJob: "
                    f" {self.model_record._evaluation_job_name}. This exception will be ignored,"
                    " and retried.")
                time.sleep(sleep_between_describe_retries)
                return self._jsonify()

        eval_state = sm_eval_job_info.get("TrainingJobStatus", "Pending")
        if eval_state == "Completed":
            eval_score = "n.a."

            if self.local_mode:
                # In local mode the score is scraped from the captured log.
                rgx = re.compile(
                    "average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$",
                    re.M)
                eval_score_rgx = rgx.findall(self.log_output)
                if len(eval_score_rgx) == 0:
                    logger.warning("No eval score available from vw job log.")
                else:
                    eval_score = eval_score_rgx[0][0]  # [('eval_score', '')]
            else:
                # In SageMaker mode the score comes from job analytics.
                attempts = 0
                while eval_score == "n.a." and attempts < 4:
                    # BUGFIX: increment before the try body; the previous
                    # `continue` in the except skipped the increment and
                    # could loop forever under persistent throttling.
                    attempts += 1
                    try:
                        metric_df = TrainingJobAnalytics(
                            self.model_record._evaluation_job_name,
                            ["average_loss"]).dataframe()
                        eval_score = str(
                            metric_df[metric_df["metric_name"] ==
                                      "average_loss"]["value"][0])
                    except Exception:
                        # Back off to avoid throttling, then retry.
                        time.sleep(5)

            self.model_record._eval_state = eval_state
            self.model_record.add_model_eval_scores(eval_score)
            self.model_db_client.update_model_eval_job_state(self._jsonify())
        else:
            # Update eval state via the DDb client.
            self.model_record.update_eval_job_state(eval_state)
            self.model_db_client.update_model_eval_job_state(self._jsonify())
def evaluate(
    self,
    input_data_s3_prefix=None,
    manifest_file_path=None,
    evaluation_job_name=None,
    local_mode=True,
    wait=False,
    logs=True,
):
    """Start a model evaluation job via an Estimator fit() call.

    Args:
        input_data_s3_prefix (str): Location of the S3 data used for
            evaluation.
        manifest_file_path (str): Manifest file used to provide
            evaluation data; takes precedence over input_data_s3_prefix.
        evaluation_job_name (str): Unique SageMaker job name identifying
            the evaluation job.
        local_mode (bool): Whether the evaluation job runs in local mode.
        wait (bool): Whether to block until the job completes. Only
            meaningful when running in SageMaker mode.
        logs (bool): Whether to show the logs produced by the job. Only
            meaningful when wait is True.
    """
    # Evaluate the model artifact already recorded for this model; the
    # record was loaded/updated from DDb at construction time.
    estimator_args = self._get_rl_estimator_args(eval=True)
    estimator_args["model_channel_name"] = "pretrained_model"
    estimator_args["model_uri"] = self.model_record.get_model_artifact_path()

    if manifest_file_path:
        eval_inputs = sagemaker.session.s3_input(
            s3_data=manifest_file_path, s3_data_type="ManifestFile"
        )
        if local_mode:
            estimator_args["hyperparameters"].update(
                {"local_mode_manifest": True}
            )
    else:
        eval_inputs = input_data_s3_prefix

    # Single evaluation channel: dict[str, str] or
    # dict[str, sagemaker.session.s3_input].
    eval_channel_inputs = {EVAL_CHANNEL: eval_inputs}
    self.rl_estimator = RLEstimator(**estimator_args)

    # TODO(review): could instead be read back from the SM describe call,
    # though that may not work in local mode.
    eval_data_s3_path = (
        manifest_file_path
        if manifest_file_path is not None
        else input_data_s3_prefix
    )

    # Record the job as pending before submitting it to SageMaker; the
    # syncer updates this state afterwards from the SM job status.
    self.model_record.add_new_evaluation_job_info(
        evaluation_job_name=evaluation_job_name,
        eval_data_s3_path=eval_data_s3_path,
    )
    self.model_db_client.update_model_record(self._jsonify())

    # These locals are not persisted to DDb, which makes the evaluation
    # job non-resumable.
    self.log_output = None
    self.local_mode = local_mode

    if local_mode:
        # Capture stdout so the eval score can later be scraped via regex;
        # the log should contain a single "average loss = <number>" line.
        with CaptureStdout() as captured_lines:
            self.rl_estimator.fit(
                job_name=evaluation_job_name,
                inputs=eval_channel_inputs,
                wait=wait,
                logs=logs,
            )
        self.log_output = "\n".join(captured_lines)
        logger.debug(self.log_output)
    else:
        self.rl_estimator.fit(
            job_name=evaluation_job_name,
            inputs=eval_channel_inputs,
            wait=wait,
            logs=logs,
        )
instance_count=1, entry_point="entry_point.py", ), MXNet( framework_version="1.4.1", py_version="py3", role=sagemaker.get_execution_role(), instance_type=INSTANCE_TYPE, instance_count=1, entry_point="entry_point.py", ), RLEstimator( entry_point="cartpole.py", toolkit=RLToolkit.RAY, framework=RLFramework.TENSORFLOW, toolkit_version="0.8.5", role=sagemaker.get_execution_role(), instance_type=INSTANCE_TYPE, instance_count=1, ), Chainer( role=sagemaker.get_execution_role(), entry_point="entry_point.py", use_mpi=True, num_processes=4, framework_version="5.0.0", instance_type=INSTANCE_TYPE, instance_count=1, py_version="py3", ), ],