def training_job_analytics(self):
    """Return a ``TrainingJobAnalytics`` object for the current training job."""
    if self._current_job_name is None:
        raise ValueError('Estimator is not associated with a TrainingJob')
    return TrainingJobAnalytics(self._current_job_name, sagemaker_session=self.sagemaker_session)
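# Hedged usage sketch for the accessor above. Assumes an `estimator` whose
# training job has completed and emitted a metric named 'train:acc'; both
# names are placeholders, not from the source. Called as defined above
# (some SDK versions expose this accessor as a property instead).
analytics = estimator.training_job_analytics()
df = analytics.dataframe()  # columns: timestamp, metric_name, value
print(df[df['metric_name'] == 'train:acc'])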
def test_trainer_dataframe():
    session = create_sagemaker_session(
        describe_training_result=_describe_training_result(),
        metric_stats_results=_metric_stats_results())
    trainer = TrainingJobAnalytics("my-training-job", ["train:acc"], sagemaker_session=session)
    df = trainer.dataframe()
    assert df is not None
    assert len(df) == 3
    assert min(df['value']) == 77.1
    assert max(df['value']) == 97.1
    # Export to CSV and check that the file exists
    tmp_name = "/tmp/unit-test-%s.csv" % uuid.uuid4()
    assert not os.path.isfile(tmp_name)
    trainer.export_csv(tmp_name)
    assert os.path.isfile(tmp_name)
    os.unlink(tmp_name)
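# A minimal sketch of the `create_sagemaker_session` helper these tests rely
# on, assuming TrainingJobAnalytics only touches describe_training_job and
# CloudWatch's get_metric_statistics. Everything below is an assumption built
# on unittest.mock, not the test suite's actual fixture.
from unittest.mock import Mock

def create_sagemaker_session(describe_training_result=None, metric_stats_results=None):
    boto_mock = Mock(name='boto_session')
    session = Mock(name='sagemaker_session', boto_session=boto_mock)
    # Stub the SageMaker API call that supplies the job's time window
    session.sagemaker_client.describe_training_job = Mock(
        return_value=describe_training_result)
    # Stub the CloudWatch client that supplies the metric datapoints
    cloudwatch_mock = Mock(name='cloudwatch_client')
    boto_mock.client = Mock(return_value=cloudwatch_mock)
    cloudwatch_mock.get_metric_statistics = Mock(return_value=metric_stats_results)
    return session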
def test_trainer_name():
    describe_training_result = {
        "TrainingStartTime": datetime.datetime(2018, 5, 16, 1, 2, 3),
        "TrainingEndTime": datetime.datetime(2018, 5, 16, 5, 6, 7),
    }
    session = create_sagemaker_session(describe_training_result)
    trainer = TrainingJobAnalytics("my-training-job", ["metric"], sagemaker_session=session)
    assert trainer.name == "my-training-job"
    assert "my-training-job" in str(trainer)
def test_trainer_dataframe():
    describe_training_result = {
        'TrainingStartTime': datetime.datetime(2018, 5, 16, 1, 2, 3),
        'TrainingEndTime': datetime.datetime(2018, 5, 16, 5, 6, 7),
    }
    metric_stats_results = {
        'Datapoints': [
            {
                'Average': 77.1,
                'Timestamp': datetime.datetime(2018, 5, 16, 1, 3, 3),
            },
            {
                'Average': 87.1,
                'Timestamp': datetime.datetime(2018, 5, 16, 1, 8, 3),
            },
            {
                'Average': 97.1,
                'Timestamp': datetime.datetime(2018, 5, 16, 2, 3, 3),
            },
        ]
    }
    session = create_sagemaker_session(
        describe_training_result=describe_training_result,
        metric_stats_results=metric_stats_results)
    trainer = TrainingJobAnalytics("my-training-job", ["train:acc"], sagemaker_session=session)
    df = trainer.dataframe()
    assert df is not None
    assert len(df) == 3
    assert min(df['value']) == 77.1
    assert max(df['value']) == 97.1
    # Export to CSV and check that the file exists
    tmp_name = "/tmp/unit-test-%s.csv" % uuid.uuid4()
    assert not os.path.isfile(tmp_name)
    trainer.export_csv(tmp_name)
    assert os.path.isfile(tmp_name)
    os.unlink(tmp_name)
def test_start_time_end_time_and_period_specified():
    describe_training_result = {
        'TrainingStartTime': datetime.datetime(2018, 5, 16, 1, 2, 3),
        'TrainingEndTime': datetime.datetime(2018, 5, 16, 5, 6, 7),
    }
    session = create_sagemaker_session(describe_training_result)
    start_time = datetime.datetime(2018, 5, 16, 1, 3, 4)
    end_time = datetime.datetime(2018, 5, 16, 5, 1, 1)
    period = 300
    trainer = TrainingJobAnalytics(
        'my-training-job', ['metric'],
        sagemaker_session=session,
        start_time=start_time,
        end_time=end_time,
        period=period)
    assert trainer._time_interval['start_time'] == start_time
    assert trainer._time_interval['end_time'] == end_time
    assert trainer._period == period
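# For contrast with the explicit window above, a hedged sketch of how a
# default interval could be derived from describe_training_job output when
# start_time/end_time are omitted; the one-minute buffer is an assumption,
# not the SDK's documented behaviour.
import datetime

def default_time_interval(description):
    start_time = description['TrainingStartTime']
    end_time = description.get('TrainingEndTime') or datetime.datetime.utcnow()
    # Pad the end so datapoints emitted right at job end are not clipped
    return {
        'start_time': start_time,
        'end_time': end_time + datetime.timedelta(minutes=1),
    }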
def log_sagemaker_job_by_name(sagemaker_job_name, api_key=None,
                              workspace=None, project_name=None):
    # Metadata
    client = _get_boto_client()
    metadata = client.describe_training_job(TrainingJobName=sagemaker_job_name)

    if metadata["TrainingJobStatus"] != "Completed":
        raise ValueError("Not importing %r as it's not completed, status %r"
                         % (sagemaker_job_name, metadata["TrainingJobStatus"]))

    experiment = APIExperiment(
        api_key=api_key,
        workspace=workspace,
        project_name=project_name,
        experiment_name=sagemaker_job_name,
    )

    start_time = metadata["TrainingStartTime"]
    start_time_timestamp = calendar.timegm(start_time.utctimetuple())
    experiment.set_start_time(start_time_timestamp * 1000)
    end_time = metadata.get("TrainingEndTime")
    if end_time:
        experiment.set_end_time(calendar.timegm(end_time.utctimetuple()) * 1000)

    for param_name, param_value in metadata["HyperParameters"].items():
        experiment.log_parameter(param_name, param_value)

    other_list = [
        "BillableTimeInSeconds",
        "EnableInterContainerTrafficEncryption",
        "EnableManagedSpotTraining",
        "EnableNetworkIsolation",
        "RoleArn",
        "TrainingJobArn",
        "TrainingJobName",
        "TrainingJobStatus",
        "TrainingTimeInSeconds",
    ]
    for other_name in other_list:
        other_value = metadata.get(other_name)
        if other_value:
            experiment.log_other(other_name, other_value)

    experiment.log_other("TrainingImage",
                         metadata["AlgorithmSpecification"]["TrainingImage"])
    experiment.log_other("TrainingInputMode",
                         metadata["AlgorithmSpecification"]["TrainingInputMode"])

    for other_key, other_value in _flatten(metadata.get("ModelArtifacts", {}),
                                           "ModelArtifacts").items():
        experiment.log_other(other_key, other_value)

    for other_key, other_value in _flatten(metadata["OutputDataConfig"],
                                           "OutputDataConfig").items():
        experiment.log_other(other_key, other_value)

    for other_key, other_value in _flatten(metadata["ResourceConfig"],
                                           "ResourceConfig").items():
        experiment.log_other(other_key, other_value)

    for i, _input in enumerate(metadata["InputDataConfig"]):
        for other_key, other_value in _flatten(_input, "InputDataConfig.%d" % i).items():
            experiment.log_other(other_key, other_value)

    # list_tags returns {"Tags": [{"Key": ..., "Value": ...}, ...]}, so each
    # entry must be indexed by key, not tuple-unpacked
    response = client.list_tags(ResourceArn=metadata["TrainingJobArn"])
    for tag in response["Tags"]:
        experiment.add_tags(["%s:%s" % (tag["Key"], tag["Value"])])

    # Metrics: each row holds (timestamp offset in seconds, metric name, value)
    metrics_dataframe = TrainingJobAnalytics(
        training_job_name=sagemaker_job_name).dataframe()
    for iloc, (timestamp, metric_name, value) in metrics_dataframe.iterrows():
        experiment.log_metric(metric=metric_name, value=value,
                              timestamp=start_time_timestamp + timestamp)

    return experiment
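# Hypothetical invocation of the importer above; the job name, key,
# workspace, and project are all placeholders, not values from the source.
experiment = log_sagemaker_job_by_name(
    "my-training-job-2018-05-16",
    api_key="YOUR_COMET_API_KEY",
    workspace="my-workspace",
    project_name="sagemaker-imports",
)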
def _update_model_table_evaluation_states(self):
    """Update the evaluation states in the model table.

    This method polls the SageMaker evaluation job and then updates the
    evaluation job metadata of the model, including: eval_state, eval_scores.
    """
    if self.model_record.eval_in_terminal_state():
        self.model_db_client.update_model_record(self._jsonify())
        return self._jsonify()

    # Try to fetch updated SageMaker training job status
    sm_eval_job_info = {}
    max_describe_retries = 100
    sleep_between_describe_retries = 10

    for i in range(max_describe_retries):
        try:
            sm_eval_job_info = self.sagemaker_client.describe_training_job(
                TrainingJobName=self.model_record._evaluation_job_name)
        except Exception as e:
            if "ValidationException" in str(e):
                print(e)
                if i >= max_describe_retries - 1:
                    # Final DescribeTrainingJob attempt still failed validation
                    logger.warning(
                        "Looks like SageMaker Job was not submitted successfully."
                        f" Failing EvaluationJob {self.model_record._evaluation_job_name}")
                    self.model_record.update_eval_job_as_failed()
                    self.model_db_client.update_model_eval_as_failed(self._jsonify())
                    return
                else:
                    time.sleep(sleep_between_describe_retries)
                    continue
            else:
                # Do not raise the exception; most probably throttling.
                logger.warning(
                    "Failed to check SageMaker Training Job state for EvaluationJob: "
                    f" {self.model_record._evaluation_job_name}. This exception will be ignored,"
                    " and retried.")
                time.sleep(sleep_between_describe_retries)
                return self._jsonify()
        else:
            # Describe succeeded; stop retrying.
            break

    eval_state = sm_eval_job_info.get("TrainingJobStatus", "Pending")
    if eval_state == "Completed":
        eval_score = "n.a."

        if self.local_mode:
            rgx = re.compile(
                "average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$", re.M)
            eval_score_rgx = rgx.findall(self.log_output)
            if len(eval_score_rgx) == 0:
                logger.warning("No eval score available from vw job log.")
            else:
                eval_score = eval_score_rgx[0][0]  # first match, first capture group
        else:
            attempts = 0
            while eval_score == "n.a." and attempts < 4:
                # Count the attempt up front so a persistent exception
                # cannot loop forever
                attempts += 1
                try:
                    metric_df = TrainingJobAnalytics(
                        self.model_record._evaluation_job_name,
                        ["average_loss"]).dataframe()
                    eval_score = str(metric_df[metric_df["metric_name"] ==
                                               "average_loss"]["value"][0])
                except Exception:
                    # Most likely throttling; wait before retrying.
                    time.sleep(5)

        self.model_record._eval_state = eval_state
        self.model_record.add_model_eval_scores(eval_score)
        self.model_db_client.update_model_eval_job_state(self._jsonify())
    else:
        # Update eval state via the DDB client
        self.model_record.update_eval_job_state(eval_state)
        self.model_db_client.update_model_eval_job_state(self._jsonify())
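# Self-contained check of the local-mode regex used above, run against a
# made-up VW-style log line (the log text is illustrative only).
import re

log_output = "finished run\naverage loss = 0.6931\nbest constant = -0.02\n"
rgx = re.compile("average loss = ([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?).*$", re.M)
matches = rgx.findall(log_output)
print(matches)        # [('0.6931', '')] -- two groups, so findall yields tuples
print(matches[0][0])  # '0.6931'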
def training_job_analytics(self):
    """Returns a TrainingJobAnalytics object for the current training job."""
    if self._current_job_name is None:
        raise ValueError('Estimator is not associated with a TrainingJob')
    return TrainingJobAnalytics(self._current_job_name)
hyperparameters={
    'epochs': 1,
    'backend': 'gloo'
})
estimator.fit({'training': inputs})

########################################################################
# DO NOT EDIT AFTER THIS LINE
########################################################################
training_job_name = estimator.latest_training_job.name

# Get metric values
metric_names = [metric['Name'] for metric in estimator.metric_definitions]
metrics_dataframe = TrainingJobAnalytics(
    training_job_name=training_job_name,
    metric_names=metric_names).dataframe()

# Report results
rr = ResultReport()
rr.report(estimator.model_data, metrics_dataframe)

# Update leaderboard. Make sure the key name is right.
# Use any name if you don't want to use the leaderboard.
score_metric = 'test:accuracy'
score_name = 'Test Accuracy'
leaderboard_ascending = False
if score_metric not in metric_names:
    print("leaderboard key name is not correct. No leaderboard support.")
    exit(-1)
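# Hedged continuation sketch: how the script above might pull the leaderboard
# score out of the metrics dataframe. Taking the last recorded value is an
# assumption, not taken from the source.
score_rows = metrics_dataframe[metrics_dataframe['metric_name'] == score_metric]
score_value = float(score_rows['value'].iloc[-1])  # most recent datapoint
print("%s: %f" % (score_name, score_value))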
"S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": bucket_path + "/" + prefix + "/test", "S3DataDistributionType": "FullyReplicated" } }, "ContentType": "application/x-parquet", "CompressionType": "None" } ] } client = boto3.client('sagemaker', region_name=region) client.create_training_job(**create_training_params) status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus'] print(status) while status !='Completed' and status!='Failed': time.sleep(60) status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus'] print(status) %matplotlib inline from sagemaker.analytics import TrainingJobAnalytics metric_name = 'validation:rmse' metrics_dataframe = TrainingJobAnalytics(training_job_name=job_name, metric_names=[metric_name]).dataframe() plt = metrics_dataframe.plot(kind='line', figsize=(12,5), x='timestamp', y='value', style='b.', legend=False) plt.set_ylabel(metric_name);
########################################################################
# DO NOT EDIT AFTER THIS LINE
########################################################################
training_job_name = estimator.latest_training_job.name
desc = sagemaker_session.sagemaker_client.describe_training_job(
    TrainingJobName=training_job_name)
trained_model_location = desc['ModelArtifacts']['S3ModelArtifacts']
print(trained_model_location)

import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(desc)

# Get metric values
metric_names = [metric['Name'] for metric in metric_definitions]
metrics_dataframe = TrainingJobAnalytics(
    training_job_name=training_job_name,
    metric_names=metric_names).dataframe()

md = "## Trained Model\n* " + trained_model_location + \
    "\n## Results\n" + metrics_dataframe.to_markdown()
print(md)

# Add comment
from report_pr_comment import add_comment, update_leaderboard
add_comment(md)

# Update leaderboard. Make sure the key name is right.
# Use any name if you don't want to use the leaderboard.
accuracy_name = 'test:accuracy'
if accuracy_name not in metric_names:
    print("leaderboard key name is not correct. No leaderboard support.")
    exit(-1)