def download_folder(Filename, Bucket, Key, session):
    """Download the objects under s3://{Bucket}/{Key} into the local directory Filename."""
    # Accept either a raw boto3 session or a sagemaker session.
    if isinstance(session, boto3.Session):
        session = sagemaker.Session(boto_session=session)
    ensure_path(Filename)
    S3Downloader.download(
        s3_uri=f"s3://{Bucket}/{Key}",
        local_path=Filename,
        sagemaker_session=session,
    )
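# A minimal usage sketch for download_folder. The bucket name, key prefix, and
# region below are hypothetical; AWS credentials are assumed to be configured.
import boto3

boto_session = boto3.Session(region_name="us-east-1")  # hypothetical region
download_folder(
    Filename="/tmp/sample-data",  # local directory to download into
    Bucket="example-bucket",      # hypothetical bucket
    Key="datasets/train",         # hypothetical key prefix
    session=boto_session,         # a plain boto3 session is wrapped internally
)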
def copy_sample_flow_to_local(workspace, local_dir):
    config = BPConfig.get_config(workspace, local_dir)
    fname = f"{local_dir}/{FLOW_NAME}"
    flow_uri = f"s3://{workspace}/{config.ws_prefix()}/meta/{FLOW_NAME}"
    S3Downloader.download(flow_uri, local_dir)

    # Change the flow definition so that it references the dataset copied over by the user
    def _update_sample_flow_def(fname, s3_uri):
        with open(fname, 'r+') as f:
            flow_def = json.loads(f.read())
            nodes = flow_def["nodes"]
            for n in nodes:
                if n["type"] == "SOURCE":
                    data_def = n["parameters"]["dataset_definition"]
                    dstype = data_def["datasetSourceType"]
                    if dstype == "S3":
                        data_def["s3ExecutionContext"]["s3Uri"] = s3_uri
            f.seek(0)
            f.write(json.dumps(flow_def))
            f.truncate()

    _update_sample_flow_def(fname, config.sample_data_uri())
    return fname
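# A hedged usage sketch for copy_sample_flow_to_local. The workspace bucket and
# target directory are hypothetical; BPConfig and FLOW_NAME are assumed to be
# provided by the surrounding module.
local_flow_path = copy_sample_flow_to_local(
    workspace="example-blueprint-workspace",  # hypothetical workspace bucket
    local_dir="/tmp/flows",                   # hypothetical local directory
)
print(f"Sample flow copied to {local_flow_path}")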
def get_node_to_idx_mapping(training_job_name: str = None, dataprocessing_job_name: str = None,
                            model_artifacts_location: str = './model-artifacts', vertex_label: str = None):
    assert training_job_name is not None or dataprocessing_job_name is not None, \
        "You must provide either a modeltraining job id or a dataprocessing job id to obtain node to index mappings"

    job_name = training_job_name if training_job_name is not None else dataprocessing_job_name
    job_type = "modeltraining" if training_job_name == job_name else "dataprocessing"
    filename = "mapping.info" if training_job_name == job_name else "info.pkl"
    mapping_key = "node2id" if training_job_name == job_name else "node_id_map"

    # get mappings
    model_artifacts_location = os.path.join(model_artifacts_location, job_name)
    if not os.path.exists(os.path.join(model_artifacts_location, filename)):
        job_s3_output = get_neptune_ml_job_output_location(job_name, job_type)
        print(job_s3_output)
        if not job_s3_output:
            return
        S3Downloader.download(os.path.join(job_s3_output, filename), model_artifacts_location)

    with open(os.path.join(model_artifacts_location, filename), "rb") as f:
        mapping = pickle.load(f)[mapping_key]
        if vertex_label is not None:
            if vertex_label in mapping:
                mapping = mapping[vertex_label]
            else:
                print("Mapping for vertex label: {} not found.".format(vertex_label))
                print("valid vertex labels which have vertices mapped to embeddings: {} ".format(list(mapping.keys())))
                print("Returning mapping for all valid vertex labels")

    return mapping
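# A usage sketch for get_node_to_idx_mapping, assuming a completed Neptune ML
# model training job. The job name and vertex label are hypothetical.
node_mapping = get_node_to_idx_mapping(
    training_job_name="example-training-job",  # hypothetical job id
    vertex_label="movie",                      # hypothetical vertex label
)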
def _download_bp_config(cls, config_uri=None):
    if not config_uri:
        config_uri = cls.default_config_uri(cls.workspace)
    S3Downloader.download(config_uri, cls.local_dir)
    fname = f"{cls.local_dir}/blueprint-config.json"
    return fname
def download_model(model_data, local_path=".", unzip=False, sagemaker_session=None, model_dir="model"):
    """Downloads a model file from SageMaker training to a local directory,
    and optionally unzips it into that directory."""
    S3Downloader.download(
        s3_uri=model_data,
        local_path=os.path.join(local_path, model_dir),
        sagemaker_session=sagemaker_session,
    )
    if unzip:
        with tarfile.open(os.path.join(local_path, model_dir, "model.tar.gz"), "r:gz") as model_zip:
            model_zip.extractall(path=os.path.join(local_path, model_dir))
        os.remove(os.path.join(local_path, model_dir, "model.tar.gz"))
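# A usage sketch for download_model. The artifact URI is hypothetical; in
# practice it would typically come from estimator.model_data after training.
download_model(
    model_data="s3://example-bucket/training-output/model.tar.gz",  # hypothetical URI
    local_path="/tmp",
    unzip=True,  # extract model.tar.gz, then remove the archive
)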
def load_dataset(s3_uri: str, filename: str, sagemaker_session=None) -> pd.DataFrame:
    """Load a dataset from an S3 URI."""
    S3Downloader.download(s3_uri, tempfile.gettempdir(), sagemaker_session=sagemaker_session)
    dataset_filename = os.path.join(tempfile.gettempdir(), filename)
    return pd.read_csv(dataset_filename, header=None)
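# A usage sketch for load_dataset. The URI and file name are hypothetical;
# note that the function expects a headerless CSV.
df = load_dataset(
    s3_uri="s3://example-bucket/data/test.csv",  # hypothetical dataset URI
    filename="test.csv",
)
print(df.shape)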
def test_download(sagemaker_session):
    s3_uri = os.path.join("s3://", BUCKET_NAME, CURRENT_JOB_NAME, SOURCE_NAME)
    S3Downloader.download(s3_uri=s3_uri, local_path="/path/for/download/", session=sagemaker_session)

    sagemaker_session.download_data.assert_called_with(
        path="/path/for/download/",
        bucket=BUCKET_NAME,
        key_prefix=os.path.join(CURRENT_JOB_NAME, SOURCE_NAME),
        extra_args=None,
    )
def test_download_with_kms_key(sagemaker_session):
    s3_uri = os.path.join("s3://", BUCKET_NAME, CURRENT_JOB_NAME, SOURCE_NAME)
    S3Downloader.download(
        s3_uri=s3_uri, local_path="/path/for/download/", kms_key=KMS_KEY, session=sagemaker_session
    )

    sagemaker_session.download_data.assert_called_with(
        path="/path/for/download/",
        bucket=BUCKET_NAME,
        key_prefix=os.path.join(CURRENT_JOB_NAME, SOURCE_NAME),
        extra_args={"SSECustomerKey": KMS_KEY},
    )
def test_s3_uploader_and_downloader_downloads_files_when_given_directory_uris_with_files(
    sagemaker_session, s3_files_kms_key
):
    my_uuid = str(uuid.uuid4())

    file_1_body = "First File Body {}.".format(my_uuid)
    file_1_name = "first_file_{}.txt".format(my_uuid)
    file_2_body = "Second File Body {}.".format(my_uuid)
    file_2_name = "second_file_{}.txt".format(my_uuid)

    base_s3_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "integ-test-test-s3-list", my_uuid
    )
    file_1_s3_uri = os.path.join(base_s3_uri, file_1_name)
    file_2_s3_uri = os.path.join(base_s3_uri, file_2_name)

    S3Uploader.upload_string_as_file_body(
        body=file_1_body,
        desired_s3_uri=file_1_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )
    S3Uploader.upload_string_as_file_body(
        body=file_2_body,
        desired_s3_uri=file_2_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    s3_uris = S3Downloader.list(s3_uri=base_s3_uri, sagemaker_session=sagemaker_session)
    assert file_1_name in s3_uris[0]
    assert file_2_name in s3_uris[1]

    assert file_1_body == S3Downloader.read_file(
        s3_uri=s3_uris[0], sagemaker_session=sagemaker_session
    )
    assert file_2_body == S3Downloader.read_file(
        s3_uri=s3_uris[1], sagemaker_session=sagemaker_session
    )

    S3Downloader.download(
        s3_uri=base_s3_uri, local_path=TMP_BASE_PATH, sagemaker_session=sagemaker_session
    )
    with open(os.path.join(TMP_BASE_PATH, file_1_name), "r") as f:
        assert file_1_body == f.read()
    with open(os.path.join(TMP_BASE_PATH, file_2_name), "r") as f:
        assert file_2_body == f.read()
def _download_clarify_xai_summary(self):
    # Build the URI outside the try block so the error message below can reference it.
    summary_uri = f"s3://{self.bucket}/{self.xai_prefix}/analysis.json"
    try:
        S3Downloader.download(summary_uri, os.getcwd())
        with open('analysis.json', 'r') as f:
            summary = json.loads(f.read())
        return summary
    except Exception as e:
        print(f"{e}: Failed to download {summary_uri}")
def get_embeddings(training_job_name: str, download_location: str = './model-artifacts'):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)
    # download the learned entity embeddings
    S3Downloader.download(os.path.join(training_job_s3_output, "embeddings/"),
                          os.path.join(download_location, "embeddings/"))

    entity_emb = np.load(os.path.join(download_location, "embeddings", "entity.npy"))
    return entity_emb
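# A usage sketch for get_embeddings with a hypothetical Neptune ML job name;
# the returned array holds one embedding row per mapped entity.
entity_embeddings = get_embeddings(training_job_name="example-training-job")
if entity_embeddings is not None:
    print(entity_embeddings.shape)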
def get_performance_metrics(training_job_name: str, download_location: str = './model-artifacts'):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)
    # download the evaluation metrics summary
    S3Downloader.download(os.path.join(training_job_s3_output, "eval_metrics_info.json"),
                          download_location)

    with open(os.path.join(download_location, "eval_metrics_info.json")) as f:
        metrics = json.load(f)
    return metrics
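# A usage sketch for get_performance_metrics with a hypothetical job name;
# it returns the parsed contents of eval_metrics_info.json.
metrics = get_performance_metrics(training_job_name="example-training-job")
if metrics is not None:
    print(metrics)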
def get_predictions(training_job_name: str, download_location: str = './model-artifacts', class_preds: bool = False):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)
    # download the inference results
    S3Downloader.download(os.path.join(training_job_s3_output, "predictions/"),
                          os.path.join(download_location, "predictions/"))

    preds = np.load(os.path.join(download_location, "predictions", "result.npz"))['infer_scores']
    if class_preds:
        return preds.argmax(axis=1)
    return preds
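# A usage sketch for get_predictions with a hypothetical job name. With
# class_preds=True the argmax over the score axis is returned instead of
# the raw inference scores.
class_predictions = get_predictions(
    training_job_name="example-training-job",
    class_preds=True,
)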
def _inject_repack_script(self):
    """Injects the _repack_model.py script where it belongs.

    If the source_dir is an S3 path:
        1) downloads the source_dir tar.gz
        2) copies the _repack_model.py script where it belongs
        3) uploads the mutated source_dir

    If the source_dir is a local path:
        1) copies the _repack_model.py script into the source dir
    """
    fname = os.path.join(os.path.dirname(__file__), REPACK_SCRIPT)
    if self._source_dir.lower().startswith("s3://"):
        with tempfile.TemporaryDirectory() as tmp:
            local_path = os.path.join(tmp, "local.tar.gz")

            S3Downloader.download(
                s3_uri=self._source_dir,
                local_path=local_path,
                sagemaker_session=self._estimator.sagemaker_session,
            )

            src_dir = os.path.join(tmp, "src")
            with tarfile.open(name=local_path, mode="r:gz") as tf:
                tf.extractall(path=src_dir)

            shutil.copy2(fname, os.path.join(src_dir, REPACK_SCRIPT))
            with tarfile.open(name=local_path, mode="w:gz") as tf:
                tf.add(src_dir, arcname=".")

            S3Uploader.upload(
                local_path=local_path,
                desired_s3_uri=self._source_dir,
                sagemaker_session=self._estimator.sagemaker_session,
            )
    else:
        shutil.copy2(fname, os.path.join(self._source_dir, REPACK_SCRIPT))