Example 1
def download_folder(Filename, Bucket, Key, session):
    if isinstance(session, boto3.Session):
        session = sagemaker.Session(boto_session=session)
    ensure_path(Filename)
    S3Downloader.download(s3_uri=f"s3://{Bucket}/{Key}",
                          local_path=Filename,
                          sagemaker_session=session)
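A minimal caller sketch (the bucket, prefix, and target directory below are hypothetical; `ensure_path` is assumed to create the local directory if it does not exist):

import boto3

# Hypothetical values: pulls s3://my-example-bucket/datasets/churn into ./data/raw
download_folder(Filename="./data/raw",
                Bucket="my-example-bucket",
                Key="datasets/churn",
                session=boto3.Session())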
Example 2
def copy_sample_flow_to_local(workspace, local_dir):

    config = BPConfig.get_config(workspace, local_dir)

    fname = f"{local_dir}/{FLOW_NAME}"
    flow_uri = f"s3://{workspace}/{config.ws_prefix()}/meta/{FLOW_NAME}"
    S3Downloader.download(flow_uri, local_dir)

    # Change the flow definition so that it references the dataset copied over by the user
    def _update_sample_flow_def(fname, s3_uri):

        with open(fname, 'r+') as f:
            flow_def = json.loads(f.read())

            nodes = flow_def["nodes"]

            for n in nodes:
                if n["type"] == "SOURCE":
                    data_def = n["parameters"]["dataset_definition"]
                    dstype = data_def["datasetSourceType"]
                    if dstype == "S3":
                        data_def["s3ExecutionContext"]["s3Uri"] = s3_uri
            f.seek(0)
            f.write(json.dumps(flow_def))
            f.truncate()

    _update_sample_flow_def(fname, config.sample_data_uri())

    return fname
Example 3
def get_node_to_idx_mapping(training_job_name: str = None, dataprocessing_job_name: str = None,
                            model_artifacts_location: str = './model-artifacts', vertex_label: str = None):
    assert training_job_name is not None or dataprocessing_job_name is not None, \
        "You must provide either a modeltraining job id or a dataprocessing job id to obtain node to index mappings"

    job_name = training_job_name if training_job_name is not None else dataprocessing_job_name
    job_type = "modeltraining" if training_job_name == job_name else "dataprocessing"
    filename = "mapping.info" if training_job_name == job_name else "info.pkl"
    mapping_key = "node2id" if training_job_name == job_name else "node_id_map"

    # get mappings
    model_artifacts_location = os.path.join(model_artifacts_location, job_name)
    if not os.path.exists(os.path.join(model_artifacts_location, filename)):
        job_s3_output = get_neptune_ml_job_output_location(job_name, job_type)
        print(job_s3_output)
        if not job_s3_output:
            return
        S3Downloader.download(os.path.join(job_s3_output, filename), model_artifacts_location)

    with open(os.path.join(model_artifacts_location, filename), "rb") as f:
        mapping = pickle.load(f)[mapping_key]
        if vertex_label is not None:
            if vertex_label in mapping:
                mapping = mapping[vertex_label]
            else:
                print("Mapping for vertex label: {} not found.".format(vertex_label))
                print("valid vertex labels which have vertices mapped to embeddings: {} ".format(list(mapping.keys())))
                print("Returning mapping for all valid vertex labels")

    return mapping
Example 4
    def _download_bp_config(cls, config_uri=None):

        if not config_uri:
            config_uri = cls.default_config_uri(cls.workspace)

        S3Downloader.download(config_uri, cls.local_dir)

        fname = f"{cls.local_dir}/blueprint-config.json"
        return fname
Example 5
def download_model(model_data, local_path=".", unzip=False, sagemaker_session=None, model_dir="model"):
    """Downloads model file from sagemaker training to local directory and unzips its to directory if wanted."""
    S3Downloader.download(
        s3_uri=model_data, local_path=os.path.join(local_path, model_dir), sagemaker_session=sagemaker_session
    )
    if unzip:
        with tarfile.open(os.path.join(local_path, model_dir, "model.tar.gz"), "r:gz") as model_zip:
            model_zip.extractall(path=os.path.join(local_path, model_dir))
        os.remove(os.path.join(local_path, model_dir, "model.tar.gz"))
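A hedged usage sketch (the artifact URI is hypothetical; it assumes the training job wrote a model.tar.gz to that location):

# Hypothetical S3 URI from a completed training job
model_uri = "s3://my-example-bucket/training-output/model.tar.gz"
download_model(model_data=model_uri, local_path=".", unzip=True)
# The archive is extracted into ./model/ and model.tar.gz is removed afterwards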
Example 6
def load_dataset(
    s3_uri: str, filename: str, sagemaker_session=None
) -> pd.DataFrame:
    """Load a data set from a S3 uri"""
    S3Downloader.download(
        s3_uri, tempfile.gettempdir(),
        sagemaker_session=sagemaker_session)
    dataset_filename = os.path.join(
        tempfile.gettempdir(), filename)
    return pd.read_csv(dataset_filename, header=None)
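A short usage sketch (URI and file name are hypothetical; `header=None` above implies the CSV has no header row):

df = load_dataset(
    s3_uri="s3://my-example-bucket/data/test.csv",
    filename="test.csv",
)
print(df.shape)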
Example 7
def test_download(sagemaker_session):
    s3_uri = os.path.join("s3://", BUCKET_NAME, CURRENT_JOB_NAME, SOURCE_NAME)
    S3Downloader.download(s3_uri=s3_uri,
                          local_path="/path/for/download/",
                          session=sagemaker_session)
    sagemaker_session.download_data.assert_called_with(
        path="/path/for/download/",
        bucket=BUCKET_NAME,
        key_prefix=os.path.join(CURRENT_JOB_NAME, SOURCE_NAME),
        extra_args=None,
    )
Example 8
def test_download_with_kms_key(sagemaker_session):
    s3_uri = os.path.join("s3://", BUCKET_NAME, CURRENT_JOB_NAME, SOURCE_NAME)
    S3Downloader.download(s3_uri=s3_uri,
                          local_path="/path/for/download/",
                          kms_key=KMS_KEY,
                          session=sagemaker_session)
    sagemaker_session.download_data.assert_called_with(
        path="/path/for/download/",
        bucket=BUCKET_NAME,
        key_prefix=os.path.join(CURRENT_JOB_NAME, SOURCE_NAME),
        extra_args={"SSECustomerKey": KMS_KEY},
    )
Example 9
def test_s3_uploader_and_downloader_downloads_files_when_given_directory_uris_with_files(
    sagemaker_session, s3_files_kms_key
):
    my_uuid = str(uuid.uuid4())

    file_1_body = "First File Body {}.".format(my_uuid)
    file_1_name = "first_file_{}.txt".format(my_uuid)
    file_2_body = "Second File Body {}.".format(my_uuid)
    file_2_name = "second_file_{}.txt".format(my_uuid)

    base_s3_uri = os.path.join(
        "s3://", sagemaker_session.default_bucket(), "integ-test-test-s3-list", my_uuid
    )
    file_1_s3_uri = os.path.join(base_s3_uri, file_1_name)
    file_2_s3_uri = os.path.join(base_s3_uri, file_2_name)

    S3Uploader.upload_string_as_file_body(
        body=file_1_body,
        desired_s3_uri=file_1_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    S3Uploader.upload_string_as_file_body(
        body=file_2_body,
        desired_s3_uri=file_2_s3_uri,
        kms_key=s3_files_kms_key,
        sagemaker_session=sagemaker_session,
    )

    s3_uris = S3Downloader.list(s3_uri=base_s3_uri, sagemaker_session=sagemaker_session)

    assert file_1_name in s3_uris[0]
    assert file_2_name in s3_uris[1]

    assert file_1_body == S3Downloader.read_file(
        s3_uri=s3_uris[0], sagemaker_session=sagemaker_session
    )
    assert file_2_body == S3Downloader.read_file(
        s3_uri=s3_uris[1], sagemaker_session=sagemaker_session
    )

    S3Downloader.download(
        s3_uri=base_s3_uri, local_path=TMP_BASE_PATH, sagemaker_session=sagemaker_session
    )

    with open(os.path.join(TMP_BASE_PATH, file_1_name), "r") as f:
        assert file_1_body == f.read()

    with open(os.path.join(TMP_BASE_PATH, file_2_name), "r") as f:
        assert file_2_body == f.read()
Example 10
    def _download_clarify_xai_summary(self):

        summary_uri = f"s3://{self.bucket}/{self.xai_prefix}/analysis.json"

        try:
            S3Downloader.download(summary_uri, os.getcwd())

            with open('analysis.json', 'r') as f:
                summary = json.loads(f.read())

            return summary

        except Exception as e:
            print(f"{e}: Failed to download {summary_uri}")
def get_embeddings(training_job_name: str, download_location: str = './model-artifacts'):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)
    # download the entity embeddings produced by the training job

    S3Downloader.download(os.path.join(training_job_s3_output, "embeddings/"),
                          os.path.join(download_location, "embeddings/"))

    entity_emb = np.load(os.path.join(download_location, "embeddings", "entity.npy"))

    return entity_emb
Example 12
def get_performance_metrics(training_job_name: str, download_location: str = './model-artifacts'):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)
    # download the evaluation metrics summary

    S3Downloader.download(os.path.join(training_job_s3_output, "eval_metrics_info.json"),
                          download_location)

    with open(os.path.join(download_location, "eval_metrics_info.json")) as f:
        metrics = json.load(f)

    return metrics
Example 13
def get_predictions(training_job_name: str, download_location: str = './model-artifacts', class_preds: bool = False):
    training_job_s3_output = get_modeltraining_job_output_location(training_job_name)
    if not training_job_s3_output:
        return

    download_location = os.path.join(download_location, training_job_name)
    os.makedirs(download_location, exist_ok=True)
    # download the prediction results

    S3Downloader.download(os.path.join(training_job_s3_output, "predictions/"),
                          os.path.join(download_location, "predictions/"))

    preds = np.load(os.path.join(download_location, "predictions", "result.npz"))['infer_scores']

    if class_preds:
        return preds.argmax(axis=1)

    return preds
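A sketch of how these Neptune ML helpers might be chained (the job name is hypothetical; `get_modeltraining_job_output_location` is assumed to resolve the job's S3 output location):

job = "my-modeltraining-job"                      # hypothetical job name
emb = get_embeddings(job)                         # entity embeddings as a NumPy array
mapping = get_node_to_idx_mapping(training_job_name=job)
metrics = get_performance_metrics(job)
preds = get_predictions(job, class_preds=True)    # per-node predicted class indices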
Example 14
    def _inject_repack_script(self):
        """Injects the _repack_model.py script where it belongs.

        If the source_dir is an S3 path:
            1) downloads the source_dir tar.gz
            2) copies the _repack_model.py script where it belongs
            3) uploads the mutated source_dir

        If the source_dir is a local path:
            1) copies the _repack_model.py script into the source dir
        """
        fname = os.path.join(os.path.dirname(__file__), REPACK_SCRIPT)
        if self._source_dir.lower().startswith("s3://"):
            with tempfile.TemporaryDirectory() as tmp:
                local_path = os.path.join(tmp, "local.tar.gz")

                S3Downloader.download(
                    s3_uri=self._source_dir,
                    local_path=local_path,
                    sagemaker_session=self._estimator.sagemaker_session,
                )

                src_dir = os.path.join(tmp, "src")
                with tarfile.open(name=local_path, mode="r:gz") as tf:
                    tf.extractall(path=src_dir)

                shutil.copy2(fname, os.path.join(src_dir, REPACK_SCRIPT))
                with tarfile.open(name=local_path, mode="w:gz") as tf:
                    tf.add(src_dir, arcname=".")

                S3Uploader.upload(
                    local_path=local_path,
                    desired_s3_uri=self._source_dir,
                    sagemaker_session=self._estimator.sagemaker_session,
                )
        else:
            shutil.copy2(fname, os.path.join(self._source_dir, REPACK_SCRIPT))