Example #1
 def download_dir_contents(self, prefix, local_dir):
     util.mkdir_p(local_dir)
     prefix = util.ensure_suffix(prefix, "/")
     for key in self._get_matching_s3_keys_generator(prefix):
         rel_path = util.trim_prefix(key, prefix)
         local_dest_path = os.path.join(local_dir, rel_path)
         self.download_file(key, local_dest_path)
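For comparison, here is a minimal standalone sketch of the same prefix-download pattern written directly against boto3 (the function name and arguments are placeholders, and plain stdlib calls stand in for the project's util.* helpers used above):

    import os
    import boto3

    def download_prefix(bucket, prefix, local_dir):
        """Mirror every object under `prefix` into `local_dir`."""
        s3 = boto3.client("s3")
        prefix = prefix if prefix.endswith("/") else prefix + "/"
        paginator = s3.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            for obj in page.get("Contents", []):
                key = obj["Key"]
                if key.endswith("/"):
                    continue  # skip "directory" placeholder objects
                dest = os.path.join(local_dir, key[len(prefix):])
                os.makedirs(os.path.dirname(dest), exist_ok=True)
                s3.download_file(bucket, key, dest)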
Example #2
 def _get_dir(self, prefix, local_dir):
     prefix = util.add_suffix_unless_present(prefix, "/")
     util.mkdir_p(local_dir)
     for key in self._get_matching_s3_keys_generator(prefix):
         rel_path = util.remove_prefix_if_present(key, prefix)
         local_dest_path = os.path.join(local_dir, rel_path)
         self.download_file(key, local_dest_path)
Example #3
 def download_dir_contents(self, prefix: str, local_dir: str):
     util.mkdir_p(local_dir)
     prefix = util.ensure_suffix(prefix, "/")
     for blob in self.gcs.list_blobs(prefix=prefix):
         if blob.name.endswith("/"):
             continue
         relative_path = util.trim_prefix(blob.name, prefix)
         local_dest_path = os.path.join(local_dir, relative_path)
         self.download_file(blob.name, local_dest_path)
Example #4
 def download_file(self, key, local_path):
     util.mkdir_p(os.path.dirname(local_path))
     try:
         self.s3.download_file(self.bucket, key, local_path)
         return local_path
     except Exception as e:
         raise CortexException(
             'key "{}" in bucket "{}" could not be accessed; '.format(key, self.bucket)
             + "it may not exist, or you may not have sufficient permissions"
         ) from e
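A typical call site for this wrapper method, mirroring how it is used in Example #7 below (the bucket, key, and destination path here are placeholders):

    # Hypothetical values; S3 is the storage wrapper class these methods belong to.
    storage = S3("my-bucket", client_config={})
    local_path = storage.download_file("exports/model.zip", "/tmp/model.zip")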
Example #5
 def download_file_external(self, s3_path, local_path):
     util.mkdir_p(os.path.dirname(local_path))
     bucket, key = self.deconstruct_s3_path(s3_path)
     try:
         self.s3.download_file(bucket, key, local_path)
         return local_path
     except Exception as e:
         raise CortexException(
             'key "{}" in bucket "{}" could not be accessed; '.format(
                 key, bucket) +
             "it may not exist, or you may not have suffienct permissions"
         ) from e
Example #6
    def download_file(self, key: str, local_path: str):
        """
        Download file to the specified local path.
        """
        blob = self.gcs.get_blob(blob_name=key)
        if not isinstance(blob, storage.blob.Blob):
            raise CortexException(f'key "{key}" in bucket "{self.gcs.name}" not found')

        util.mkdir_p(os.path.dirname(local_path))
        try:
            blob.download_to_filename(local_path)
        except gexp.NotFound as e:
            raise CortexException(f'key "{key}" in bucket "{self.gcs.name}" not found') from e
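For reference, a rough standalone equivalent of this lookup-then-download flow using the google-cloud-storage client directly (bucket and key names are placeholders, and a plain FileNotFoundError stands in for the project's CortexException):

    import os
    from google.api_core import exceptions as gexp
    from google.cloud import storage

    def download_gcs_file(bucket_name, key, local_path):
        bucket = storage.Client().bucket(bucket_name)
        blob = bucket.get_blob(key)  # returns None when the object does not exist
        if blob is None:
            raise FileNotFoundError(f'key "{key}" in bucket "{bucket_name}" not found')
        os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
        try:
            blob.download_to_filename(local_path)
        except gexp.NotFound as e:
            raise FileNotFoundError(f'key "{key}" in bucket "{bucket_name}" not found') from e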
Example #7
    def __init__(self, **kwargs):
        if "cache_dir" in kwargs:
            self.cache_dir = kwargs["cache_dir"]
        elif "local_path" in kwargs:
            local_path_dir = os.path.dirname(os.path.abspath(kwargs["local_path"]))
            self.cache_dir = os.path.join(local_path_dir, "cache")
        else:
            raise ValueError("cache_dir must be specified (or inferred from local_path)")
        util.mkdir_p(self.cache_dir)

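        # Load the raw context from one of: a local msgpack file, an in-memory object, or S3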
        if "local_path" in kwargs:
            self.ctx = util.read_msgpack(kwargs["local_path"])
        elif "obj" in kwargs:
            self.ctx = kwargs["obj"]
        elif "raw_obj" in kwargs:
            self.ctx = kwargs["raw_obj"]
        elif "s3_path":
            local_ctx_path = os.path.join(self.cache_dir, "context.msgpack")
            bucket, key = S3.deconstruct_s3_path(kwargs["s3_path"])
            S3(bucket, client_config={}).download_file(key, local_ctx_path)
            self.ctx = util.read_msgpack(local_ctx_path)
        else:
            raise ValueError("invalid context args: " + kwargs)

        self.workload_id = kwargs.get("workload_id")

        self.id = self.ctx["id"]
        self.key = self.ctx["key"]
        self.metadata_root = self.ctx["metadata_root"]
        self.cortex_config = self.ctx["cortex_config"]
        self.deployment_version = self.ctx["deployment_version"]
        self.root = self.ctx["root"]
        self.status_prefix = self.ctx["status_prefix"]
        self.app = self.ctx["app"]
        self.apis = self.ctx["apis"] or {}
        self.api_version = self.cortex_config["api_version"]
        self.monitoring = None
        self.project_id = self.ctx["project_id"]
        self.project_key = self.ctx["project_key"]

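        # Pick the storage backend: local filesystem in local mode, otherwise S3 using the bucket/region from cortex_config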
        if "local_storage_path" in kwargs:
            self.storage = LocalStorage(base_dir=kwargs["local_storage_path"])
        else:
            self.storage = S3(
                bucket=self.cortex_config["bucket"],
                region=self.cortex_config["region"],
                client_config={},
            )

        host_ip = os.environ["HOST_IP"]
        datadog.initialize(statsd_host=host_ip, statsd_port="8125")
        self.statsd = datadog.statsd

        if self.api_version != consts.CORTEX_VERSION:
            raise ValueError(
                "API version mismatch (Context: {}, Image: {})".format(
                    self.api_version, consts.CORTEX_VERSION
                )
            )

        # This affects Tensorflow S3 access
        os.environ["AWS_REGION"] = self.cortex_config.get("region", "")

        # ID maps
        self.apis_id_map = ResourceMap(self.apis) if self.apis else None
        self.id_map = self.apis_id_map
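A possible way to construct this context, with placeholder values (as the branches above show, the constructor needs cache_dir, or infers it from local_path, plus one of local_path, obj, raw_obj, or s3_path, and optionally workload_id):

    # Placeholder S3 path, cache directory, and workload id for illustration only.
    ctx = Context(
        s3_path="s3://my-bucket/contexts/context.msgpack",
        cache_dir="/mnt/workspace/cache",
        workload_id="workload-abc123",
    )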
Example #8
 def download_and_unzip_external(self, s3_path, local_dir):
     util.mkdir_p(local_dir)
     local_zip = os.path.join(local_dir, "zip.zip")
     self.download_file_external(s3_path, local_zip)
     util.extract_zip(local_zip, delete_zip_file=True)
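util.extract_zip is a project helper; a rough standard-library equivalent of the final unzip step (assuming delete_zip_file=True means the archive is removed after extraction) would be:

    import os
    import zipfile

    def extract_and_delete(zip_path):
        # Extract the archive next to itself, then remove the archive file.
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(os.path.dirname(zip_path))
        os.remove(zip_path)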
Example #9
    def __init__(self, **kwargs):
        if "cache_dir" in kwargs:
            self.cache_dir = kwargs["cache_dir"]
        elif "local_path" in kwargs:
            local_path_dir = os.path.dirname(os.path.abspath(kwargs["local_path"]))
            self.cache_dir = os.path.join(local_path_dir, "cache")
        else:
            raise ValueError("cache_dir must be specified (or inferred from local_path)")
        util.mkdir_p(self.cache_dir)

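        # Load and deserialize the raw context from a local msgpack file, an in-memory object, or S3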
        if "local_path" in kwargs:
            ctx_raw = util.read_msgpack(kwargs["local_path"])
            self.ctx = _deserialize_raw_ctx(ctx_raw)
        elif "obj" in kwargs:
            self.ctx = kwargs["obj"]
        elif "raw_obj" in kwargs:
            ctx_raw = kwargs["raw_obj"]
            self.ctx = _deserialize_raw_ctx(ctx_raw)
        elif "s3_path":
            local_ctx_path = os.path.join(self.cache_dir, "context.msgpack")
            bucket, key = S3.deconstruct_s3_path(kwargs["s3_path"])
            S3(bucket, client_config={}).download_file(key, local_ctx_path)
            ctx_raw = util.read_msgpack(local_ctx_path)
            self.ctx = _deserialize_raw_ctx(ctx_raw)
        else:
            raise ValueError("invalid context args: " + kwargs)

        self.workload_id = kwargs.get("workload_id")

        self.id = self.ctx["id"]
        self.key = self.ctx["key"]
        self.cortex_config = self.ctx["cortex_config"]
        self.dataset_version = self.ctx["dataset_version"]
        self.root = self.ctx["root"]
        self.raw_dataset = self.ctx["raw_dataset"]
        self.status_prefix = self.ctx["status_prefix"]
        self.app = self.ctx["app"]
        self.environment = self.ctx["environment"]
        self.python_packages = self.ctx["python_packages"] or {}
        self.raw_columns = self.ctx["raw_columns"] or {}
        self.transformed_columns = self.ctx["transformed_columns"] or {}
        self.transformers = self.ctx["transformers"] or {}
        self.aggregators = self.ctx["aggregators"] or {}
        self.aggregates = self.ctx["aggregates"] or {}
        self.constants = self.ctx["constants"] or {}
        self.models = self.ctx["models"] or {}
        self.estimators = self.ctx["estimators"] or {}
        self.apis = self.ctx["apis"] or {}
        self.training_datasets = {k: v["dataset"] for k, v in self.models.items()}
        self.api_version = self.cortex_config["api_version"]

        if "local_storage_path" in kwargs:
            self.storage = LocalStorage(base_dir=kwargs["local_storage_path"])
        else:
            self.storage = S3(
                bucket=self.cortex_config["bucket"],
                region=self.cortex_config["region"],
                client_config={},
            )

        if self.api_version != consts.CORTEX_VERSION:
            raise ValueError(
                "API version mismatch (Context: {}, Image: {})".format(
                    self.api_version, consts.CORTEX_VERSION
                )
            )

        self.columns = util.merge_dicts_overwrite(self.raw_columns, self.transformed_columns)

        self.raw_column_names = list(self.raw_columns.keys())
        self.transformed_column_names = list(self.transformed_columns.keys())
        self.column_names = list(self.columns.keys())

        # Internal caches
        self._transformer_impls = {}
        self._aggregator_impls = {}
        self._estimator_impls = {}
        self._metadatas = {}
        self._obj_cache = {}
        self.spark_uploaded_impls = {}

        # This affects Tensorflow S3 access
        os.environ["AWS_REGION"] = self.cortex_config.get("region", "")

        # Id map
        self.pp_id_map = ResourceMap(self.python_packages) if self.python_packages else None
        self.rf_id_map = ResourceMap(self.raw_columns) if self.raw_columns else None
        self.ag_id_map = ResourceMap(self.aggregates) if self.aggregates else None
        self.tf_id_map = ResourceMap(self.transformed_columns) if self.transformed_columns else None
        self.td_id_map = ResourceMap(self.training_datasets) if self.training_datasets else None
        self.models_id_map = ResourceMap(self.models) if self.models else None
        self.apis_id_map = ResourceMap(self.apis) if self.apis else None
        self.constants_id_map = ResourceMap(self.constants) if self.constants else None
        self.id_map = util.merge_dicts_overwrite(
            self.pp_id_map,
            self.rf_id_map,
            self.ag_id_map,
            self.tf_id_map,
            self.td_id_map,
            self.models_id_map,
            self.apis_id_map,
            self.constants_id_map,
        )
Example #10
def train(model_name, estimator_impl, ctx, model_dir):
    model = ctx.models[model_name]

    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]
        ["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training",
                                       estimator_impl)
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation",
                                      estimator_impl)
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx,
                                                      estimator_impl)
    exporter = tf.estimator.FinalExporter("estimator",
                                          serving_input_fn,
                                          as_text=False)

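    # When num_epochs is set, convert epochs to steps: ceil(training_size / batch_size) * num_epochs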
    train_num_steps = model["training"]["num_steps"]
    dataset_metadata = ctx.get_metadata(model["dataset"]["id"])
    if model["training"]["num_epochs"]:
        train_num_steps = (math.ceil(dataset_metadata["training_size"] /
                                     float(model["training"]["batch_size"])) *
                           model["training"]["num_epochs"])

    train_spec = tf.estimator.TrainSpec(train_input_fn,
                                        max_steps=train_num_steps)

    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (math.ceil(dataset_metadata["eval_size"] /
                                    float(model["evaluation"]["batch_size"])) *
                          model["evaluation"]["num_epochs"])

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model_name)

    try:
        tf_estimator = estimator_impl.create_estimator(run_config,
                                                       model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    target_col_name = util.get_resource_ref(model["target_column"])
    if ctx.get_inferred_column_type(
            target_col_name) == consts.COLUMN_TYPE_FLOAT:
        tf_estimator = tf.contrib.estimator.add_metrics(
            tf_estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(tf_estimator, train_spec, eval_spec)

    return model_dir
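As a quick sanity check of the epochs-to-steps conversion used above (the dataset size, batch size, and epoch count are made-up numbers):

    import math

    training_size = 10_000  # rows in the training dataset (hypothetical)
    batch_size = 64
    num_epochs = 3
    steps_per_epoch = math.ceil(training_size / float(batch_size))  # 157
    train_num_steps = steps_per_epoch * num_epochs                  # 471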