def download_file(self, key, local_path):
    try:
        util.mkdir_p(os.path.dirname(local_path))
        self.s3.download_file(self.bucket, key, local_path)
        return local_path
    except Exception as e:
        raise CortexException("bucket " + self.bucket, "key " + key) from e
def _get_dir(self, prefix, local_dir):
    prefix = util.add_suffix_unless_present(prefix, "/")
    util.mkdir_p(local_dir)
    for key in self._get_matching_s3_keys_generator(prefix):
        rel_path = util.remove_prefix_if_present(key, prefix)
        local_dest_path = os.path.join(local_dir, rel_path)
        self.download_file(key, local_dest_path)
def download_dir_from_s3(prefix, local_dir, bucket, client_config={}):
    prefix = util.add_suffix_unless_present(prefix, "/")
    util.mkdir_p(local_dir)
    for key in get_matching_s3_keys_generator(bucket, prefix, client_config=client_config):
        rel_path = util.remove_prefix_if_present(key, prefix)
        local_dest_path = os.path.join(local_dir, rel_path)
        download_file_from_s3(key, local_dest_path, bucket, client_config=client_config)
def download_file_from_s3(key, local_path, bucket, client_config={}):
    try:
        util.mkdir_p(os.path.dirname(local_path))
        s3 = s3_client(client_config)
        s3.download_file(bucket, key, local_path)
        return local_path
    except Exception as e:
        raise CortexException("bucket " + bucket, "key " + key) from e
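# A minimal usage sketch, not part of the original source: the bucket name, keys,
# and local paths below are hypothetical placeholders, and AWS credentials are
# assumed to be configured already. It only illustrates the call pattern for the
# two module-level helpers above: fetch a single object, then mirror a prefix.
downloaded_path = download_file_from_s3(
    "models/model.zip",   # S3 key (hypothetical)
    "/tmp/model.zip",     # local destination; parent directories are created
    "my-bucket",          # bucket name (hypothetical)
    client_config={},
)
download_dir_from_s3("datasets/train/", "/tmp/train", "my-bucket", client_config={})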
def download_file(self, key, local_path):
    util.mkdir_p(os.path.dirname(local_path))
    try:
        self.s3.download_file(self.bucket, key, local_path)
        return local_path
    except Exception as e:
        raise CortexException(
            'key "{}" in bucket "{}" could not be accessed; '.format(key, self.bucket)
            + "it may not exist, or you may not have sufficient permissions"
        ) from e
def write_files(self, files, base_path):
    if os.path.isdir(base_path):
        shutil.rmtree(base_path)
    for filepath, contents in files.items():
        filename = os.path.join(base_path, filepath)
        file_dir = os.path.dirname(filename)
        util.mkdir_p(file_dir)
        util.logger.debug("Writing out file %s" % filepath)
        with open(filename, 'w') as f:
            f.write(contents)
    util.logger.info("All done writing out input data")
def __init__(self, ids_or_urls=[], options={}):
    util.mkdir_p(self.test_cases_path())
    if not os.path.isfile(http_client.certs_path()):
        msg = ("You seem to have deleted the file of certificates "
               "that shipped with this repo. It should exist "
               "at %s" % http_client.certs_path())
        raise error.StripeError(msg)
    if ids_or_urls == []:
        util.logger.info('No test case supplied. Randomly choosing among defaults.')
        ids_or_urls = [SystemRandom().choice(self.DEFAULT_TEST_CASES)]
    self.test_cases = [TestCase(self, token) for token in ids_or_urls]
    self.options = options
    headers = {
        'User-Agent': 'Stripe TestHarness/%s' % (self.VERSION,),
    }
    self.http_client = http_client.new_default_http_client(
        headers=headers, verify_ssl_certs=True
    )
def train(model_name, model_impl, ctx, model_dir):
    model = ctx.models[model_name]

    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training")
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation")
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx)
    exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)

    dataset_metadata = aws.read_json_from_s3(model["dataset"]["metadata_key"], ctx.bucket)

    train_num_steps = model["training"]["num_steps"]
    if model["training"]["num_epochs"]:
        train_num_steps = (
            math.ceil(
                dataset_metadata["dataset_size"]
                * model["data_partition_ratio"]["training"]
                / float(model["training"]["batch_size"])
            )
            * model["training"]["num_epochs"]
        )

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_num_steps)

    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(
                dataset_metadata["dataset_size"]
                * model["data_partition_ratio"]["evaluation"]
                / float(model["evaluation"]["batch_size"])
            )
            * model["evaluation"]["num_epochs"]
        )

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model["name"])
    tf_lib.add_tf_types(model_config)

    try:
        estimator = model_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    if model["type"] == "regression":
        estimator = tf.contrib.estimator.add_metrics(estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    return model_dir
def download_and_extract_zip(key, local_dir, bucket, client_config={}):
    util.mkdir_p(local_dir)
    local_zip = os.path.join(local_dir, "zip.zip")
    download_file_from_s3(key, local_zip, bucket, client_config=client_config)
    util.extract_zip(local_zip, delete_zip_file=True)
def download_and_unzip(self, key, local_dir):
    util.mkdir_p(local_dir)
    local_zip = os.path.join(local_dir, "zip.zip")
    self.download_file(key, local_zip)
    util.extract_zip(local_zip, delete_zip_file=True)
def __init__(self, **kwargs):
    if "cache_dir" in kwargs:
        self.cache_dir = kwargs["cache_dir"]
    elif "local_path" in kwargs:
        local_path_dir = os.path.dirname(os.path.abspath(kwargs["local_path"]))
        self.cache_dir = os.path.join(local_path_dir, "cache")
    else:
        raise ValueError("cache_dir must be specified (or inferred from local_path)")
    util.mkdir_p(self.cache_dir)

    if "local_path" in kwargs:
        ctx_raw = util.read_msgpack(kwargs["local_path"])
        self.ctx = _deserialize_raw_ctx(ctx_raw)
    elif "obj" in kwargs:
        self.ctx = kwargs["obj"]
    elif "raw_obj" in kwargs:
        ctx_raw = kwargs["raw_obj"]
        self.ctx = _deserialize_raw_ctx(ctx_raw)
    elif "s3_path" in kwargs:
        local_ctx_path = os.path.join(self.cache_dir, "context.msgpack")
        bucket, key = S3.deconstruct_s3_path(kwargs["s3_path"])
        S3(bucket, client_config={}).download_file(key, local_ctx_path)
        ctx_raw = util.read_msgpack(local_ctx_path)
        self.ctx = _deserialize_raw_ctx(ctx_raw)
    else:
        raise ValueError("invalid context args: " + str(kwargs))

    self.workload_id = kwargs.get("workload_id")

    self.id = self.ctx["id"]
    self.key = self.ctx["key"]
    self.cortex_config = self.ctx["cortex_config"]
    self.dataset_version = self.ctx["dataset_version"]
    self.root = self.ctx["root"]
    self.raw_dataset = self.ctx["raw_dataset"]
    self.status_prefix = self.ctx["status_prefix"]
    self.app = self.ctx["app"]
    self.environment = self.ctx["environment"]
    self.python_packages = self.ctx["python_packages"]
    self.raw_columns = self.ctx["raw_columns"]
    self.transformed_columns = self.ctx["transformed_columns"]
    self.transformers = self.ctx["transformers"]
    self.aggregators = self.ctx["aggregators"]
    self.aggregates = self.ctx["aggregates"]
    self.constants = self.ctx["constants"]
    self.models = self.ctx["models"]
    self.apis = self.ctx["apis"]
    self.training_datasets = {k: v["dataset"] for k, v in self.models.items()}
    self.api_version = self.cortex_config["api_version"]

    if "local_storage_path" in kwargs:
        self.storage = LocalStorage(base_dir=kwargs["local_storage_path"])
    else:
        self.storage = S3(
            bucket=self.cortex_config["bucket"],
            region=self.cortex_config["region"],
            client_config={},
        )

    if self.api_version != consts.CORTEX_VERSION:
        raise ValueError(
            "API version mismatch (Context: {}, Image: {})".format(
                self.api_version, consts.CORTEX_VERSION
            )
        )

    self.columns = util.merge_dicts_overwrite(
        self.raw_columns, self.transformed_columns  # self.aggregates
    )

    self.values = util.merge_dicts_overwrite(self.aggregates, self.constants)

    self.raw_column_names = list(self.raw_columns.keys())
    self.transformed_column_names = list(self.transformed_columns.keys())
    self.column_names = list(self.columns.keys())

    # Internal caches
    self._transformer_impls = {}
    self._aggregator_impls = {}
    self._model_impls = {}

    # This affects Tensorflow S3 access
    os.environ["AWS_REGION"] = self.cortex_config.get("region", "")

    # Id map
    self.pp_id_map = ResourceMap(self.python_packages)
    self.rf_id_map = ResourceMap(self.raw_columns)
    self.ag_id_map = ResourceMap(self.aggregates)
    self.tf_id_map = ResourceMap(self.transformed_columns)
    self.td_id_map = ResourceMap(self.training_datasets)
    self.models_id_map = ResourceMap(self.models)
    self.apis_id_map = ResourceMap(self.apis)
    self.constants_id_map = ResourceMap(self.constants)

    self.id_map = util.merge_dicts_overwrite(
        self.pp_id_map,
        self.rf_id_map,
        self.ag_id_map,
        self.tf_id_map,
        self.td_id_map,
        self.models_id_map,
        self.apis_id_map,
        self.constants_id_map,
    )
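# A minimal construction sketch, not from the original source. It assumes the
# enclosing class above is named Context; the paths, bucket name, and workload_id
# are hypothetical. It only illustrates the kwargs-dispatch branches above: load
# from a local msgpack file (cache_dir inferred next to it), or from an s3:// path
# (which requires an explicit cache_dir and is downloaded there first).
ctx_from_file = Context(
    local_path="/mnt/workspace/context.msgpack",
    workload_id="wk-123",
)
ctx_from_s3 = Context(
    s3_path="s3://my-bucket/apps/my-app/context.msgpack",
    cache_dir="/mnt/cache",
    workload_id="wk-123",
)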
def train(model_name, estimator_impl, ctx, model_dir):
    model = ctx.models[model_name]

    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training", estimator_impl)
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation", estimator_impl)
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx, estimator_impl)
    exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)

    train_num_steps = model["training"]["num_steps"]
    dataset_metadata = ctx.get_metadata(model["dataset"]["id"])
    if model["training"]["num_epochs"]:
        train_num_steps = (
            math.ceil(dataset_metadata["training_size"] / float(model["training"]["batch_size"]))
            * model["training"]["num_epochs"]
        )

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_num_steps)

    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(dataset_metadata["eval_size"] / float(model["evaluation"]["batch_size"]))
            * model["evaluation"]["num_epochs"]
        )

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model_name)

    try:
        tf_estimator = estimator_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    target_col_name = util.get_resource_ref(model["target_column"])
    if ctx.get_inferred_column_type(target_col_name) == consts.COLUMN_TYPE_FLOAT:
        tf_estimator = tf.contrib.estimator.add_metrics(tf_estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(tf_estimator, train_spec, eval_spec)

    return model_dir
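# A small worked example, not from the original source, of the epoch-to-step
# conversion both train() variants above use when "num_epochs" is set:
# max_steps = ceil(examples_per_epoch / batch_size) * num_epochs.
# The numbers below are made up purely for illustration.
import math

training_size = 10000  # examples in the training split (hypothetical)
batch_size = 64
num_epochs = 5

steps_per_epoch = math.ceil(training_size / float(batch_size))  # 157 batches per epoch
max_steps = steps_per_epoch * num_epochs                        # 785 total training steps
print(max_steps)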