def validate_dataset(ctx, raw_df, cols_to_validate):
    """Run value checks on the raw dataset and raise if any validation fails.

    Logs one error line per violated condition (with the number of failing
    samples out of the dataset's total row count) before raising, so the user
    sees every violation in a single run instead of one at a time.

    Args:
        ctx: Context object exposing ``raw_dataset`` metadata and ``bucket``.
        raw_df: Spark DataFrame containing the raw dataset.
        cols_to_validate: Names of the columns whose value checks to apply.

    Raises:
        UserException: If any column validation condition is violated.
    """
    total_row_count = aws.read_json_from_s3(
        ctx.raw_dataset["metadata_key"], ctx.bucket
    )["dataset_size"]
    conditions_dict = spark_util.value_check_data(ctx, raw_df, cols_to_validate)

    # An empty dict means every check passed.
    if not conditions_dict:
        return

    for column, cond_count_list in conditions_dict.items():
        for condition, fail_count in cond_count_list:
            logger.error(
                "Data validation {} has been violated in {}/{} samples".format(
                    condition, fail_count, total_row_count
                )
            )
    raise UserException("raw column validations failed")
def train(model_name, model_impl, ctx, model_dir):
    """Train and evaluate the named model with tf.estimator, exporting a
    serving-ready SavedModel via FinalExporter.

    Args:
        model_name: Key into ``ctx.models`` identifying the model to train.
        model_impl: User-provided module; ``create_estimator(run_config,
            model_config)`` is called on it to build the estimator.
        ctx: Context object (models, environment, bucket, model_config, ...).
        model_dir: Local directory used as the estimator's model_dir.

    Returns:
        The ``model_dir`` path that was trained into.

    Raises:
        UserRuntimeException: If ``model_impl.create_estimator`` raises.
    """
    model = ctx.models[model_name]

    # NOTE(review): mkdir_p followed immediately by rm_dir looks like it ends
    # with model_dir absent (the estimator then creates it). Presumably this
    # clears any stale checkpoints from a previous run — confirm the intended
    # order against util's implementation.
    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    # All checkpoint/summary cadence knobs come straight from the model's
    # "training" config block.
    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"][
            "keep_checkpoint_every_n_hours"
        ],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training")
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation")
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx)
    # Exports the final SavedModel under the name "estimator" when training ends.
    exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)

    dataset_metadata = aws.read_json_from_s3(model["dataset"]["metadata_key"], ctx.bucket)

    # num_epochs, when set, overrides num_steps: steps are derived from the
    # training partition size, batch size, and epoch count.
    train_num_steps = model["training"]["num_steps"]
    if model["training"]["num_epochs"]:
        train_num_steps = (
            math.ceil(dataset_metadata["dataset_size"] *
                      model["data_partition_ratio"]["training"] /
                      float(model["training"]["batch_size"])) *
            model["training"]["num_epochs"])

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_num_steps)

    # Same epoch-to-steps derivation for the evaluation partition.
    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(dataset_metadata["dataset_size"] *
                      model["data_partition_ratio"]["evaluation"] /
                      float(model["evaluation"]["batch_size"])) *
            model["evaluation"]["num_epochs"])

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model["name"])
    tf_lib.add_tf_types(model_config)

    try:
        estimator = model_impl.create_estimator(run_config, model_config)
    except Exception as e:
        # Surface user-code failures as a user error, keeping the cause chained.
        raise UserRuntimeException("model " + model_name) from e

    # Regression models get the extra regression metrics attached for eval.
    if model["type"] == "regression":
        estimator = tf.contrib.estimator.add_metrics(
            estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    return model_dir
def get_resource_status(self, resource):
    """Load and return the parsed JSON status stored in S3 for *resource*."""
    status_key = self.resource_status_key(resource)
    status = aws.read_json_from_s3(status_key, self.bucket)
    return status