Example n. 1
0
def validate_dataset(ctx, raw_df, cols_to_validate):
    """Run value checks on the raw dataframe and fail loudly on violations.

    Args:
        ctx: Job context providing ``raw_dataset`` metadata and the S3 bucket.
        raw_df: Spark dataframe holding the raw dataset to validate.
        cols_to_validate: Columns whose value-check conditions should be run.

    Raises:
        UserException: If any validation condition failed on at least one row.
            Every violated condition is logged before raising.
    """
    # Total sample count comes from the dataset metadata stored alongside it in S3.
    total_row_count = aws.read_json_from_s3(ctx.raw_dataset["metadata_key"],
                                            ctx.bucket)["dataset_size"]
    # Mapping of column -> list of (condition, fail_count) for failed checks only.
    conditions_dict = spark_util.value_check_data(ctx, raw_df,
                                                  cols_to_validate)

    if conditions_dict:  # truthiness instead of len() > 0
        # Only the condition lists are needed for the log output, so iterate
        # values() directly instead of items() with an unused key.
        for cond_count_list in conditions_dict.values():
            for condition, fail_count in cond_count_list:
                logger.error(
                    "Data validation {} has been violated in {}/{} samples".
                    format(condition, fail_count, total_row_count))
        raise UserException("raw column validations failed")
Example n. 2
0
def train(model_name, model_impl, ctx, model_dir):
    """Train and evaluate a TensorFlow estimator model, exporting a SavedModel.

    Builds train/eval/serving input functions from the job context, sizes the
    number of training/evaluation steps from the dataset metadata, constructs
    the estimator via the user-supplied ``model_impl``, and runs
    ``tf.estimator.train_and_evaluate``.

    Args:
        model_name: Key of the model in ``ctx.models``.
        model_impl: User module exposing ``create_estimator(run_config, model_config)``.
        ctx: Job context (models, environment config, S3 bucket, model configs).
        model_dir: Local directory to write checkpoints and exports into.

    Returns:
        The ``model_dir`` path that now contains the trained model artifacts.

    Raises:
        UserRuntimeException: If the user's ``create_estimator`` raises.
    """
    model = ctx.models[model_name]

    # Ensure the directory exists, then clear any stale contents from a prior run.
    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    # Checkpointing/summary cadence is fully driven by the model's training config.
    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]
        ["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training")
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation")
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx)
    # Export the final trained model for serving (binary SavedModel, not text).
    exporter = tf.estimator.FinalExporter("estimator",
                                          serving_input_fn,
                                          as_text=False)

    dataset_metadata = aws.read_json_from_s3(model["dataset"]["metadata_key"],
                                             ctx.bucket)
    # If num_epochs is set it overrides num_steps: derive steps from
    # epochs * ceil(training_partition_rows / batch_size).
    train_num_steps = model["training"]["num_steps"]
    if model["training"]["num_epochs"]:
        train_num_steps = (
            math.ceil(dataset_metadata["dataset_size"] *
                      model["data_partition_ratio"]["training"] /
                      float(model["training"]["batch_size"])) *
            model["training"]["num_epochs"])

    train_spec = tf.estimator.TrainSpec(train_input_fn,
                                        max_steps=train_num_steps)

    # Same epoch-to-steps conversion for the evaluation partition.
    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(dataset_metadata["dataset_size"] *
                      model["data_partition_ratio"]["evaluation"] /
                      float(model["evaluation"]["batch_size"])) *
            model["evaluation"]["num_epochs"])

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model["name"])
    tf_lib.add_tf_types(model_config)

    # User code runs here; wrap failures so they are reported as user errors.
    try:
        estimator = model_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    # Regression models get standard regression eval metrics attached.
    if model["type"] == "regression":
        estimator = tf.contrib.estimator.add_metrics(
            estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    return model_dir
Example n. 3
0
 def get_resource_status(self, resource):
     """Fetch the persisted status JSON for *resource* from the context bucket."""
     return aws.read_json_from_s3(self.resource_status_key(resource),
                                  self.bucket)