Example #1
# NOTE: imports below are inferred from the calls in this snippet; helpers
# such as keras_predict and estimator_predict are assumed to be defined
# elsewhere in the same module.
import tensorflow as tf

import sqlflow_submitter
from sqlflow_submitter import db


def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         result_col_name,
         feature_metas={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    # import custom model package
    sqlflow_submitter.import_model_def(estimator_string, globals())
    estimator = eval(estimator_string)

    if not is_pai:
        # open a connection to the data source (not needed on PAI)
        conn = db.connect_with_data_source(datasource)
    model_params.update(feature_columns)

    is_estimator = issubclass(
        estimator,
        (tf.estimator.Estimator, tf.estimator.BoostedTreesClassifier,
         tf.estimator.BoostedTreesRegressor))
    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table, is_pai,
                      pai_table, feature_column_names, feature_metas,
                      result_col_name, datasource, select, hdfs_namenode_addr,
                      hive_location, hdfs_user, hdfs_pass)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_column_names_map,
                          feature_columns, feature_metas, result_col_name,
                          datasource, select, hdfs_namenode_addr,
                          hive_location, hdfs_user, hdfs_pass, is_pai,
                          pai_table)

    print("Done predicting. Predict table : %s" % result_table)
Example #2
# NOTE: imports below are inferred from the calls in this snippet; helpers
# such as is_tf_estimator, set_log_level, get_dataset_fn,
# connect_with_data_source, estimator_evaluate, keras_evaluate,
# init_model_with_feature_column and write_result_metrics are assumed to be
# defined elsewhere in the same module.
import sys

import tensorflow as tf

import sqlflow_submitter


def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             validation_metrics=["Accuracy"],
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             hdfs_namenode_addr="",
             hive_location="",
             hdfs_user="",
             hdfs_pass="",
             is_pai=False,
             pai_table=""):
    # import custom model package
    sqlflow_submitter.import_model_def(estimator_string, globals())
    estimator_cls = eval(estimator_string)

    is_estimator = is_tf_estimator(estimator_cls)

    set_log_level(verbose, is_estimator)

    eval_dataset, _ = get_dataset_fn(select,
                                     "",
                                     datasource,
                                     feature_column_names,
                                     feature_metas,
                                     label_meta,
                                     is_pai,
                                     pai_table,
                                     "",
                                     1,
                                     batch_size,
                                     1,
                                     is_estimator=is_estimator)

    model_params.update(feature_columns)
    if is_estimator:
        if is_pai:
            FLAGS = tf.app.flags.FLAGS
            model_params["model_dir"] = FLAGS.checkpointDir
        else:
            model_params["model_dir"] = save
        # TF estimators always take feature columns as constructor arguments
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    # write result metrics to a table
    if is_pai:
        driver = "pai_maxcompute"
        conn = None
    else:
        conn = connect_with_data_source(datasource)
        driver = conn.driver

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        write_result_metrics(result_metrics,
                             metric_name_list,
                             result_table,
                             driver,
                             conn,
                             hdfs_namenode_addr=hdfs_namenode_addr,
                             hive_location=hive_location,
                             hdfs_user=hdfs_user,
                             hdfs_pass=hdfs_pass)
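
A minimal usage sketch, not taken from the source; the label_meta dict shape is an assumption based on how get_dataset_fn consumes it above, and all literals are placeholders.

# Hypothetical call; every literal value below is a placeholder.
evaluate(datasource="mysql://root:root@tcp(127.0.0.1:3306)/?maxAllowedPacket=0",
         estimator_string="tf.estimator.DNNClassifier",
         select="SELECT * FROM iris.test",
         result_table="iris.evaluate_result",
         feature_columns={"feature_columns": []},  # fill with tf.feature_column objects
         feature_column_names=["sepal_length", "sepal_width"],
         label_meta={"feature_name": "class", "dtype": "int64", "shape": []},
         model_params={"hidden_units": [10, 10], "n_classes": 3},
         validation_metrics=["Accuracy"],
         save="model_save",
         batch_size=1)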
Example #3
# NOTE: imports below are inferred from the calls in this snippet; helpers
# such as collect_model_metadata, is_tf_estimator, set_log_level,
# define_tf_flags, set_oss_environs, get_dataset_fn, keras_train_and_save and
# estimator_train_and_save are assumed to be defined elsewhere in the same
# module.
import glob
import os
import types

import sqlflow_submitter


def train(datasource,
          estimator_string,
          select,
          validation_select,
          feature_columns,
          feature_column_names,
          feature_metas={},
          label_meta={},
          model_params={},
          validation_metrics=["Accuracy"],
          save="",
          batch_size=1,
          epoch=1,
          validation_steps=1,
          verbose=0,
          max_steps=None,
          validation_start_delay_secs=0,
          validation_throttle_secs=0,
          save_checkpoints_steps=100,
          log_every_n_iter=10,
          load_pretrained_model=False,
          is_pai=False,
          pai_table="",
          pai_val_table="",
          feature_columns_code="",
          model_repo_image="",
          original_sql=""):
    model_meta = collect_model_metadata(original_sql, select,
                                        validation_select, estimator_string,
                                        model_params, feature_columns_code,
                                        feature_metas, label_meta, None,
                                        model_repo_image)
    # import custom model package
    sqlflow_submitter.import_model_def(estimator_string, globals())
    estimator = eval(estimator_string)

    is_estimator = is_tf_estimator(estimator)

    if is_pai and verbose < 1:  # ensure verbose >= 1 on PAI to surface more logs
        verbose = 1
    set_log_level(verbose, is_estimator)
    # fill in feature columns parameters
    model_params.update(feature_columns)

    FLAGS = None
    num_workers = 1
    worker_id = 0
    # distributed training is only supported on PAI (TF 1.x)
    if is_pai:
        FLAGS = define_tf_flags()
        set_oss_environs(FLAGS)
        num_workers = len(FLAGS.worker_hosts.split(","))
        worker_id = FLAGS.task_index

    # TODO(typhoonzero): remove this after update the keras models.
    # copy feature_name to name field for Keras functional models:
    # https://github.com/sql-machine-learning/models/blob/develop/sqlflow_models/dnnclassifier_functional_api_example.py
    for k in feature_metas:
        feature_metas[k]["name"] = feature_metas[k]["feature_name"]

    train_dataset_fn, val_dataset_fn = get_dataset_fn(
        select,
        validation_select,
        datasource,
        feature_column_names,
        feature_metas,
        label_meta,
        is_pai,
        pai_table,
        pai_val_table,
        epoch,
        batch_size,
        1000,
        num_workers=num_workers,
        worker_id=worker_id,
        is_estimator=is_estimator)

    if not is_estimator:  # keras
        if isinstance(estimator, types.FunctionType):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        keras_train_and_save(estimator, model_params, save, is_pai, FLAGS,
                             train_dataset_fn, val_dataset_fn, label_meta,
                             epoch, verbose, validation_metrics,
                             validation_steps, load_pretrained_model,
                             model_meta)
    else:
        estimator_train_and_save(estimator, model_params, save, is_pai, FLAGS,
                                 train_dataset_fn, val_dataset_fn,
                                 log_every_n_iter, max_steps,
                                 validation_start_delay_secs,
                                 validation_throttle_secs,
                                 save_checkpoints_steps, validation_metrics,
                                 load_pretrained_model, model_meta)

    # remove cache files; any() drains the lazy map so os.remove actually runs
    any(map(os.remove, glob.glob('cache_train.*')))
    any(map(os.remove, glob.glob('cache_validation.*')))
    print("Done training")
Example #4
# NOTE: imports below are inferred from the calls in this snippet; helpers
# such as input_fn, init_model_with_feature_column, explain_boosted_trees and
# explain_dnns are assumed to be defined elsewhere in the same module.
import pandas as pd
import tensorflow as tf

import sqlflow_submitter


def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas={},
            label_meta={},
            model_params={},
            save="",
            is_pai=False,
            pai_table="",
            plot_type='bar',
            result_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    # import custom model package
    sqlflow_submitter.import_model_def(estimator_string, globals())
    estimator_cls = eval(estimator_string)

    if is_pai:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
    else:
        model_params['model_dir'] = save

    def _input_fn():
        if is_pai:
            dataset = input_fn("",
                               datasource,
                               feature_column_names,
                               feature_metas,
                               label_meta,
                               is_pai=True,
                               pai_table=pai_table)
        else:
            dataset = input_fn(select, datasource, feature_column_names,
                               feature_metas, label_meta)
        return dataset.batch(1).cache()

    model_params.update(feature_columns)

    estimator = init_model_with_feature_column(estimator_cls, model_params)

    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, is_pai,
                              pai_table, hdfs_namenode_addr, hive_location,
                              hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                              oss_endpoint, oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, is_pai, pai_table,
                     hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass,
                     oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
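
A minimal usage sketch, not taken from the source; a BoostedTrees estimator is chosen so the call exercises the explain_boosted_trees branch above, and all literals are placeholders.

# Hypothetical call; every literal value below is a placeholder.
explain(datasource="mysql://root:root@tcp(127.0.0.1:3306)/?maxAllowedPacket=0",
        estimator_string="tf.estimator.BoostedTreesClassifier",
        select="SELECT * FROM iris.train",
        feature_columns={"feature_columns": []},  # fill with tf.feature_column objects
        feature_column_names=["sepal_length", "sepal_width"],
        label_meta={"feature_name": "class", "dtype": "int64", "shape": []},
        model_params={"n_batches_per_layer": 1, "n_classes": 3},
        save="model_save",
        plot_type="bar",
        result_table="iris.explain_result")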