Code Example #1
    def test_hive(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "hive":
            host = "127.0.0.1"
            port = "10000"
            conn = connect(driver,
                           "iris",
                           user="******",
                           password="******",
                           host=host,
                           port=port)
            self._do_test(driver,
                          conn,
                          hdfs_namenode_addr="127.0.0.1:8020",
                          hive_location="/sqlflow")
            conn.close()

            conn = connect_with_data_source(
                "hive://*****:*****@127.0.0.1:10000/iris")
            self._do_test(driver, conn)
            self._do_test_hive_specified_db(
                driver,
                conn,
                hdfs_namenode_addr="127.0.0.1:8020",
                hive_location="/sqlflow")
            conn.close()
Code Example #2
def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formated_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)

    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for fname in feature_column_names:
        dtype = feature_specs[fname]["dtype"]
        xs[fname] = xs[fname].astype(dtype)
    return xs
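
Note: the dtype NOTE in this example can be reproduced outside SQLFlow. Below is a minimal, self-contained pandas sketch (not SQLFlow code) showing why the explicit astype cast is needed: a DataFrame built row by row with .loc keeps dtype "object" until each column is cast to its declared feature dtype.

import pandas as pd

# Build the frame the same way xgb_shap_dataset does: empty frame, rows added via .loc.
xs = pd.DataFrame(columns=["a", "b"])
for i in range(10):
    xs.loc[i] = [float(i), float(i) * 2]
print(xs.dtypes)  # both columns report dtype "object"

# Casting each column to its declared dtype restores numeric columns.
for fname, dtype in {"a": "float32", "b": "float32"}.items():
    xs[fname] = xs[fname].astype(dtype)
print(xs.dtypes)  # now float32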
Code Example #3
File: explain.py Project: zzszmyf/sqlflow
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1)

        return np.array(
            [p['probabilities'][0] for p in estimator.predict(input_fn)])

    shap_values = shap.KernelExplainer(predict,
                                       shap_dataset).shap_values(shap_dataset)
    print(shap_values)
    for row in shap_values:
        print(list(row))
        print(len(list(row)))
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type))
Code Example #4
File: train.py Project: xujiameng/sqlflow
def train(datasource, select, model_params, train_params, feature_field_meta,
          label_field_meta, validation_select):
    conn = connect_with_data_source(datasource)

    # NOTE(tony): sorting is necessary to achieve consistent feature orders between training job and prediction/analysis job
    feature_column_name = [k["name"] for k in feature_field_meta]
    label_name = label_field_meta["name"]
    feature_spec = {k['name']: k for k in feature_field_meta}

    dtrain = xgb_dataset(conn, 'train.txt', select, feature_column_name,
                         label_name, feature_spec)
    watchlist = [(dtrain, "train")]
    if len(validation_select.strip()) > 0:
        dvalidate = xgb_dataset(conn, 'validate.txt', validation_select,
                                feature_column_name, label_name, feature_spec)
        watchlist.append((dvalidate, "validate"))

    re = dict()
    bst = xgb.train(model_params,
                    dtrain,
                    **train_params,
                    evals=watchlist,
                    evals_result=re)
    bst.save_model("my_model")
    print("Evaluation result: %s" % re)
Code Example #5
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d,
                                  columns=shap_dataset.columns))).batch(1000)

        return np.array(
            [p['probabilities'][-1] for p in estimator.predict(input_fn)])

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type))
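
Note: the shap.kmeans call above is the usual way to keep KernelExplainer tractable on a large background set. Below is a minimal, self-contained sketch of the same pattern (not SQLFlow code; the estimator is replaced by a trivial sum, and only the first few rows are explained to keep it fast).

import numpy as np
import pandas as pd
import shap

# Toy background data and a stand-in for the estimator's predict function.
shap_dataset = pd.DataFrame(np.random.rand(200, 3), columns=["f1", "f2", "f3"])

def predict(d):
    return np.asarray(d).sum(axis=1)

# Summarize the background to 16 weighted samples when it is large, as above.
if len(shap_dataset) > 100:
    shap_dataset_summary = shap.kmeans(shap_dataset, 16)
else:
    shap_dataset_summary = shap_dataset
shap_values = shap.KernelExplainer(predict, shap_dataset_summary).shap_values(
    shap_dataset.iloc[:5], l1_reg="aic")
print(np.asarray(shap_values).shape)  # one SHAP value per explained row and feature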
Code Example #6
File: predict.py Project: af3dgce/sqlflow
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, label_meta,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    classifier = estimator(**model_params)
    conn = connect_with_data_source(datasource)

    def fast_input_fn(generator):
        feature_types = []
        for name in feature_column_names:
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        def _inner_input_fn():
            if is_pai:
                dataset = pai_maxcompute_input_fn(pai_table, datasource,
                                                  feature_column_names,
                                                  feature_metas, label_meta)
            else:
                dataset = tf.data.Dataset.from_generator(
                    generator,
                    (tuple(feature_types), eval(
                        "tf.%s" % label_meta["dtype"])))
                ds_mapper = functools.partial(
                    parse_sparse_feature,
                    feature_column_names=feature_column_names,
                    feature_metas=feature_metas)
                dataset = dataset.map(ds_mapper)
            dataset = dataset.batch(1).cache()
            iterator = dataset.make_one_shot_iterator()
            features = iterator.get_next()
            return features

        return _inner_input_fn

    column_names = feature_column_names[:]
    column_names.append(label_meta["feature_name"])
    fast_predictor = FastPredict(classifier, fast_input_fn)

    with buffered_db_writer(conn.driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in db_generator(conn.driver, conn, select,
                                     feature_column_names,
                                     label_meta["feature_name"],
                                     feature_metas)():
            result = fast_predictor.predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in list(result)[0]:
                row.append(str(list(result)[0]["class_ids"][0]))
            else:
                # regression predictions
                row.append(str(list(result)[0]["predictions"][0]))
            w.write(row)
Code Example #7
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table=""):
    # TODO(typhoonzero): support running on PAI without MaxCompute AK/SK connection.
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    label_name = label_meta["feature_name"]

    dpred = xgb_dataset(datasource, 'predict.txt', select, feature_metas,
                        feature_column_names, None, is_pai, pai_table, True)

    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load data
    print("Start predicting XGBoost model...")
    preds = bst.predict(dpred)

    # TODO(Yancey1989): using the train parameters to decide regression model or classifier model
    if len(preds.shape) == 2:
        # classifier result
        preds = np.argmax(np.array(preds), axis=1)
    feature_file_read = open("predict.txt", "r")

    # Copy the list so appending the label does not mutate the caller's list.
    result_column_names = feature_column_names[:]
    result_column_names.append(label_name)
    line_no = 0
    if is_pai:
        driver = "pai_maxcompute"
        conn = None
    else:
        driver = conn.driver
    with db.buffered_db_writer(driver,
                               conn,
                               result_table,
                               result_column_names,
                               100,
                               hdfs_namenode_addr=hdfs_namenode_addr,
                               hive_location=hive_location,
                               hdfs_user=hdfs_user,
                               hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            row = [i.split(":")[1] for i in line.replace("\n", "").split("\t")]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
    print("Done predicting. Predict table : %s" % result_table)
Code Example #8
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, label_meta, datasource,
                  select, hdfs_namenode_addr, hive_location, hdfs_user,
                  hdfs_pass):
    classifier = estimator(**model_params)
    classifier_pkg = sys.modules[estimator.__module__]

    conn = connect_with_data_source(datasource)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        gen = db_generator(conn.driver, conn, select, feature_column_names,
                           label_meta["feature_name"], feature_metas)
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of features and predict results
    #       to insert into result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict one batch to initialize parameters
    # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch[0])
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
    buff_rows = []
    column_names = feature_column_names[:]
    column_names.append(label_meta["feature_name"])
    with buffered_db_writer(conn.driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features[0])
            result = classifier_pkg.prepare_prediction_column(result[0])
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[0][name].numpy()[0][0]
                row.append(str(val))
            row.append(str(result))
            w.write(row)
    del pred_dataset
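
Note: the "predict one batch, then load_weights" step above is needed because a subclassed Keras model creates its variables only on its first call. A minimal tf.keras sketch of that pattern (TinyModel and the file name are illustrative, not SQLFlow code):

import numpy as np
import tensorflow as tf

class TinyModel(tf.keras.Model):
    # Minimal subclassed model; its weights do not exist until it is called once.
    def __init__(self):
        super().__init__()
        self.dense = tf.keras.layers.Dense(1)

    def call(self, inputs):
        return self.dense(inputs)

one_batch = np.zeros((1, 4), dtype="float32")

model = TinyModel()
model.predict_on_batch(one_batch)      # first call builds the weights
model.save_weights("tiny_weights.h5", save_format="h5")

restored = TinyModel()
restored.predict_on_batch(one_batch)   # build before load_weights, as keras_predict does
restored.load_weights("tiny_weights.h5")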
Code Example #9
def keras_train_and_save(estimator, model_params, save, feature_column_names,
                         feature_metas, label_meta, datasource, select,
                         validate_select, batch_size, epochs, verbose):
    classifier = estimator(**model_params)
    classifier_pkg = sys.modules[estimator.__module__]
    if hasattr(classifier_pkg, "eval_metrics_fn"):
        metrics_functions = classifier_pkg.eval_metrics_fn()
        metrics = []
        for key, func in metrics_functions.items():
            func.__name__ = key
            metrics.append(func)
    else:
        metrics = ["accuracy"]

    conn = connect_with_data_source(datasource)
    # FIXME(typhoonzero): find a way to cache to local file and avoid cache lockfile already exists issue.
    train_dataset = input_fn(select, conn, feature_column_names, feature_metas,
                             label_meta)
    train_dataset = train_dataset.shuffle(SHUFFLE_SIZE).batch(
        batch_size).cache()
    if validate_select != "":
        validate_dataset = input_fn(validate_select, conn,
                                    feature_column_names, feature_metas,
                                    label_meta).batch(batch_size).cache()

    classifier.compile(optimizer=classifier_pkg.optimizer(),
                       loss=classifier_pkg.loss,
                       metrics=metrics)
    if hasattr(classifier, 'sqlflow_train_loop'):
        classifier.sqlflow_train_loop(train_dataset)
    else:
        if label_meta["feature_name"] != "" and validate_select != "":
            history = classifier.fit(train_dataset,
                                     epochs=epochs if epochs else
                                     classifier.default_training_epochs(),
                                     validation_data=validate_dataset,
                                     verbose=verbose)
        else:
            history = classifier.fit(train_dataset,
                                     epochs=epochs if epochs else
                                     classifier.default_training_epochs(),
                                     verbose=verbose)
        train_keys = []
        val_keys = []
        for k in history.history.keys():
            if k.startswith("val_"):
                val_keys.append(k)
            else:
                train_keys.append(k)
        print("====== Result for training set: ======")
        for k in train_keys:
            print("%s: %s" % (k, history.history[k][-1]))
        print("====== Result for validation set: ======")
        for k in val_keys:
            print("%s: %s" % (k, history.history[k][-1]))
    classifier.save_weights(save, save_format="h5")
Code Example #10
File: explain.py Project: af3dgce/sqlflow
 def _input_fn():
     if is_pai:
         dataset = pai_maxcompute_input_fn(pai_table, datasource,
                                           feature_column_names,
                                           feature_metas, label_meta)
     else:
         conn = connect_with_data_source(datasource)
         dataset = input_fn(select, conn, feature_column_names,
                            feature_metas, label_meta)
     return dataset.batch(1).cache()
Code Example #11
File: predict.py Project: zhikecore/sqlflow
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None,
         transform_fn=None,
         feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    label_name = label_meta["feature_name"]
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_specs=feature_metas,
        feature_column_names=feature_column_names,
        label_spec=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load data
    print("Start predicting XGBoost model...")

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        selected_cols = db.pai_selected_cols(pai_table)
    else:
        selected_cols = db.selected_cols(conn.driver, conn, select)

    feature_file_id = 0
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols, label_name,
                                 is_pai, conn, result_table,
                                 hdfs_namenode_addr, hive_location, hdfs_user,
                                 hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table : %s" % result_table)
Code Example #12
File: explain.py Project: jeverding/sqlflow
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_spec,
            summary_params,
            result_table="",
            is_pai=False,
            pai_explain_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                         feature_field_meta, is_pai, pai_explain_table)

    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if result_table != "":
        if is_pai:
            # TODO(typhoonzero): the shape of shap_values is (3, num_samples, num_features)
            # use the first dimension here, should find out how to use the other two.
            write_shap_values(shap_values[0], "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values[0], conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
        return

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), is_pai, oss_dest, oss_ak,
            oss_sk, oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), is_pai,
            oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Code Example #13
File: input_fn.py Project: lzj000/sqlflow
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table="",
             num_workers=1,
             worker_id=0):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])
    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        # Return directly; nothing below this point in the PAI branch is reachable.
        return pai_dataset(pai_table,
                           feature_column_names,
                           label_meta,
                           feature_metas,
                           slice_id=worker_id,
                           slice_count=num_workers)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn.driver, conn, select, feature_column_names,
                              label_meta, feature_metas)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    gen = tf_generator(gen, selected_cols, feature_column_names, feature_metas)

    # Clustering model do not have label
    if not label_meta or label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
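
Note: the tf.data.Dataset.from_generator calls above pass the dataset signature as (tuple of feature dtypes, label dtype) and (tuple of feature shapes, label shape). A minimal sketch of that nesting with a toy generator (not SQLFlow code):

import tensorflow as tf

def gen():
    # Yields (features_tuple, label), like the wrapped database generator does.
    for i in range(4):
        yield (float(i), float(i) * 2.0), i % 2

dataset = tf.data.Dataset.from_generator(
    gen,
    ((tf.float32, tf.float32), tf.int64),  # (tuple of feature dtypes, label dtype)
    (((), ()), ()))                        # (tuple of feature shapes, label shape)
for features, label in dataset:
    print(features, label)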
Code Example #14
 def validate_input_fn():
     if is_pai:
         validate_dataset = pai_maxcompute_input_fn(
             pai_val_table, datasource, feature_column_names, feature_metas,
             label_meta, len(FLAGS.worker_hosts), FLAGS.task_index)
     else:
         conn = connect_with_data_source(datasource)
         validate_dataset = input_fn(validate_select, conn,
                                     feature_column_names, feature_metas,
                                     label_meta)
     validate_dataset = validate_dataset.batch(batch_size)
     return validate_dataset
Code Example #15
def pred(datasource,
         select,
         feature_field_meta,
         label_field_meta,
         result_table,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    conn = connect_with_data_source(datasource)

    feature_column_names = [k["name"] for k in feature_field_meta]
    label_name = label_field_meta["name"]

    feature_specs = {k['name']: k for k in feature_field_meta}

    dpred = xgb_dataset(conn, 'predict.txt', select, feature_column_names,
                        label_name, feature_specs)

    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load data
    preds = bst.predict(dpred)

    # TODO(Yancey1989): using the train parameters to decide regression model or classifier model
    if len(preds.shape) == 2:
        # classifier result
        preds = np.argmax(np.array(preds), axis=1)
    feature_file_read = open("predict.txt", "r")

    # Copy the list so appending the label does not mutate the caller's list.
    result_column_names = feature_column_names[:]
    result_column_names.append(label_name)
    line_no = 0
    with buffered_db_writer(conn.driver,
                            conn,
                            result_table,
                            result_column_names,
                            100,
                            hdfs_namenode_addr=hdfs_namenode_addr,
                            hive_location=hive_location,
                            hdfs_user=hdfs_user,
                            hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            row = [
                i.split(":")[1] for i in line.replace("\n", "").split("\t")[1:]
            ]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
    print("Done predicting. Predict table : %s" % result_table)
Code Example #16
File: explain.py Project: xujiameng/sqlflow
def xgb_shap_dataset(datasource, select, feature_column_names, label_name,
                     feature_specs):
    conn = connect_with_data_source(datasource)
    stream = db_generator(conn.driver, conn, select, feature_column_names,
                          label_name, feature_specs)
    xs = pd.DataFrame(columns=feature_column_names)
    ys = pd.DataFrame(columns=[label_name])
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        ys.loc[i] = row[1]
        i += 1
    return xs
Code Example #17
File: db_test.py Project: wangjili/sqlflow
    def test_mysql(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "mysql":
            user = os.environ.get('SQLFLOW_TEST_DB_MYSQL_USER') or "root"
            password = os.environ.get('SQLFLOW_TEST_DB_MYSQL_PASSWD') or "root"
            host = "127.0.0.1"
            port = "3306"
            database = "iris"
            conn = connect(driver, database, user=user, password=password, host=host, port=port)
            self._do_test(driver, conn)

            conn = connect_with_data_source("mysql://*****:*****@tcp(127.0.0.1:3306)/iris?maxAllowedPacket=0")
            self._do_test(driver, conn)
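
Note: the data source string used by connect_with_data_source in this test appears to follow a Go-driver style MySQL DSN, "mysql://user:password@tcp(host:port)/dbname?params". A hedged sketch of assembling it (build_mysql_datasource is an illustrative helper, not part of SQLFlow):

def build_mysql_datasource(user, password, host="127.0.0.1", port="3306",
                           database="iris", max_allowed_packet=0):
    # Mirrors the DSN shape seen in the test above; assumed, not an official API.
    return "mysql://%s:%s@tcp(%s:%s)/%s?maxAllowedPacket=%d" % (
        user, password, host, port, database, max_allowed_packet)

# e.g. conn = connect_with_data_source(build_mysql_datasource("root", "root"))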
Code Example #18
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1):
    if is_pai:
        for dmatrix in pai_dataset(
                fn,
                feature_specs,
                feature_column_names,
                label_spec,
                "odps://{}/tables/{}".format(*pai_table.split(".")),
                pai_single_file,
                cache,
                rank,
                nworkers,
                batch_size=batch_size):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn.driver, conn, dataset_sql, feature_column_names,
                          label_spec, feature_specs)()

    selected_cols = db.selected_cols(conn.driver, conn, dataset_sql)
    for i in range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name, gen, feature_column_names,
                                    feature_specs, label_spec, selected_cols)

        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)

            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name, gen,
                                        feature_column_names, feature_specs,
                                        label_spec, selected_cols)
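
Note: the '{0}#{0}.cache' path passed to load_dmatrix above uses XGBoost's external-memory URI convention, where the text after '#' names an on-disk cache prefix. A minimal standalone sketch of that convention (the file name is illustrative; recent XGBoost releases may additionally require a ?format=libsvm suffix or the DataIter-based interface):

import xgboost as xgb

# Write a tiny LibSVM-style file like dump_dmatrix does: label, then index:value pairs.
with open("demo.txt", "w") as f:
    for i in range(5):
        f.write("%d 0:%f 1:%f\n" % (i % 2, i * 0.1, i * 0.2))

# "file#prefix.cache" asks XGBoost to stream the file through an external-memory cache.
dmatrix = xgb.DMatrix("{0}#{0}.cache".format("demo.txt"))
print(dmatrix.num_row(), dmatrix.num_col())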
Code Example #19
File: predict.py Project: Joejiong/sqlflow
def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         result_col_name,
         feature_metas={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    # import custom model package
    model_import_name = sqlflow_submitter.import_model_def(estimator_string)
    estimator = eval(estimator_string)

    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    model_params.update(feature_columns)

    is_estimator = issubclass(
        estimator,
        (tf.estimator.Estimator, tf.estimator.BoostedTreesClassifier,
         tf.estimator.BoostedTreesRegressor))
    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table, is_pai,
                      pai_table, feature_column_names, feature_metas,
                      result_col_name, datasource, select, hdfs_namenode_addr,
                      hive_location, hdfs_user, hdfs_pass)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_column_names_map,
                          feature_columns, feature_metas, result_col_name,
                          datasource, select, hdfs_namenode_addr,
                          hive_location, hdfs_user, hdfs_pass, is_pai,
                          pai_table)

    print("Done predicting. Predict table : %s" % result_table)
Code Example #20
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                 oss_endpoint, oss_bucket_name):
    def predict(d):
        if len(d) == 1:
            # This is to make sure the progress bar of SHAP display properly:
            # 1. The newline makes the progress bar string captured in pipe
            # 2. The ASCII control code moves cursor up twice for alignment
            print("\033[A" * 2)

        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d,
                                  columns=shap_dataset.columns))).batch(1000)

        if plot_type == 'bar':
            predictions = [
                p['logits'] if 'logits' in p else p['predictions']
                for p in estimator.predict(input_fn)
            ]
        else:
            predictions = [
                p['logits'][-1] if 'logits' in p else p['predictions'][-1]
                for p in estimator.predict(input_fn)
            ]
        return np.array(predictions)

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    explainer.plot_and_save(
        lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type),
        is_pai, oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Code Example #21
    def test_mysql(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "mysql":
            user, password, host, port, database = testing_mysql_cfg()
            conn = connect(driver,
                           database,
                           user=user,
                           password=password,
                           host=host,
                           port=port)
            self._do_test(driver, conn)

            conn = connect_with_data_source(testing_mysql_db_url())
            self._do_test(driver, conn)
Code Example #22
 def train_input_fn():
     # FIXME(typhoonzero): find a way to cache to local file and avoid cache lockfile already exists issue.
     if is_pai:
         train_dataset = pai_maxcompute_input_fn(pai_table, datasource,
                                                 feature_column_names,
                                                 feature_metas, label_meta,
                                                 len(FLAGS.worker_hosts),
                                                 FLAGS.task_index)
     else:
         conn = connect_with_data_source(datasource)
         train_dataset = input_fn(select, conn, feature_column_names,
                                  feature_metas, label_meta)
     train_dataset = train_dataset.shuffle(SHUFFLE_SIZE).batch(
         batch_size).cache().repeat(epochs if epochs else 1)
     return train_dataset
Code Example #23
File: predict.py Project: Phillweston/sqlflow
def pred(datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         result_col_name,
         feature_metas={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    if not is_pai:
        conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)

    is_estimator = issubclass(
        estimator,
        (tf.estimator.Estimator, tf.estimator.BoostedTreesClassifier,
         tf.estimator.BoostedTreesRegressor))
    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass)
    else:
        if is_pai:
            FLAGS = define_tf_flags()
            model_params["model_dir"] = FLAGS.checkpointDir
        else:
            model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_metas, result_col_name,
                          datasource, select, hdfs_namenode_addr,
                          hive_location, hdfs_user, hdfs_pass, is_pai,
                          pai_table)

    print("Done predicting. Predict table : %s" % result_table)
Code Example #24
File: explain.py Project: xujiameng/sqlflow
def explain_boosted_trees(datasource, estimator, input_fn, plot_type,
                          result_table, feature_column_names,
                          hdfs_namenode_addr, hive_location, hdfs_user,
                          hdfs_pass):
    result = estimator.experimental_predict_with_explanations(input_fn)
    pred_dicts = list(result)
    df_dfc = pd.DataFrame([pred['dfc'] for pred in pred_dicts])
    dfc_mean = df_dfc.abs().mean()
    if result_table != "":
        conn = connect_with_data_source(datasource)
        gain = estimator.experimental_feature_importances(normalize=True)
        create_explain_result_table(conn, result_table)
        write_dfc_result(dfc_mean, gain, result_table, conn,
                         feature_column_names, hdfs_namenode_addr,
                         hive_location, hdfs_user, hdfs_pass)
    explainer.plot_and_save(lambda: eval(plot_type)(df_dfc))
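
Note: df_dfc.abs().mean() above condenses the per-prediction directional feature contributions (DFCs) into one mean absolute contribution per feature. A tiny pandas sketch of just that aggregation step (toy numbers, not SQLFlow code):

import pandas as pd

# Each row holds one prediction's per-feature contributions (values may be negative).
df_dfc = pd.DataFrame({"sepal_length": [0.2, -0.1, 0.3],
                       "petal_width": [-0.4, 0.5, -0.2]})
dfc_mean = df_dfc.abs().mean()  # mean absolute contribution per feature
print(dfc_mean.sort_values(ascending=False))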
Code Example #25
File: input_fn.py Project: jeverding/sqlflow
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table=""):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            feature_types.append((tf.int64, tf.int32, tf.int64))
            shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])
    if is_pai:
        pai_table_parts = pai_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        gen = pai_maxcompute_db_generator(formated_pai_table,
                                          feature_column_names,
                                          label_meta["feature_name"],
                                          feature_metas)
    else:
        conn = connect_with_data_source(datasource)
        gen = db_generator(conn.driver, conn, select, feature_column_names,
                           label_meta, feature_metas)
    # Clustering model do not have label
    if label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
Code Example #26
File: evaluate.py Project: zhikecore/sqlflow
def evaluate(datasource,
             select,
             feature_metas,
             feature_column_names,
             label_meta,
             result_table,
             validation_metrics=["accuracy_score"],
             is_pai=False,
             hdfs_namenode_addr="",
             hive_location="",
             hdfs_user="",
             hdfs_pass="",
             pai_table="",
             model_params=None,
             transform_fn=None,
             feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    dpred = xgb_dataset(datasource,
                        'predict.txt',
                        select,
                        feature_metas,
                        feature_column_names,
                        label_meta,
                        is_pai,
                        pai_table,
                        True,
                        True,
                        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
                        transform_fn=transform_fn,
                        feature_column_code=feature_column_code
                        )  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("Start evaluating XGBoost model...")
    feature_file_id = 0
    for pred_dmatrix in dpred:
        evaluate_and_store_result(bst, pred_dmatrix, feature_file_id,
                                  validation_metrics, model_params,
                                  feature_column_names, label_meta, is_pai,
                                  conn, result_table, hdfs_namenode_addr,
                                  hive_location, hdfs_user, hdfs_pass)
        feature_file_id += 1
    print("Done evaluating. Result table : %s" % result_table)
Code Example #27
File: predict.py Project: zdfccDanfeng/sqlflow
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None):
    # TODO(typhoonzero): support running on PAI without MaxCompute AK/SK connection.
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    label_name = label_meta["feature_name"]
    dpred = xgb_dataset(datasource,
                        'predict.txt',
                        select,
                        feature_metas,
                        feature_column_names,
                        None,
                        is_pai,
                        pai_table,
                        True,
                        True,
                        batch_size=DEFAULT_PREDICT_BATCH_SIZE
                        )  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load data
    print("Start predicting XGBoost model...")
    feature_file_id = 0
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, feature_column_names,
                                 label_name, is_pai, conn, result_table,
                                 hdfs_namenode_addr, hive_location, hdfs_user,
                                 hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table : %s" % result_table)
Code Example #28
File: predict.py Project: CZZLEGEND/sqlflow
def pred(is_keras_model,
         datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_metas={},
         label_meta={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    if not is_pai:
        conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)

    if is_keras_model:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, label_meta,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass)
    else:
        if is_pai:
            FLAGS = define_tf_flags()
            model_params["model_dir"] = FLAGS.checkpointDir
        else:
            model_params['model_dir'] = save
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_metas, label_meta,
                          datasource, select, hdfs_namenode_addr,
                          hive_location, hdfs_user, hdfs_pass, is_pai,
                          pai_table)

    print("Done predicting. Predict table : %s" % result_table)
Code Example #29
File: dataset.py Project: xhcom-ui/sqlflow
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_specs,
                feature_column_names,
                label_spec,
                is_pai=False,
                pai_table="",
                pai_single_file=False):

    if is_pai:
        pai_dataset(fn, feature_specs, feature_column_names, label_spec,
                    "odps://{}/tables/{}".format(*pai_table.split(".")),
                    pai_single_file)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn.driver, conn, dataset_sql,
                              feature_column_names, label_spec, feature_specs)
        dump_dmatrix(fn, gen, label_spec)
    return xgb.DMatrix(fn)
Code Example #30
File: explain.py Project: jeverding/sqlflow
def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formated_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                      pai_table_parts[1])
        stream = pai_maxcompute_db_generator(formated_pai_table,
                                             feature_column_names,
                                             label_column_name, feature_specs)
    else:
        conn = connect_with_data_source(datasource)
        stream = db_generator(conn.driver, conn, select, feature_column_names,
                              label_spec, feature_specs)

    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row in stream():
        xs.loc[i] = [item[0] for item in row[0]]
        i += 1
    return xs