Code example #1
import os

import joblib
from aoa.stats import stats      # AOA SDK helpers; exact import paths may vary
from aoa.util import save_plot   # with the installed aoa version (assumed here)
from nyoka import xgboost_to_pmml
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from teradataml import DataFrame, create_context
from xgboost import XGBClassifier


def train(data_conf, model_conf, **kwargs):
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None)

    feature_names = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns", "BMI", "DiPedFunc", "Age"]
    target_name = "HasDiabetes"

    # read training dataset from Teradata and convert to pandas
    train_df = DataFrame(data_conf["table"])
    train_df = train_df.select([feature_names + [target_name]])
    train_pdf = train_df.to_pandas()

    # split data into X and y
    X_train = train_pdf.drop(columns=[target_name])
    y_train = train_pdf[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb', XGBClassifier(eta=hyperparams["eta"],
                                            max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names itself, but let's store them on the pipeline for easy access later
    model.feature_names = feature_names
    model.target_name = target_name

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    xgboost_to_pmml(pipeline=model, col_names=feature_names, target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")

    from xgboost import plot_importance
    model["xgb"].get_booster().feature_names = feature_names
    plot_importance(model["xgb"].get_booster(), max_num_features=10)
    save_plot("feature_importance.png")

    feature_importance = model["xgb"].get_booster().get_score(importance_type="weight")
    stats.record_stats(train_df,
                       features=feature_names,
                       predictors=["HasDiabetes"],
                       categorical=["HasDiabetes"],
                       importance=feature_importance,
                       category_labels={"HasDiabetes": {0: "false", 1: "true"}})
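
save_plot comes from the AOA SDK and its body is not shown in the snippet. A minimal sketch of the behaviour it needs here, assuming matplotlib is the plotting backend (which xgboost's plot_importance uses); the helper name is real, but this body is an assumption:

import matplotlib.pyplot as plt


def save_plot(filename, path="artifacts/output"):
    # assumed stand-in for the aoa SDK helper: persist the current
    # matplotlib figure alongside the other model artefacts, then clear it
    plt.savefig(f"{path}/{filename}", dpi=150, bbox_inches="tight")
    plt.clf()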
Code example #2
import urllib.request

import joblib
from nyoka import xgboost_to_pmml
from pyspark.sql import SparkSession
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

spark = SparkSession.builder.getOrCreate()   # the demo assumes a live SparkSession


def train(data_conf, model_conf, **kwargs):
    hyperparams = model_conf["hyperParameters"]

    feature_names = [
        "NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns",
        "BMI", "DiPedFunc", "Age"
    ]
    target_name = "HasDiabetes"

    # in a real-world scenario you would read from S3, HDFS, Teradata, etc.;
    # for this demo we fetch a URL. pandas.read_csv would also work, but this shows the PySpark route
    urllib.request.urlretrieve(data_conf["url"], "/tmp/data.csv")
    all_columns = feature_names + [target_name]
    train_df = spark.read.format("csv")\
        .option("inferSchema", "true")\
        .load("/tmp/data.csv")\
        .toDF(*all_columns)

    # do feature engineering in Spark (joins, etc.) for whatever reason you're using PySpark,
    # then split into train and test and keep the 70% training portion
    train_df = train_df.randomSplit([0.7, 0.3], 42)[0].toPandas()

    # split data into X and y
    X_train = train_df.drop(columns=[target_name])
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb',
                       XGBClassifier(eta=hyperparams["eta"],
                                     max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names itself, but let's store them on the pipeline for easy access
    model.feature_names = feature_names

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    xgboost_to_pmml(pipeline=model,
                    col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")
Code example #3
import os

import joblib
from nyoka import xgboost_to_pmml
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from teradataml import DataFrame, create_context
from xgboost import XGBClassifier


def train(data_conf, model_conf, **kwargs):
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    feature_names = [
        "NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns",
        "BMI", "DiPedFunc", "Age"
    ]
    target_name = "HasDiabetes"

    # read training dataset from Teradata and convert to pandas
    train_df = DataFrame(data_conf["table"])
    train_df = train_df.select([feature_names + [target_name]])
    train_df = train_df.to_pandas()

    # split data into X and y
    X_train = train_df.drop(columns=[target_name])
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb',
                       XGBClassifier(eta=hyperparams["eta"],
                                     max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names itself, but let's store them on the pipeline for easy access later
    model.feature_names = feature_names
    model.target_name = target_name

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    xgboost_to_pmml(pipeline=model,
                    col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")
Code example #4
File: training.py / Project: trishlugtu/AoaDemoModels
import joblib
from nyoka import xgboost_to_pmml
from pyspark.sql import SparkSession
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

spark = SparkSession.builder.getOrCreate()   # the demo assumes a live SparkSession


def train(data_conf, model_conf, **kwargs):
    hyperparams = model_conf["hyperParameters"]

    feature_names = [
        "NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns",
        "BMI", "DiPedFunc", "Age"
    ]
    target_name = "HasDiabetes"

    train_df = read_dataframe(spark, data_conf["url"])

    # do feature engineering in Spark (joins, etc.) for whatever reason you're using PySpark,
    # then split into train and test and keep the 70% training portion
    train_df = train_df.randomSplit([0.7, 0.3], 42)[0].toPandas()

    # split data into X and y
    X_train = train_df.drop(columns=[target_name])
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb',
                       XGBClassifier(eta=hyperparams["eta"],
                                     max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names itself, but let's store them on the pipeline for easy access
    model.feature_names = feature_names

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    xgboost_to_pmml(pipeline=model,
                    col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")