Example #1
0
def train(msg):
    """Train an XGBoost classifier on the adult income dataset and pickle it."""
    # Seed both RNGs so repeated runs produce identical models.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    data_uri = msg.payload.get("$ref",
                               "./data/adult_income-prepped.csv")
    model_name = msg.payload.get("model_name")

    # Full dataset plus its pre-split train/test companion files.
    full_df = pd.read_csv(data_uri)
    train_df = pd.read_csv(data_uri.replace(".csv", "-train.csv"))
    test_df = pd.read_csv(data_uri.replace(".csv", "-test.csv"))

    # Separate the outcome column on every split.
    target = full_df["income"]
    features = full_df.drop("income", axis=1)
    y_train = train_df["income"]
    X_train_raw = train_df.drop("income", axis=1)
    y_test = test_df["income"]
    X_test_raw = test_df.drop("income", axis=1)

    # Encoder is fitted on the entire dataset, then applied to both splits
    # (the test split is encoded only to compute the accuracy metric).
    encoder = Encoder(features)
    encoder.fit(features)
    X_train = encoder.transform(X_train_raw)
    X_test = encoder.transform(X_test_raw)

    # Train the classifier and evaluate on the held-out split.
    classifier = xgb.XGBClassifier(objective="binary:logistic",
                                   random_state=RANDOM_SEED)
    classifier.fit(X_train, y_train)
    rounded = [round(p) for p in classifier.predict(X_test)]
    accuracy = accuracy_score(y_test, rounded)

    model_binary = f"models/{model_name}.pkl"
    pickle_model(
        classifier,
        encoder,
        "XGBoost",
        accuracy,
        "Extreme Gradient Boosting Classifier",
        model_binary,
    )
    print(accuracy)
    return f"model: {model_binary}"
Example #2
0
def train(msg):
    """Train a random forest regressor on the auto insurance claims data.

    Reads the full dataset plus its pre-split train/test companion files,
    fits a categorical encoder on all features, trains the regressor, scores
    it with R^2 on the test split, and pickles model + encoder.

    Args:
        msg: message whose ``payload`` maps "$ref" to the training CSV path
            and "model_name" to the name used for the saved binary.

    Returns:
        A string naming the pickled model file.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv")
    save_model_as = msg.payload.get("model_name")

    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features on each split.
    X = data.drop("Total Claim Amount", axis=1)
    y_train = train_data["Total Claim Amount"]
    X_train_df = train_data.drop("Total Claim Amount", axis=1)
    y_test = test_data["Total Claim Amount"]
    X_test_df = test_data.drop("Total Claim Amount", axis=1)

    # create encoder on entire dataset, then apply to both splits
    # (test data is encoded only to compute the metric)
    scaler = CategoricalEncoder()
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    rf = RandomForestRegressor(n_estimators=10,
                               max_depth=11,
                               bootstrap=True,
                               random_state=RANDOM_SEED)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # R^2 on the held-out split. (The original also computed a raw
    # sum-of-squared-errors that was never used; removed.)
    err = r2_score(y_test, y_pred)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(rf, scaler, "Random forest", err, "Random forest Regressor",
                 model_binary)
    print(err)
    return f"model: {model_binary}"
Example #3
0
def train(msg):
    """Train a decision tree classifier on the bank marketing dataset."""
    # Seed both RNGs so repeated runs produce identical models.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    data_uri = msg.payload.get("$ref",
                               "./data/bank_marketing-prepped.csv")
    model_name = msg.payload.get("model_name")

    # Full dataset plus its pre-split train/test companion files.
    full_df = pd.read_csv(data_uri)
    train_df = pd.read_csv(data_uri.replace(".csv", "-train.csv"))
    test_df = pd.read_csv(data_uri.replace(".csv", "-test.csv"))

    # Separate the outcome column on every split.
    target = full_df["deposit"]
    features = full_df.drop("deposit", axis=1)
    y_train = train_df["deposit"]
    X_train_raw = train_df.drop("deposit", axis=1)
    y_test = test_df["deposit"]
    X_test_raw = test_df.drop("deposit", axis=1)

    # Encoder is fitted on the entire dataset, then applied to both splits
    # (the test split is encoded only to compute the accuracy metric).
    encoder = Encoder(features)
    encoder.fit(features)
    X_train = encoder.transform(X_train_raw)
    X_test = encoder.transform(X_test_raw)

    # Train and score the tree on raw ndarray views of the frames.
    tree = DecisionTreeClassifier(criterion="entropy",
                                  random_state=RANDOM_SEED)
    tree.fit(X_train.values, y_train.values)
    accuracy = tree.score(X_test.values, y_test.values)

    model_binary = f"models/{model_name}.pkl"
    pickle_model(
        tree,
        encoder,
        "Decision Tree",
        accuracy,
        "Decision Tree Classifier",
        model_binary,
    )
    print(accuracy)
    return f"model: {model_binary}"
Example #4
0
def train(msg):
    """Train a decision tree classifier on the prepped diabetes dataset.

    Loads the dataset plus its pre-split train/test files (each run through
    ``prep_diabetes_dataset``), standard-scales the features with a scaler
    fitted on the full dataset, trains the tree, and pickles model + scaler
    with its test accuracy.

    Args:
        msg: message whose ``payload`` maps "$ref" to the CSV path and
            "model_name" to the saved model's name.

    Returns:
        A string naming the pickled model file.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/diabetes.csv")
    save_model_as = msg.payload.get("model_name")

    data = prep_diabetes_dataset(pd.read_csv(training_data_uri))
    train_data = prep_diabetes_dataset(
        pd.read_csv(training_data_uri.replace(".csv", "-train.csv")))
    test_data = prep_diabetes_dataset(
        pd.read_csv(training_data_uri.replace(".csv", "-test.csv")))

    # Separate outcome from features on each split.
    X = data.drop("Outcome", axis=1)
    y_train = train_data["Outcome"]
    X_train_df = train_data.drop("Outcome", axis=1)
    y_test = test_data["Outcome"]
    X_test_df = test_data.drop("Outcome", axis=1)

    # Scaler is fitted on the entire dataset, then applied to both splits
    # (test data is scaled only to compute the accuracy metric).
    scaler = WrappedStandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    dtree = DecisionTreeClassifier(criterion="entropy",
                                   random_state=RANDOM_SEED)
    dtree.fit(X_train, y_train)
    dtree_acc = dtree.score(X_test, y_test)
    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        dtree,
        scaler,
        "Decision Tree",
        dtree_acc,
        # Fixed: the description previously carried a stray trailing quote.
        "Basic Decision Tree model",
        model_binary,
    )
    print(dtree_acc)
    return f"model: {model_binary}"
Example #5
0
def train(msg):
    """Train a random forest classifier on the customer churn dataset.

    Reads the full dataset plus its pre-split train/test companion files,
    fits an encoder on all features, trains the forest, scores its accuracy
    on the test split, and pickles model + encoder.

    Args:
        msg: message whose ``payload`` maps "$ref" to the CSV path and
            "model_name" to the saved model's name.

    Returns:
        A string naming the pickled model file.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref",
                                        "./data/customer_churn-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features on each split.
    X = data.drop("Exited", axis=1)
    y_train = train_data["Exited"]
    X_train_df = train_data.drop("Exited", axis=1)
    y_test = test_data["Exited"]
    X_test_df = test_data.drop("Exited", axis=1)

    # create encoder on entire dataset, then apply to both splits
    # (test data is encoded only to compute the accuracy metric)
    scaler = Encoder(X)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    rfMod = RandomForestClassifier(n_estimators=10,
                                   criterion="gini",
                                   random_state=RANDOM_SEED)
    rfMod.fit(X_train, y_train)
    # Compute the model accuracy on the given test data and labels
    rf_acc = rfMod.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    # Fixed: the model was previously mislabelled "GBM" / "Gradient Boosting
    # Model" even though it is a random forest classifier.
    pickle_model(rfMod, scaler, "RF", rf_acc, "Random Forest Classifier",
                 model_binary)
    print(rf_acc)
    return f"model: {model_binary}"
Example #6
0
def train(msg):
    """Train a support vector regressor on the auto insurance claims data."""
    # Seed both RNGs so repeated runs produce identical results.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    data_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv")
    model_name = msg.payload.get("model_name")

    # Full dataset plus its pre-split train/test companion files.
    full_df = pd.read_csv(data_uri)
    train_df = pd.read_csv(data_uri.replace(".csv", "-train.csv"))
    test_df = pd.read_csv(data_uri.replace(".csv", "-test.csv"))

    # Separate the outcome column on every split.
    target = full_df["Total Claim Amount"]
    features = full_df.drop("Total Claim Amount", axis=1)
    y_train = train_df["Total Claim Amount"]
    X_train_raw = train_df.drop("Total Claim Amount", axis=1)
    y_test = test_df["Total Claim Amount"]
    X_test_raw = test_df.drop("Total Claim Amount", axis=1)

    # Encoder is fitted on the entire dataset, then applied to both splits
    # (the test split is encoded only to compute the metric).
    encoder = CategoricalEncoder()
    encoder.fit(features)
    X_train = encoder.transform(X_train_raw)
    X_test = encoder.transform(X_test_raw)

    # Fit the regressor and score it with R^2 on the held-out split.
    regressor = SVR()
    regressor.fit(X_train, y_train)
    score = r2_score(y_test, regressor.predict(X_test))

    model_binary = f"models/{model_name}.pkl"
    pickle_model(regressor, encoder, "Support vector regressor", score,
                 "Support vector regressor", model_binary)
    print(score)
    return f"model: {model_binary}"
Example #7
0
def train(msg):
    """Train a logistic regression classifier on the prepped diabetes data.

    Loads the dataset plus its pre-split train/test files (each run through
    ``prep_diabetes_dataset``), standard-scales the features with a scaler
    fitted on the full dataset, trains the model, and pickles model + scaler
    with its test accuracy.

    Args:
        msg: message whose ``payload`` maps "$ref" to the CSV path and
            "model_name" to the saved model's name.

    Returns:
        A string naming the pickled model file.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/diabetes.csv")
    save_model_as = msg.payload.get("model_name")

    data = prep_diabetes_dataset(pd.read_csv(training_data_uri))
    train_data = prep_diabetes_dataset(
        pd.read_csv(training_data_uri.replace(".csv", "-train.csv")))
    test_data = prep_diabetes_dataset(
        pd.read_csv(training_data_uri.replace(".csv", "-test.csv")))

    # Separate outcome from features on each split.
    X = data.drop("Outcome", axis=1)
    y_train = train_data["Outcome"]
    X_train_df = train_data.drop("Outcome", axis=1)
    y_test = test_data["Outcome"]
    X_test_df = test_data.drop("Outcome", axis=1)

    # Scaler is fitted on the entire dataset, then applied to both splits
    # (test data is scaled only to compute the accuracy metric).
    scaler = WrappedStandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training; score the test split once (the original scored
    # it twice and discarded the first result).
    logit = LogisticRegression(random_state=RANDOM_SEED,
                               solver="lbfgs").fit(X_train, y_train)
    logit_acc = logit.score(X_test, y_test)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(logit, scaler, "LR", logit_acc,
                 "Logistic Regression Classifier", model_binary)
    print(logit_acc)
    return f"model: {model_binary}"
Example #8
0
def train(msg):
    """Train an MLP classifier on the bank marketing dataset.

    Reads the full dataset plus its pre-split train/test companion files,
    fits an encoder on all features, trains the network, and pickles
    model + encoder with its test accuracy.

    Args:
        msg: message whose ``payload`` maps "$ref" to the CSV path and
            "model_name" to the saved model's name.

    Returns:
        A string naming the pickled model file.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref",
                                        "./data/bank_marketing-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features on each split.
    X = data.drop("deposit", axis=1)
    y_train = train_data["deposit"]
    X_train_df = train_data.drop("deposit", axis=1)
    y_test = test_data["deposit"]
    X_test_df = test_data.drop("deposit", axis=1)

    # create encoder on entire dataset, then apply to both splits
    # (test data is encoded only to compute the accuracy metric)
    scaler = Encoder(X)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training; score the test split once (the original scored
    # it twice and discarded the first result).
    mlp = MLPClassifier(random_state=RANDOM_SEED)
    mlp.fit(X_train, y_train)
    mlp_acc = mlp.score(X_test, y_test)
    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(mlp, scaler, "MLP", mlp_acc, "Basic MLP classifier",
                 model_binary)
    print(mlp_acc)
    return f"model: {model_binary}"
Example #9
0
def train(msg):
    """Train a two-hidden-layer MLP classifier on the diabetes dataset.

    Loads the dataset plus its pre-split train/test files (each run through
    ``prep_diabetes_dataset``), standard-scales the features with a scaler
    fitted on the full dataset, trains the network, and pickles
    model + scaler with its test accuracy.

    Args:
        msg: message whose ``payload`` maps "$ref" to the CSV path and
            "model_name" to the saved model's name.

    Returns:
        A string naming the pickled model file.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get("$ref", "./data/diabetes.csv")
    save_model_as = msg.payload.get("model_name")

    data = prep_diabetes_dataset(pd.read_csv(training_data_uri))
    train_data = prep_diabetes_dataset(
        pd.read_csv(training_data_uri.replace(".csv", "-train.csv")))
    test_data = prep_diabetes_dataset(
        pd.read_csv(training_data_uri.replace(".csv", "-test.csv")))

    # Separate outcome from features on each split.
    X = data.drop("Outcome", axis=1)
    y_train = train_data["Outcome"]
    X_train_df = train_data.drop("Outcome", axis=1)
    y_test = test_data["Outcome"]
    X_test_df = test_data.drop("Outcome", axis=1)

    # Scaler is fitted on the entire dataset, then applied to both splits
    # (test data is scaled only to compute the accuracy metric).
    scaler = WrappedStandardScaler(copy=True, with_mean=True, with_std=True)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training; score the test split once (the original scored
    # it twice and discarded the first result).
    mlp = MLPClassifier(hidden_layer_sizes=(20, 20),
                        max_iter=1000,
                        random_state=RANDOM_SEED)
    mlp.fit(X_train, y_train)
    mlp_acc = mlp.score(X_test, y_test)
    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(mlp, scaler, "MLP", mlp_acc, "Basic MLP model", model_binary)
    print(mlp_acc)
    return f"model: {model_binary}"
def train(msg):
    """Train a logistic regression classifier on the German credit dataset."""
    # Seed both RNGs so repeated runs produce identical models.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    data_uri = msg.payload.get("$ref",
                               "./data/german_credit-decoded.csv")
    model_name = msg.payload.get("model_name")

    # Full dataset plus its pre-split train/test companion files.
    full_df = pd.read_csv(data_uri)
    train_df = pd.read_csv(data_uri.replace(".csv", "-train.csv"))
    test_df = pd.read_csv(data_uri.replace(".csv", "-test.csv"))

    # Separate the outcome column on every split.
    target = full_df["outcome"]
    features = full_df.drop("outcome", axis=1)
    y_train = train_df["outcome"]
    X_train_raw = train_df.drop("outcome", axis=1)
    y_test = test_df["outcome"]
    X_test_raw = test_df.drop("outcome", axis=1)

    # Encoder is fitted on the entire dataset, then applied to both splits
    # (the test split is encoded only to compute the accuracy metric).
    encoder = Encoder()
    encoder.fit(features)
    X_train = encoder.transform(X_train_raw)
    X_test = encoder.transform(X_test_raw)

    # Fit and score the classifier on raw ndarray views of the frames.
    classifier = LogisticRegression(random_state=RANDOM_SEED,
                                    solver="lbfgs",
                                    max_iter=1000)
    classifier.fit(X_train.values, y_train.values)
    accuracy = classifier.score(X_test.values, y_test.values)
    model_binary = f"models/{model_name}.pkl"
    pickle_model(classifier, encoder, "LR", accuracy,
                 "Logistic Regression Classifier", model_binary)
    print(accuracy)
    return f"model: {model_binary}"
Example #11
0
def train(msg):
    """Train a random forest classifier on the heart disease dataset.

    Reads the full dataset plus its pre-split train/test companion files,
    fits a categorical encoder on all features, trains the forest, and
    pickles model + encoder with its test accuracy.

    Args:
        msg: message whose ``payload`` maps "$ref" to the CSV path and
            "model_name" to the saved model's name.

    Returns:
        A string naming the pickled model file.
    """
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get(
        "$ref", "./data/heart_disease_multiclass-prepped.csv")
    save_model_as = msg.payload.get("model_name")

    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features on each split.
    X = data.drop("class_att", axis=1)
    y_train = train_data["class_att"]
    X_train_df = train_data.drop("class_att", axis=1)
    y_test = test_data["class_att"]
    X_test_df = test_data.drop("class_att", axis=1)

    # create encoder on entire dataset, then apply to both splits
    # (test data is encoded only to compute the accuracy metric)
    scaler = CategoricalEncoder(X)
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    # Fixed: pass random_state so training is deterministic, matching the
    # seeding above and every other estimator in this file.
    rf = RandomForestClassifier(random_state=RANDOM_SEED)
    rf.fit(X_train, y_train)
    rf_acc = rf.score(X_test, y_test)
    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(rf, scaler, 'RF', rf_acc, 'Random Forest Classifier',
                 model_binary)
    print(rf_acc)
    return f"model: {model_binary}"
Example #12
0
def train(msg):
    """Train a logistic regression classifier on the heart disease dataset."""
    # Seed both RNGs so repeated runs produce identical models.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    data_uri = msg.payload.get(
        "$ref", "./data/heart_disease_multiclass-prepped.csv")
    model_name = msg.payload.get("model_name")

    # Full dataset plus its pre-split train/test companion files.
    full_df = pd.read_csv(data_uri)
    train_df = pd.read_csv(data_uri.replace(".csv", "-train.csv"))
    test_df = pd.read_csv(data_uri.replace(".csv", "-test.csv"))

    # Separate the outcome column on every split.
    target = full_df["class_att"]
    features = full_df.drop("class_att", axis=1)
    y_train = train_df["class_att"]
    X_train_raw = train_df.drop("class_att", axis=1)
    y_test = test_df["class_att"]
    X_test_raw = test_df.drop("class_att", axis=1)

    # Encoder is fitted on the entire dataset, then applied to both splits
    # (the test split is encoded only to compute the accuracy metric).
    encoder = CategoricalEncoder(features)
    encoder.fit(features)
    X_train = encoder.transform(X_train_raw)
    X_test = encoder.transform(X_test_raw)

    # Fit and score the classifier on raw ndarray views of the frames.
    classifier = LogisticRegression(random_state=RANDOM_SEED, solver='lbfgs')
    classifier.fit(X_train.values, y_train.values)
    accuracy = classifier.score(X_test.values, y_test.values)
    model_binary = f"models/{model_name}.pkl"
    pickle_model(classifier, encoder, 'LR', accuracy,
                 'Logistic Regression Classifier', model_binary)
    print(accuracy)
    return f"model: {model_binary}"
def train(msg):
    """Train an SVM classifier on the German credit dataset."""
    # Seed both RNGs so repeated runs produce identical models.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    data_uri = msg.payload.get("$ref",
                               "./data/german_credit-decoded.csv")
    model_name = msg.payload.get("model_name")

    # Full dataset plus its pre-split train/test companion files.
    full_df = pd.read_csv(data_uri)
    train_df = pd.read_csv(data_uri.replace(".csv", "-train.csv"))
    test_df = pd.read_csv(data_uri.replace(".csv", "-test.csv"))

    # Separate the outcome column on every split.
    target = full_df["outcome"]
    features = full_df.drop("outcome", axis=1)
    y_train = train_df["outcome"]
    X_train_raw = train_df.drop("outcome", axis=1)
    y_test = test_df["outcome"]
    X_test_raw = test_df.drop("outcome", axis=1)

    # Encoder is fitted on the entire dataset, then applied to both splits
    # (the test split is encoded only to compute the accuracy metric).
    encoder = Encoder()
    encoder.fit(features)
    X_train = encoder.transform(X_train_raw)
    X_test = encoder.transform(X_test_raw)

    # Probability-capable SVC; fit and score on raw ndarray views.
    classifier = svm.SVC(gamma="scale", random_state=RANDOM_SEED,
                         probability=True)
    classifier.fit(X_train.values, y_train.values)
    accuracy = classifier.score(X_test.values, y_test.values)
    model_binary = f"models/{model_name}.pkl"
    pickle_model(classifier, encoder, "SVM", accuracy, "Basic SVM model",
                 model_binary)
    print(accuracy)
    return f"model: {model_binary}"
Example #14
0
def train(msg):
    """Train an L2-regularized linear model on the auto insurance claims data.

    Reads the full dataset plus its pre-split train/test companion files,
    fits a categorical encoder on all features, trains a ridge regression,
    scores it with R^2 on the test split, and pickles model + encoder.

    Args:
        msg: message whose ``payload`` maps "$ref" to the training CSV path
            and "model_name" to the name used for the saved binary.

    Returns:
        A string naming the pickled model file.
    """
    # for reproducible training
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    training_data_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv")
    save_model_as = msg.payload.get("model_name")

    data = pd.read_csv(training_data_uri)
    train_data = pd.read_csv(training_data_uri.replace(".csv", "-train.csv"))
    test_data = pd.read_csv(training_data_uri.replace(".csv", "-test.csv"))

    # Separate outcome from features on each split.
    X = data.drop("Total Claim Amount", axis=1)
    y_train = train_data["Total Claim Amount"]
    X_train_df = train_data.drop("Total Claim Amount", axis=1)
    y_test = test_data["Total Claim Amount"]
    X_test_df = test_data.drop("Total Claim Amount", axis=1)

    # create encoder on entire dataset, then apply to both splits
    # (test data is encoded only to compute the metric)
    scaler = CategoricalEncoder()
    scaler.fit(X)
    X_train = scaler.transform(X_train_df)
    X_test = scaler.transform(X_test_df)

    # start model training
    # Fixed: dropped `normalize=False` — it was the default value and the
    # parameter was removed in scikit-learn 1.2, so passing it breaks on
    # current versions.
    lmodel_l2 = Ridge(
        alpha=0.1,
        copy_X=True,
        fit_intercept=True,
        max_iter=1000,
        random_state=RANDOM_SEED,
        tol=0.0001,
    )
    lmodel_l2.fit(X_train, y_train)
    y_pred = lmodel_l2.predict(X_test)

    # R^2 on the held-out split. (The original also computed a raw
    # sum-of-squared-errors that was never used; removed.)
    err = r2_score(y_test, y_pred)

    model_binary = f"models/{save_model_as}.pkl"
    pickle_model(
        lmodel_l2,
        scaler,
        "Linear L2",
        err,
        "Linear regression with L2 regularization",
        model_binary,
    )
    print(err)
    return f"model: {model_binary}"
Example #15
0
def train(msg):
    """Train a dense feed-forward regressor on the auto insurance claims data."""
    # Seed Python, NumPy and the DL framework RNGs for reproducible runs.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    set_random_seed(RANDOM_SEED)

    data_uri = msg.payload.get(
        "$ref", "./data/auto_insurance_claims_dataset.csv")
    model_name = msg.payload.get("model_name")

    # Full dataset plus its pre-split train/test companion files.
    full_df = pd.read_csv(data_uri)
    train_df = pd.read_csv(data_uri.replace(".csv", "-train.csv"))
    test_df = pd.read_csv(data_uri.replace(".csv", "-test.csv"))

    # Separate the outcome column on every split.
    target = full_df["Total Claim Amount"]
    features = full_df.drop("Total Claim Amount", axis=1)
    y_train = train_df["Total Claim Amount"]
    X_train_raw = train_df.drop("Total Claim Amount", axis=1)
    y_test = test_df["Total Claim Amount"]
    X_test_raw = test_df.drop("Total Claim Amount", axis=1)

    # Encoder is fitted on the entire dataset, then applied to both splits
    # (the test split is encoded only to compute the metric).
    encoder = CategoricalEncoder()
    encoder.fit(features)
    X_train = encoder.transform(X_train_raw)
    X_test = encoder.transform(X_test_raw)

    # Four dense layers tapering down to a single regression output.
    net = Sequential()
    net.add(Dense(1000, input_dim=X_train.shape[1], activation="relu"))
    net.add(Dense(200, activation="relu"))
    net.add(Dense(50, activation="relu"))
    net.add(Dense(1))
    net.summary()

    net.compile(loss="mse", optimizer="adam", metrics=["mse", "mae"])
    net.fit(X_train, y_train, epochs=500, batch_size=300, verbose=0)

    # R^2 on the held-out split.
    score = r2_score(y_test, net.predict(X_test))

    model_binary = f"models/{model_name}.pkl"
    pickle_model(
        NNPredictWrapper(net),
        encoder,
        "Neural Network",
        score,
        "Four-layer neural network",
        model_binary,
    )
    print(score)
    return f"model: {model_binary}"