Ejemplo n.º 1
0
    def test_fit_and_predict(self):
        """Train an AutoML model, round-trip it through JSON, and check
        that predictions and the decision threshold survive the trip."""
        logloss = Metric({"name": "logloss"})

        model = AutoML(
            total_time_limit=5,
            algorithms=["Xgboost"],
            start_random_models=5,
            hill_climbing_steps=0,
            seed=13,
        )
        model.fit(self.X, self.y)

        # The freshly trained model must produce usable probabilities.
        preds = model.predict(self.X)["p_1"]
        self.assertTrue(preds is not None)
        loss = logloss(self.y, preds)
        self.assertTrue(loss < 0.7)

        # Serialize and restore into a brand-new AutoML instance.
        restored = AutoML()
        restored.from_json(model.to_json())

        preds_restored = restored.predict(self.X)["p_1"]
        self.assertTrue(preds_restored is not None)
        loss_restored = logloss(self.y, preds_restored)
        self.assertTrue(loss_restored < 0.7)

        # The classification threshold must also survive the round trip.
        assert_almost_equal(model._threshold, restored._threshold)
Ejemplo n.º 2
0
def run(dataset, config):
    """Benchmark entry point: train mljar-supervised AutoML on *dataset*
    under the limits in *config* and return the framework result record."""
    log.info("\n**** mljar-supervised ****\n")

    # Rebuild the train/test frames with the original column names/dtypes.
    names, _ = zip(*dataset.columns)
    dtypes = dict(dataset.columns)
    X_train = pd.DataFrame(dataset.train.X, columns=names).astype(dtypes,
                                                                  copy=False)
    X_test = pd.DataFrame(dataset.test.X, columns=names).astype(dtypes,
                                                                copy=False)

    y_train = dataset.train.y.flatten()
    y_test = dataset.test.y.flatten()

    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    is_classification = config.type == "classification"
    # None here means the problem type is unknown and AutoML will guess
    # the ML task from the data itself.
    ml_task = problem_mapping.get(dataset.problem_type)
    results_path = output_subdir("results", config)
    # Keys prefixed with "_" are benchmark-internal flags, not AutoML params.
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith("_")
    }

    automl = AutoML(results_path=results_path,
                    total_time_limit=config.max_runtime_seconds,
                    seed=config.seed,
                    ml_task=ml_task,
                    **training_params)

    with Timer() as training:
        automl.fit(X_train, y_train)

    preds = automl.predict(X_test)

    if is_classification:
        predictions = preds["label"].values
        # Every column except the trailing "label" holds class probabilities.
        probabilities = preds[preds.columns[:-1]].values
    else:
        predictions = preds["prediction"].values
        probabilities = None

    # Remove the results directory unless artifacts were explicitly requested.
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)

    return result(
        output_file=config.output_predictions_file,
        predictions=predictions,
        truth=y_test,
        probabilities=probabilities,
        models_count=len(automl._models),
        training_duration=training.duration,
    )
    def test_fit_and_predict(self):
        """Smoke test: regression on housing data with missing values
        in both features and target."""
        seed = 1709

        df = pd.read_csv(
            "./tests/data/housing_regression_missing_values_missing_target.csv"
        )
        print(df.columns)
        # Everything except the target column "MEDV" is a feature.
        feature_cols = [c for c in df.columns if c != "MEDV"]
        X = df[feature_cols]
        y = df["MEDV"]

        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X, y, test_size=0.3, random_state=seed
        )
        # Single fast Xgboost model, no tuning beyond the initial draw.
        automl = AutoML(
            total_time_limit=10,
            algorithms=["Xgboost"],  # ["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
            start_random_models=1,
            hill_climbing_steps=0,
            top_models_to_improve=0,
            train_ensemble=True,
            verbose=True,
        )
        automl.fit(X_train, y_train)

        response = automl.predict(X_test)  # ["p_1"]
        print("Response", response)
    def test_fit_and_predict(self):
        """Train on selected datasets and report the logloss of the final
        AutoML prediction and of every individual trained model."""
        seed = 1706 + 1
        for dataset_id in [31]:  # 720 # 31,44,737
            frame = pd.read_csv("./tests/data/data/{0}.csv".format(dataset_id))
            feature_cols = [c for c in frame.columns if c != "target"]
            X = frame[feature_cols]
            y = frame["target"]

            X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
                X, y, test_size=0.3, random_state=seed
            )
            # Wide search: all algorithms, random start plus hill climbing.
            automl = AutoML(
                total_time_limit=60 * 6000,
                algorithms=["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
                start_random_models=10,
                hill_climbing_steps=3,
                top_models_to_improve=3,
                train_ensemble=True,
                verbose=True,
            )
            automl.fit(X_train, y_train)

            # Logloss of the full AutoML prediction on the held-out split.
            ll = log_loss(y_test, automl.predict(X_test))
            print("(*) Dataset id {} logloss {}".format(dataset_id, ll))

            # Logloss of each underlying model, for comparison.
            for i, model in enumerate(automl._models):
                ll = log_loss(y_test, model.predict(X_test))
                print("{}) Dataset id {} logloss {}".format(i, dataset_id, ll))
    def test_fit_and_predict(self):
        """Run AutoML over several binary datasets and append logloss and
        F1 scores to ./result.txt."""
        for dataset_id in [3, 24, 31, 38, 44, 179, 737, 720]:
            frame = pd.read_csv("./tests/data/{0}.csv".format(dataset_id))
            feature_cols = [c for c in frame.columns if c != "target"]
            X = frame[feature_cols]
            y = frame["target"]

            for repeat in range(1):
                X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
                    X, y, test_size=0.3, random_state=1706 + repeat
                )
                automl = AutoML(
                    total_time_limit=60 * 1,  # 1h limit
                    algorithms=["Xgboost"],  # ["LightGBM", "CatBoost", "Xgboost", "RF", "NN"],
                    start_random_models=3,
                    hill_climbing_steps=1,
                    top_models_to_improve=1,
                    train_ensemble=True,
                    verbose=True,
                )
                automl.fit(X_train, y_train)

                # Positive-class probability and hard labels for scoring.
                probs = automl.predict(X_test)["p_1"]
                labels = automl.predict(X_test)["label"]

                ll = log_loss(y_test, probs)
                f1 = f1_score(y_test, labels)
                print("iter: {}) id:{} logloss:{} f1:{} time:{}".format(
                    repeat, dataset_id, ll, f1, automl._fit_time))
                # Append one result row per (repeat, dataset) pair.
                with open("./result.txt", "a") as f_result:
                    f_result.write("{} {} {} {} {}\n".format(
                        repeat, dataset_id, ll, f1, automl._fit_time))
Ejemplo n.º 6
0
    def test_predict_labels(self):
        """The predicted labels must contain both original classes 'A' and 'B'."""
        df = pd.read_csv(
            'tests/data/adult_missing_values_missing_target_500rows.csv')
        # All columns but the last are features; the last is the target.
        X = df[df.columns[:-1]]
        y = df[df.columns[-1]]
        automl = AutoML(total_time_limit=15,
                        algorithms=["Xgboost"],
                        start_random_models=5,
                        hill_climbing_steps=0,
                        train_ensemble=True)
        automl.fit(X, y)

        predicted_labels = np.unique(automl.predict(X)['label'])
        self.assertTrue('A' in predicted_labels)
        self.assertTrue('B' in predicted_labels)
Ejemplo n.º 7
0
def train_titanic(train_data):
    """Train an AutoML model on the Titanic CSV at *train_data* and print
    the accuracy on a 25% hold-out split."""
    train_df = pd.read_csv(train_data)

    # Features are all columns after the first two; target is 'Survived'.
    # (Presumably the skipped columns are an id and the target — see the
    # commented drop() variant in the original example.)
    feature_cols = train_df.columns[2:]
    target_cols = 'Survived'

    X_train, X_test, y_train, y_test = train_test_split(
        train_df[feature_cols],
        train_df[target_cols],
        test_size=0.25,
    )

    automl = AutoML(results_path="AutoML_titanic")
    automl.fit(X_train, y_train)

    predictions = automl.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, predictions) * 100.0:.2f}%")
Ejemplo n.º 8
0
 def test_reproduce_fit(self):
     """Fitting twice with the same seed must reach (almost) the same logloss."""
     metric = Metric({"name": "logloss"})
     losses = []
     # Train two AutoML instances with identical settings and seed.
     for i in range(2):
         automl = AutoML(
             total_time_limit=
             10000,  # the time limit should be big enough to not interrupt the training
             algorithms=["Xgboost"],
             start_random_models=2,
             hill_climbing_steps=1,
             train_ensemble=True,
             verbose=True,
             seed=12,
         )
         automl.fit(self.X, self.y)
         y_predicted = automl.predict(self.X)["p_1"]
         loss = metric(self.y, y_predicted)
         losses += [loss]
     # Reproducibility check: both runs must agree to 4 decimal places.
     assert_almost_equal(losses[0], losses[1], decimal=4)
Ejemplo n.º 9
0
import pandas as pd

# scikit learn utilites
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# mljar-supervised package
from supervised.automl import AutoML

# Load the handwritten-digits dataset and make a stratified 75/25 split.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(digits.data),
    digits.target,
    stratify=digits.target,
    test_size=0.25,
)

# Train models in "Perform" mode.
automl = AutoML(mode="Perform")
automl.fit(X_train, y_train)

# Evaluate on the held-out split; labels come back as strings, so cast.
predictions = automl.predict(X_test)
print(predictions.head())
print("Test accuracy:",
      accuracy_score(y_test, predictions["label"].astype(int)))
Ejemplo n.º 10
0
automl = AutoML(mode="Explain")
automl.fit(X, y)
pred = automl.predict(X)

print("Train accuracy", accuracy_score(y, pred))
test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
pred = automl.predict(test)
print("Test accuracy", accuracy_score(test["Survived"], pred))
'''

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from supervised import AutoML

# Fetch the Titanic training split from the public datasets repo.
train = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv")
print(train.head())

# Features are everything after the first two columns; target is Survived.
X = train[train.columns[2:]]
y = train["Survived"]

#automl = AutoML(mode="Compete") # default mode is Explain
automl = AutoML(algorithms=["Decision Tree"]) # default mode is Explain

automl.fit(X, y)

# Score on the test split, which also carries the Survived column.
test = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv")
predictions = automl.predict(test)
print(predictions)
print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%" )
    category=pd.core.common.SettingWithCopyWarning)  # message="*ndarray*")

# df = pd.read_csv("tests/data/iris_classes_missing_values_missing_target.csv")
df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv")
feature_cols = ["feature_1", "feature_2", "feature_3", "feature_4"]
X = df[feature_cols]
y = df["class"]

# Multiclass classification on iris data with missing values, "Perform" mode.
automl = AutoML(mode="Perform")
automl.fit(X, y)

predictions = automl.predict(X)

# Inspect the first and last predictions, then confirm row counts line up.
print(predictions.head())
print(predictions.tail())

print(X.shape)
print(predictions.shape)
Ejemplo n.º 12
0
    if (len(ratedf) == 0):
        continue
    featvals_stoch.append("stoch20" + "(" + str(0.1 * (i + 1)) + ")")
    ratedf = k1[(k1.stoch20 > i * 0.1) & (k1.stoch20 < (i + 1) * 0.1)]
    featrate_stoch.append(len(ratedf[ratedf.rets_long == 1]) / (len(ratedf)))

# Drop rows with missing indicator values before training.
k1 = k1.dropna()
# Technical-indicator feature columns used as model inputs.
feats = [
    'close_diff', 'gap1', 'rsi5', 'rsi5_smoothed', 'gap', 'stoch20', 'stoch14',
    'rsi14', 'rsi20', 'sine', 'bandpass', 'cci', 'decycle', 'quadlead',
    'velacc', 'VIX_Close', 'VIX_Close_diff', 'h', 'rsi20_diff', 'rsi14_diff',
    'stoch20_diff', 'res1', 'res2', 'res3', 'res4', 'res5'
]
feats1 = feats
xtrain = np.array(k1[feats1])
ytrain = np.array(k1.rets_long)
# NOTE(review): tr/clf are constructed but never used below — the AutoML
# instance (defined elsewhere) is what gets fit. Dead code candidates.
tr = RandomForestClassifier(n_estimators=550,
                            max_depth=6,
                            min_samples_split=10)
clf = tr
#clf = AdaBoostClassifier(base_estimator=tr,n_estimators=80,random_state=50,learning_rate=1.0)
automl.fit(xtrain, ytrain)
# Predict on the evaluation frame k2 and add a random 0/1 baseline column.
k2['predictions'] = automl.predict(np.array(k2[feats1]))
k2['rand_preds'] = [random.choice([0, 1]) for _ in range(len(k2))]
print("accs:")
# Fraction of rows where the prediction matches the realized label rets1.
print(len(k2[k2.predictions == k2.rets1]) / (len(k2)))
print("rets original:")
print(k2.rets.sum())
print("rets model")
# Returns realized only on rows the model flagged (assumes 0/1 predictions
# — TODO confirm the predict output is binary here).
print((k2.predictions * k2.rets).sum())
Ejemplo n.º 13
0
df = pd.read_csv(
    "./tests/data/housing_regression_missing_values_missing_target.csv")
# Everything except the target column "MEDV" is a feature.
feature_cols = [c for c in df.columns if c != "MEDV"]
X = df[feature_cols]
y = df["MEDV"]

print("y", y[:10])

print(X.shape)

# Regression on housing data in "Explain" mode.
automl = AutoML(mode='Explain')
automl.fit(X, y)

# Attach predictions next to the target for a quick visual check.
df["predictions"] = automl.predict(X)
print("Predictions")
print(df[["MEDV", "predictions"]].head())
Ejemplo n.º 14
0
import pandas as pd
import numpy as np
from supervised.automl import AutoML
import os

from sklearn.metrics import accuracy_score

# Titanic training data: features after the first two columns, target Survived.
df = pd.read_csv("tests/data/Titanic/train.csv")

X = df[df.columns[2:]]
y = df["Survived"]

# Train in "Explain" mode and report train accuracy.
automl = AutoML(mode="Explain")
automl.fit(X, y)
pred = automl.predict(X)

print("Train accuracy", accuracy_score(y, pred))

# Evaluate on the test file that also carries the Survived column.
test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
pred = automl.predict(test)
print("Test accuracy", accuracy_score(test["Survived"], pred))
Ejemplo n.º 15
0
    # mljar-supervised neural-network baseline: a single "Neural Network"
    # algorithm, regression task, with ensembling / golden features /
    # feature selection all disabled for a fair head-to-head comparison.
    nn = AutoML(
        algorithms=["Neural Network"],
        mode="Perform",
        explain_level=0,
        train_ensemble=False,
        golden_features=False,
        features_selection=False,
        ml_task="regression",
    )

    # Fit both models and record wall-clock training time.
    # NOTE(review): timing reads a private `_start_time` attribute on both
    # `mlp` (defined outside this fragment) and `nn` — confirm it exists.
    mlp.fit(train_X, train_y)
    mlp_time = np.round(time.time() - mlp._start_time, 2)
    nn.fit(train_X, train_y)
    nn_time = np.round(time.time() - nn._start_time, 2)

    # Hold-out MSE of each model on the same test split.
    mlp_mse = mean_squared_error(test_y, mlp.predict(test_X))
    nn_mse = mean_squared_error(test_y, nn.predict(test_X))

    print(dataset, X.shape, np.unique(y), mlp_mse, nn_mse)

    # Accumulate one result record per dataset.
    results += [{
        "dataset": dataset,
        "nrows": X.shape[0],
        "ncols": X.shape[1],
        "mlp_mse": mlp_mse,
        "nn_mse": nn_mse,
        "mlp_time": mlp_time,
        "nn_time": nn_time,
    }]

    with open("results_regression.json", "w") as fout:
Ejemplo n.º 16
0
# Flatten the target to 1-D before splitting.
y = y.reshape(len(y), )
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)
#model = Sequential([
#Dense(input_shape=(x_train.shape[1],), units=1,kernel_initializer=tf.constant_initializer(1),
#          bias_initializer=tf.constant_initializer(0),kernel_constraint=MinMaxNorm(min_value=-1,max_value=1)),
#    tfpl.DistributionLambda(lambda t:tfd.Exponential(rate=t),
#                           convert_to_tensor_fn=tfd.Distribution.sample)
#])
#model.compile(loss=nll,
#optimizer=RMSprop(learning_rate=0.01))
#model.fit(x_train, y_train, epochs=1000, verbose=True);
#tr = RandomForestRegressor(n_estimators=50,max_depth=6,min_samples_split=10)
#clf=tr
#clf = AdaBoostClassifier(base_estimator=tr,n_estimators=80,random_state=50,learning_rate=1.0)
# Fit the AutoML instance (defined outside this fragment) on the split.
automl.fit(x_train, y_train)
# Report mean absolute percentage error on the hold-out split.
# NOTE(review): arguments are passed as (predictions, truth); most metric
# APIs (e.g. sklearn's) expect (y_true, y_pred) — confirm the signature.
print("MAP: ", mean_absolute_percentage_error(automl.predict(x_test), y_test))
# Summary table: mean turnaround time (tat) of SOW documents, split by
# qornot (Yes/No) and by page count below/above the SOW median (sowmed).
dftt = pd.DataFrame()

# Column cols[0]: documents with fewer pages than the median.
dftt.loc[ind[0], cols[0]] = df[(df.doctype == 'SOW') & (df.qornot == 'Yes') &
                               (df.pages < sowmed)].tat.mean()
dftt.loc[ind[1], cols[0]] = df[(df.doctype == 'SOW') & (df.qornot == 'No') &
                               (df.pages < sowmed)].tat.mean()
# t-test between the Yes/No groups for the small-document column.
dftt.loc[ind[2], cols[0]] = ttest(
    df[(df.doctype == 'SOW') & (df.qornot == 'Yes') & (df.pages < sowmed)].tat,
    df[(df.doctype == 'SOW') & (df.qornot == 'No') & (df.pages < sowmed)].tat)
# Column cols[1]: documents with more pages than the median.
dftt.loc[ind[0], cols[1]] = df[(df.doctype == 'SOW') & (df.qornot == 'Yes') &
                               (df.pages > sowmed)].tat.mean()
dftt.loc[ind[1], cols[1]] = df[(df.doctype == 'SOW') & (df.qornot == 'No') &
                               (df.pages > sowmed)].tat.mean()
dftt.loc[ind[2], cols[1]] = ttest(
    df[(df.doctype == 'SOW') & (df.qornot == 'Yes') & (df.pages < sowmed)].tat,
Ejemplo n.º 17
0
from sklearn.metrics import accuracy_score

# Titanic training data: features after the first two columns, target Survived.
df = pd.read_csv("tests/data/Titanic/train.csv")

X = df[df.columns[2:]]
y = df["Survived"]

# One-hour budget with ensembling enabled; artifacts go to a fixed path.
automl = AutoML(
    results_path="examples/AutoML_Titanic",
    total_time_limit=60 * 60,
    train_ensemble=True,
)

automl.fit(X, y)

pred = automl.predict(X)

print("Train accuracy", accuracy_score(y, pred["label"]))

test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
test_cols = [
    "Parch",
    "Ticket",
    "Fare",
    "Pclass",
    "Name",
    "Sex",
    "Age",
    "SibSp",
    "Cabin",
    "Embarked",
Ejemplo n.º 18
0
import numpy as np
import pandas as pd
from supervised.automl import AutoML
from sklearn.metrics import accuracy_score
import os

# Train on pure noise: 1000 rows, 10 random features, random binary target.
X = pd.DataFrame(np.random.rand(1000, 10),
                 columns=[f"f{i}" for i in range(10)])
y = np.random.randint(0, 2, 1000)

automl = AutoML(total_time_limit=1000)
automl.fit(X, y)
print("Train accuracy", accuracy_score(y, automl.predict(X)["label"]))

# A fresh noise sample: test accuracy should hover around chance level.
X = pd.DataFrame(np.random.rand(1000, 10),
                 columns=[f"f{i}" for i in range(10)])
y = np.random.randint(0, 2, 1000)
print("Test accuracy", accuracy_score(y, automl.predict(X)["label"]))
Ejemplo n.º 19
0
import pandas as pd
from supervised.automl import AutoML
import os

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("tests/data/PortugeseBankMarketing/Data_FinalProject.csv")

# Last column "y" is the target; everything before it is a feature.
X = df[df.columns[:-1]]
y = df["y"]

# Stratified 75/25 split to keep class balance in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25
)

# 30-minute budget with a wide random search plus hill climbing.
automl = AutoML(
    total_time_limit=30 * 60,
    start_random_models=10,
    hill_climbing_steps=3,
    top_models_to_improve=3,
    train_ensemble=True,
)

automl.fit(X_train, y_train)

pred = automl.predict(X_test)
print("Test accuracy", accuracy_score(y_test, pred["label"]))
Ejemplo n.º 20
0
from supervised.automl import AutoML
import os

from sklearn.metrics import accuracy_score

# Titanic training data: features after the first two columns, target Survived.
df = pd.read_csv("tests/data/Titanic/train.csv")

X = df[df.columns[2:]]
y = df["Survived"]

# One-hour run with a wide random search, hill climbing and ensembling.
automl = AutoML(results_path="examples/AutoML_Titanic",
                total_time_limit=60 * 60,
                start_random_models=10,
                hill_climbing_steps=3,
                top_models_to_improve=3,
                train_ensemble=True)

automl.fit(X, y)

pred = automl.predict(X)

print("Train accuracy", accuracy_score(y, pred["label"]))

test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
test_cols = [
    "Parch", "Ticket", "Fare", "Pclass", "Name", "Sex", "Age", "SibSp",
    "Cabin", "Embarked"
]
# BUG FIX: predict on the TEST frame, not the training frame `df` —
# the original scored predictions for training rows against
# test["Survived"], comparing unrelated rows (and generally mismatched
# lengths, which makes accuracy_score fail).
pred = automl.predict(test[test_cols])
print("Test accuracy", accuracy_score(test["Survived"], pred["label"]))