def test_fit_and_predict(self):
    metric = Metric({"name": "logloss"})
    automl = AutoML(
        total_time_limit=5,
        algorithms=["Xgboost"],
        start_random_models=5,
        hill_climbing_steps=0,
        seed=13,
    )
    automl.fit(self.X, self.y)
    y_predicted = automl.predict(self.X)["p_1"]
    self.assertTrue(y_predicted is not None)
    loss = metric(self.y, y_predicted)
    self.assertTrue(loss < 0.7)

    # serialize the fitted AutoML and restore it into a fresh instance
    params = automl.to_json()
    automl2 = AutoML()
    automl2.from_json(params)

    y_predicted2 = automl2.predict(self.X)["p_1"]
    self.assertTrue(y_predicted2 is not None)
    loss2 = metric(self.y, y_predicted2)
    self.assertTrue(loss2 < 0.7)

    # the restored model should use the same decision threshold
    assert_almost_equal(automl._threshold, automl2._threshold)
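# The test above relies on self.X / self.y fixtures defined elsewhere in
# the test class. A minimal sketch of such a setup, assuming a small
# synthetic binary-classification dataset (class name, shapes, and column
# names here are hypothetical):
import unittest

import numpy as np
import pandas as pd


class AutoMLTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        rows = 100
        cls.X = pd.DataFrame(
            np.random.uniform(size=(rows, 4)),
            columns=[f"f{i}" for i in range(4)],
        )
        cls.y = np.random.randint(0, 2, rows)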
def run(dataset, config):
    log.info("\n**** mljar-supervised ****\n")

    column_names, _ = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    X_train = pd.DataFrame(dataset.train.X, columns=column_names).astype(
        column_types, copy=False
    )
    X_test = pd.DataFrame(dataset.test.X, columns=column_names).astype(
        column_types, copy=False
    )
    y_train = dataset.train.y.flatten()
    y_test = dataset.test.y.flatten()

    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    is_classification = config.type == "classification"
    # if ml_task is None, AutoML will guess the ML task from the data
    ml_task = problem_mapping.get(dataset.problem_type)

    results_path = output_subdir("results", config)
    training_params = {
        k: v for k, v in config.framework_params.items() if not k.startswith("_")
    }

    automl = AutoML(
        results_path=results_path,
        total_time_limit=config.max_runtime_seconds,
        seed=config.seed,
        ml_task=ml_task,
        **training_params,
    )

    with Timer() as training:
        automl.fit(X_train, y_train)

    preds = automl.predict(X_test)

    predictions, probabilities = None, None
    if is_classification:
        predictions = preds["label"].values
        probabilities = preds[preds.columns[:-1]].values
    else:
        predictions = preds["prediction"].values

    # clean up the results unless artifacts were requested
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)

    return result(
        output_file=config.output_predictions_file,
        predictions=predictions,
        truth=y_test,
        probabilities=probabilities,
        models_count=len(automl._models),
        training_duration=training.duration,
    )
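# In the automlbenchmark integration, run() is invoked by the harness; a
# minimal sketch of the usual module entry point, assuming the standard
# frameworks.shared.callee helper is importable in this environment:
from frameworks.shared.callee import call_run

if __name__ == "__main__":
    call_run(run)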
def test_fit_and_predict(self):
    seed = 1709
    df = pd.read_csv(
        "./tests/data/housing_regression_missing_values_missing_target.csv"
    )
    print(df.columns)
    x_cols = [c for c in df.columns if c != "MEDV"]
    X = df[x_cols]
    y = df["MEDV"]
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=0.3, random_state=seed
    )
    automl = AutoML(
        total_time_limit=10,
        algorithms=["Xgboost"],  # ["LightGBM", "RF", "NN", "CatBoost", "Xgboost"]
        start_random_models=1,
        hill_climbing_steps=0,
        top_models_to_improve=0,
        train_ensemble=True,
        verbose=True,
    )
    automl.fit(X_train, y_train)
    response = automl.predict(X_test)  # ["p_1"]
    print("Response", response)
def test_fit_and_predict(self):
    seed = 1706 + 1
    for dataset_id in [31]:  # other candidate ids: 44, 720, 737
        df = pd.read_csv("./tests/data/data/{0}.csv".format(dataset_id))
        x_cols = [c for c in df.columns if c != "target"]
        X = df[x_cols]
        y = df["target"]
        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X, y, test_size=0.3, random_state=seed
        )
        automl = AutoML(
            total_time_limit=60 * 6000,
            algorithms=["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
            start_random_models=10,
            hill_climbing_steps=3,
            top_models_to_improve=3,
            train_ensemble=True,
            verbose=True,
        )
        automl.fit(X_train, y_train)
        response = automl.predict(X_test)

        # compute the log loss on the test dataset
        ll = log_loss(y_test, response)
        print("(*) Dataset id {} logloss {}".format(dataset_id, ll))

        # also report the log loss of each individual model
        for i, m in enumerate(automl._models):
            response = m.predict(X_test)
            ll = log_loss(y_test, response)
            print("{}) Dataset id {} logloss {}".format(i, dataset_id, ll))
def test_fit_and_predict(self):
    for dataset_id in [3, 24, 31, 38, 44, 179, 737, 720]:
        df = pd.read_csv("./tests/data/{0}.csv".format(dataset_id))
        x_cols = [c for c in df.columns if c != "target"]
        X = df[x_cols]
        y = df["target"]
        for repeat in range(1):
            X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
                X, y, test_size=0.3, random_state=1706 + repeat
            )
            automl = AutoML(
                total_time_limit=60 * 1,  # 1 minute limit
                algorithms=["Xgboost"],  # ["LightGBM", "CatBoost", "Xgboost", "RF", "NN"]
                start_random_models=3,
                hill_climbing_steps=1,
                top_models_to_improve=1,
                train_ensemble=True,
                verbose=True,
            )
            automl.fit(X_train, y_train)
            response = automl.predict(X_test)["p_1"]
            labels = automl.predict(X_test)["label"]

            # compute log loss and F1 on the test dataset
            ll = log_loss(y_test, response)
            f1 = f1_score(y_test, labels)
            print(
                "iter: {}) id:{} logloss:{} f1:{} time:{}".format(
                    repeat, dataset_id, ll, f1, automl._fit_time
                )
            )
            with open("./result.txt", "a") as f_result:
                f_result.write(
                    "{} {} {} {} {}\n".format(
                        repeat, dataset_id, ll, f1, automl._fit_time
                    )
                )
def test_predict_labels(self):
    df = pd.read_csv("tests/data/adult_missing_values_missing_target_500rows.csv")
    X = df[df.columns[:-1]]
    y = df[df.columns[-1]]
    automl = AutoML(
        total_time_limit=15,
        algorithms=["Xgboost"],
        start_random_models=5,
        hill_climbing_steps=0,
        train_ensemble=True,
    )
    automl.fit(X, y)
    y_predicted = automl.predict(X)
    self.assertTrue("A" in np.unique(y_predicted["label"]))
    self.assertTrue("B" in np.unique(y_predicted["label"]))
def train_titanic(train_data):
    train_df = pd.read_csv(train_data)
    # test_df = pd.read_csv(test_data)
    # feature_cols = train_df.drop(['Survived', 'PassengerId', 'Name'], axis=1).columns
    feature_cols = train_df.columns[2:]
    target_col = "Survived"
    X_train, X_test, y_train, y_test = train_test_split(
        train_df[feature_cols], train_df[target_col], test_size=0.25
    )

    automl = AutoML(results_path="AutoML_titanic")
    automl.fit(X_train, y_train)

    predictions = automl.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, predictions) * 100.0:.2f}%")
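# A hypothetical invocation of the helper above, assuming the Titanic CSV
# used elsewhere in these tests is available locally:
if __name__ == "__main__":
    train_titanic("tests/data/Titanic/train.csv")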
def test_reproduce_fit(self):
    metric = Metric({"name": "logloss"})
    losses = []
    for i in range(2):
        automl = AutoML(
            # the time limit should be large enough not to interrupt the training
            total_time_limit=10000,
            algorithms=["Xgboost"],
            start_random_models=2,
            hill_climbing_steps=1,
            train_ensemble=True,
            verbose=True,
            seed=12,
        )
        automl.fit(self.X, self.y)
        y_predicted = automl.predict(self.X)["p_1"]
        loss = metric(self.y, y_predicted)
        losses += [loss]
    # two runs with the same seed should produce (nearly) identical losses
    assert_almost_equal(losses[0], losses[1], decimal=4)
import pandas as pd

# scikit-learn utilities
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# mljar-supervised package
from supervised.automl import AutoML

# load the data
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(digits.data), digits.target, stratify=digits.target, test_size=0.25
)

# train models
automl = AutoML(mode="Perform")
automl.fit(X_train, y_train)

# compute the accuracy on test data
predictions = automl.predict(X_test)
print(predictions.head())
print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int)))
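# The predictions DataFrame also carries per-class probability columns
# alongside "label" (the binary tests above read "p_1", so names like
# "p_0" ... "p_9" are assumed here); a sketch of scoring with log loss:
from sklearn.metrics import log_loss

proba_cols = [c for c in predictions.columns if c.startswith("p_")]
print("Test log loss:", log_loss(y_test, predictions[proba_cols].values))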
automl = AutoML(mode="Explain") automl.fit(X, y) pred = automl.predict(X) print("Train accuracy", accuracy_score(y, pred)) test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv") pred = automl.predict(test) print("Test accuracy", accuracy_score(test["Survived"], pred)) ''' import pandas as pd import numpy as np from sklearn.metrics import accuracy_score from supervised import AutoML train = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv") print(train.head()) X = train[train.columns[2:]] y = train["Survived"] #automl = AutoML(mode="Compete") # default mode is Explain automl = AutoML(algorithms=["Decision Tree"]) # default mode is Explain automl.fit(X, y) test = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv") predictions = automl.predict(test) print(predictions) print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%" )
import warnings

# the head of this call was cut off in the original; "ignore" is assumed
warnings.filterwarnings(
    "ignore", category=pd.core.common.SettingWithCopyWarning
)  # message="*ndarray*"

# df = pd.read_csv("tests/data/iris_classes_missing_values_missing_target.csv")
df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv")
X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
y = df["class"]

automl = AutoML(
    # results_path="AutoML_41",
    # algorithms=["CatBoost"],
    # algorithms=["Neural Network", "Linear", "Xgboost", "Random Forest"],
    # total_time_limit=100,
    # tuning_mode="Normal",
    # explain_level=0,
    mode="Perform",
)
# automl.set_advanced(start_random_models=1)
automl.fit(X, y)

predictions = automl.predict(X)
print(predictions.head())
print(predictions.tail())
print(X.shape)
print(predictions.shape)
# fragment: inside a loop over i that measures the hit-rate of long
# returns within stoch20 buckets (k1 is the training DataFrame, k2 the
# test DataFrame; both are assumed to be defined upstream)
if len(ratedf) == 0:
    continue
featvals_stoch.append("stoch20" + "(" + str(0.1 * (i + 1)) + ")")
ratedf = k1[(k1.stoch20 > i * 0.1) & (k1.stoch20 < (i + 1) * 0.1)]
featrate_stoch.append(len(ratedf[ratedf.rets_long == 1]) / len(ratedf))

k1 = k1.dropna()
feats = [
    'close_diff', 'gap1', 'rsi5', 'rsi5_smoothed', 'gap', 'stoch20',
    'stoch14', 'rsi14', 'rsi20', 'sine', 'bandpass', 'cci', 'decycle',
    'quadlead', 'velacc', 'VIX_Close', 'VIX_Close_diff', 'h',
    'rsi20_diff', 'rsi14_diff', 'stoch20_diff', 'res1', 'res2', 'res3',
    'res4', 'res5'
]
feats1 = feats
xtrain = np.array(k1[feats1])
ytrain = np.array(k1.rets_long)

tr = RandomForestClassifier(n_estimators=550, max_depth=6, min_samples_split=10)
clf = tr
# clf = AdaBoostClassifier(base_estimator=tr, n_estimators=80, random_state=50, learning_rate=1.0)

# note: the original snippet calls `automl` without defining it; a default
# AutoML instance is assumed here
automl = AutoML()
automl.fit(xtrain, ytrain)

k2['predictions'] = automl.predict(np.array(k2[feats1]))
k2['rand_preds'] = [random.choice([0, 1]) for _ in range(len(k2))]

print("accs:")
print(len(k2[k2.predictions == k2.rets1]) / len(k2))
print("rets original:")
print(k2.rets.sum())
print("rets model")
print((k2.predictions * k2.rets).sum())
df = pd.read_csv("./tests/data/housing_regression_missing_values_missing_target.csv")
x_cols = [c for c in df.columns if c != "MEDV"]
X = df[x_cols]
y = df["MEDV"]
print("y", y[:10])
print(X.shape)

automl = AutoML(
    # results_path="AutoML_43",
    # total_time_limit=100,
    # algorithms=["Linear", "Decision Tree", "Extra Trees"],
    # explain_level=0,
    # tuning_mode="Normal",
    mode="Explain",
    # train_ensemble=True,
)
# automl.set_advanced(start_random_models=1)
automl.fit(X, y)

df["predictions"] = automl.predict(X)
print("Predictions")
print(df[["MEDV", "predictions"]].head())
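# A quick sanity check on the in-sample fit, sketched with a standard
# sklearn metric; rows with a missing MEDV target (the dataset name
# indicates some are missing) are dropped before scoring:
from sklearn.metrics import mean_squared_error

scored = df.dropna(subset=["MEDV"])
print("Train MSE:", mean_squared_error(scored["MEDV"], scored["predictions"]))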
import os

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

from supervised.automl import AutoML

df = pd.read_csv("tests/data/Titanic/train.csv")
X = df[df.columns[2:]]
y = df["Survived"]

automl = AutoML(mode="Explain")
automl.fit(X, y)

pred = automl.predict(X)
print("Train accuracy", accuracy_score(y, pred))

test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
pred = automl.predict(test)
print("Test accuracy", accuracy_score(test["Survived"], pred))
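# The trained models are persisted under the results_path directory;
# constructing a new AutoML pointed at an existing results directory loads
# them back instead of retraining (directory name here is hypothetical):
reloaded = AutoML(results_path="AutoML_1")
print("Reloaded test accuracy",
      accuracy_score(test["Survived"], reloaded.predict(test)))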
# comparison fragment: `mlp`, the train/test splits, and the `results`
# list are assumed to be defined upstream; `mlp` is timed via the same
# `_start_time` attribute as `nn`, so it is presumably another AutoML
nn = AutoML(
    algorithms=["Neural Network"],
    mode="Perform",
    explain_level=0,
    train_ensemble=False,
    golden_features=False,
    features_selection=False,
    ml_task="regression",
)

mlp.fit(train_X, train_y)
mlp_time = np.round(time.time() - mlp._start_time, 2)

nn.fit(train_X, train_y)
nn_time = np.round(time.time() - nn._start_time, 2)

mlp_mse = mean_squared_error(test_y, mlp.predict(test_X))
nn_mse = mean_squared_error(test_y, nn.predict(test_X))
print(dataset, X.shape, np.unique(y), mlp_mse, nn_mse)

results += [{
    "dataset": dataset,
    "nrows": X.shape[0],
    "ncols": X.shape[1],
    "mlp_mse": mlp_mse,
    "nn_mse": nn_mse,
    "mlp_time": mlp_time,
    "nn_time": nn_time,
}]

with open("results_regression.json", "w") as fout:
    # the body of this block was cut off in the original; dumping the
    # collected results is the natural completion
    fout.write(json.dumps(results))
y = y.reshape(len(y),)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# model = Sequential([
#     Dense(input_shape=(x_train.shape[1],), units=1,
#           kernel_initializer=tf.constant_initializer(1),
#           bias_initializer=tf.constant_initializer(0),
#           kernel_constraint=MinMaxNorm(min_value=-1, max_value=1)),
#     tfpl.DistributionLambda(lambda t: tfd.Exponential(rate=t),
#                             convert_to_tensor_fn=tfd.Distribution.sample)
# ])
# model.compile(loss=nll, optimizer=RMSprop(learning_rate=0.01))
# model.fit(x_train, y_train, epochs=1000, verbose=True)

# tr = RandomForestRegressor(n_estimators=50, max_depth=6, min_samples_split=10)
# clf = tr
# clf = AdaBoostClassifier(base_estimator=tr, n_estimators=80, random_state=50, learning_rate=1.0)

# note: `automl` is used without being defined in the original snippet; a
# default AutoML instance is assumed here
automl = AutoML()
automl.fit(x_train, y_train)
# y_true goes first, per the sklearn argument convention
print("MAPE:", mean_absolute_percentage_error(y_test, automl.predict(x_test)))

dftt = pd.DataFrame()
dftt.loc[ind[0], cols[0]] = df[(df.doctype == 'SOW') & (df.qornot == 'Yes')
                               & (df.pages < sowmed)].tat.mean()
dftt.loc[ind[1], cols[0]] = df[(df.doctype == 'SOW') & (df.qornot == 'No')
                               & (df.pages < sowmed)].tat.mean()
dftt.loc[ind[2], cols[0]] = ttest(
    df[(df.doctype == 'SOW') & (df.qornot == 'Yes') & (df.pages < sowmed)].tat,
    df[(df.doctype == 'SOW') & (df.qornot == 'No') & (df.pages < sowmed)].tat)
dftt.loc[ind[0], cols[1]] = df[(df.doctype == 'SOW') & (df.qornot == 'Yes')
                               & (df.pages > sowmed)].tat.mean()
dftt.loc[ind[1], cols[1]] = df[(df.doctype == 'SOW') & (df.qornot == 'No')
                               & (df.pages > sowmed)].tat.mean()
dftt.loc[ind[2], cols[1]] = ttest(
    df[(df.doctype == 'SOW') & (df.qornot == 'Yes') & (df.pages < sowmed)].tat,
    # the second argument was cut off in the original; the large-document
    # counterpart is the natural completion
    df[(df.doctype == 'SOW') & (df.qornot == 'No') & (df.pages > sowmed)].tat)
from sklearn.metrics import accuracy_score

df = pd.read_csv("tests/data/Titanic/train.csv")
X = df[df.columns[2:]]
y = df["Survived"]

automl = AutoML(
    results_path="examples/AutoML_Titanic",
    total_time_limit=60 * 60,
    train_ensemble=True,
)
automl.fit(X, y)

pred = automl.predict(X)
print("Train accuracy", accuracy_score(y, pred["label"]))

test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
test_cols = [
    "Parch",
    "Ticket",
    "Fare",
    "Pclass",
    "Name",
    "Sex",
    "Age",
    "SibSp",
    "Cabin",
    "Embarked",
]
# the original snippet was cut off after the column list; the test-set
# scoring from the full variant of this example completes it
pred = automl.predict(test[test_cols])
print("Test accuracy", accuracy_score(test["Survived"], pred["label"]))
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

from supervised.automl import AutoML

# random features and random binary labels
X = np.random.rand(1000, 10)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])
y = np.random.randint(0, 2, 1000)

automl = AutoML(total_time_limit=1000)
automl.fit(X, y)
print("Train accuracy", accuracy_score(y, automl.predict(X)["label"]))

# fresh random data: the labels are independent of the features
X = np.random.rand(1000, 10)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])
y = np.random.randint(0, 2, 1000)
print("Test accuracy", accuracy_score(y, automl.predict(X)["label"]))
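# Since the labels are pure noise, the test accuracy above should hover
# around 0.5 no matter how well the training set was fit; the gap between
# the two printed accuracies is a simple overfitting check.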
import os

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from supervised.automl import AutoML

df = pd.read_csv("tests/data/PortugeseBankMarketing/Data_FinalProject.csv")
X = df[df.columns[:-1]]
y = df["y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)

automl = AutoML(
    # results_path="AutoML_22",
    total_time_limit=30 * 60,
    start_random_models=10,
    hill_climbing_steps=3,
    top_models_to_improve=3,
    train_ensemble=True,
)
automl.fit(X_train, y_train)

pred = automl.predict(X_test)
print("Test accuracy", accuracy_score(y_test, pred["label"]))
import os

from sklearn.metrics import accuracy_score

from supervised.automl import AutoML

df = pd.read_csv("tests/data/Titanic/train.csv")
X = df[df.columns[2:]]
y = df["Survived"]

automl = AutoML(
    results_path="examples/AutoML_Titanic",
    total_time_limit=60 * 60,
    start_random_models=10,
    hill_climbing_steps=3,
    top_models_to_improve=3,
    train_ensemble=True,
)
automl.fit(X, y)

pred = automl.predict(X)
print("Train accuracy", accuracy_score(y, pred["label"]))

test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
test_cols = [
    "Parch",
    "Ticket",
    "Fare",
    "Pclass",
    "Name",
    "Sex",
    "Age",
    "SibSp",
    "Cabin",
    "Embarked",
]
# predict on the test frame (the original indexed `df` here, which would
# have scored the training data against the test labels)
pred = automl.predict(test[test_cols])
print("Test accuracy", accuracy_score(test["Survived"], pred["label"]))