def test_regression_missing_target(self):
    """Fit a regressor on a target with one missing entry, then predict.

    The second target value is replaced by ``None`` before fitting. The
    test passes when ``predict`` returns a NumPy array holding one value
    per input row.
    """
    feature_names = [f"f{i}" for i in range(3)]
    X = pd.DataFrame(np.random.rand(self.rows, 3), columns=feature_names)

    y = pd.Series(np.random.rand(self.rows), name="target")
    y.iloc[1] = None  # inject the missing target value

    automl_options = {
        "results_path": self.automl_dir,
        "total_time_limit": 1,
        "algorithms": ["Xgboost"],
        "train_ensemble": False,
        "explain_level": 0,
        "start_random_models": 1,
    }
    automl = AutoML(**automl_options)
    automl.fit(X, y)
    pred = automl.predict(X)

    self.assertIsInstance(pred, np.ndarray)
    self.assertEqual(len(pred), X.shape[0])
def test_multi_class_0123(self):
    """Train on integer labels 0..3 and sanity-check the predicted labels.

    The predictions must contain at least one of the labels 0, 1, 2, 3
    and no more than four distinct values.
    """
    n_samples = self.rows * 4
    feature_names = [f"f{i}" for i in range(3)]
    X = pd.DataFrame(np.random.rand(n_samples, 3), columns=feature_names)
    y = np.random.randint(0, 4, n_samples)

    automl_options = {
        "results_path": self.automl_dir,
        "total_time_limit": 1,
        "algorithms": ["Xgboost"],
        "train_ensemble": False,
        "explain_level": 0,
        "start_random_models": 1,
    }
    automl = AutoML(**automl_options)
    automl.fit(X, y)

    u = np.unique(automl.predict(X))
    self.assertTrue(any(label in u for label in (0, 1, 2, 3)))
    self.assertTrue(len(u) <= 4)
def test_multi_class_abcd_mixed_int(self):
    """Train on labels that mix an integer with strings and check predictions.

    The target is built from ``[1, "B", "CC", "d"]``. The predictions must
    contain at least one of the labels that actually occur in the target
    and no more than four distinct values.
    """
    n_samples = self.rows * 4
    feature_names = [f"f{i}" for i in range(3)]
    X = pd.DataFrame(np.random.rand(n_samples, 3), columns=feature_names)

    # NumPy converts the mixed int/str list into a string array, so the
    # integer 1 ends up in the target as the label "1".
    y = pd.Series(
        np.random.permutation([1, "B", "CC", "d"] * self.rows), name="target"
    )

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        explain_level=0,
        start_random_models=1,
    )
    automl.fit(X, y)
    pred = automl.predict(X)

    u = np.unique(pred)
    # Check against the labels present in y. The list used to contain "a",
    # which is not a label of this dataset, so a model predicting only the
    # "1" class failed the test for no good reason.
    expected_labels = ["1", "B", "CC", "d"]
    self.assertTrue(np.intersect1d(u, expected_labels).shape[0] > 0)
    self.assertTrue(len(u) <= 4)
def test_bin_class_01(self):
    """Train a binary classifier on 0/1 labels and inspect the prediction frame.

    The prediction result must expose the columns ``prediction_0``,
    ``prediction_1`` and ``label``; the ``label`` column must contain 0 or
    1 and at most two distinct values.
    """
    feature_names = [f"f{i}" for i in range(3)]
    X = pd.DataFrame(np.random.rand(self.rows, 3), columns=feature_names)
    y = np.random.randint(0, 2, self.rows)

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
        explain_level=0,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(X, y)
    pred = automl.predict(X)

    available_columns = pred.columns.tolist()
    for col in ("prediction_0", "prediction_1", "label"):
        self.assertTrue(col in available_columns)

    u = np.unique(pred["label"].values)
    self.assertTrue(any(label in u for label in (0, 1)))
    self.assertTrue(len(u) <= 2)
def test_integration(self):
    """Run fit/predict end to end on a synthetic binary classification set.

    The test passes when the prediction result has a ``label`` column.
    """
    automl = AutoML(results_path=self.automl_dir, model_time_limit=1)
    automl.set_advanced(start_random_models=1)

    dataset_options = {
        "n_samples": 100,
        "n_features": 5,
        "n_informative": 4,
        "n_redundant": 1,
        "n_classes": 2,
        "n_clusters_per_class": 3,
        "n_repeated": 0,
        "shuffle": False,
        "random_state": 0,
    }
    X, y = datasets.make_classification(**dataset_options)
    feature_names = [f"f_{i}" for i in range(X.shape[1])]
    X = pd.DataFrame(X, columns=feature_names)

    automl.fit(X, y)
    predictions = automl.predict(X)
    self.assertTrue("label" in predictions.columns)
def test_multi_class_abcd(self):
    """Train on the string labels a/B/CC/d and inspect the prediction frame.

    The prediction result must expose one ``prediction_<label>`` column per
    class plus a ``label`` column; ``label`` must contain at least one of
    the training labels and at most four distinct values.
    """
    class_labels = ["a", "B", "CC", "d"]
    n_samples = self.rows * 4
    feature_names = [f"f{i}" for i in range(3)]

    X = pd.DataFrame(np.random.rand(n_samples, 3), columns=feature_names)
    y = pd.Series(np.random.permutation(class_labels * self.rows), name="target")

    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        algorithms=["Xgboost"],
        train_ensemble=False,
    )
    automl.set_advanced(start_random_models=1)
    automl.fit(X, y)
    pred = automl.predict(X)

    available_columns = pred.columns.tolist()
    expected_columns = (
        "prediction_a",
        "prediction_B",
        "prediction_CC",
        "prediction_d",
        "label",
    )
    for col in expected_columns:
        self.assertTrue(col in available_columns)

    u = np.unique(pred["label"].values)
    self.assertTrue(np.intersect1d(u, class_labels).shape[0] > 0)
    self.assertTrue(len(u) <= 4)
class JarAutoML(AutoMachineLearning):
    """``AutoMachineLearning`` variant that delegates to an ``AutoML`` estimator."""

    def __init__(self, n_folds_validation: int, shuffle_data: bool, max_rand: int) -> None:
        """Forward the settings to the base class and create the estimator.

        The estimator runs in "Compete" mode with k-fold validation.
        """
        super().__init__(n_folds_validation, shuffle_data, max_rand)

        # NOTE(review): _random_state, _n_folds_validation and _shuffle_data
        # are presumably set by the base-class __init__ - confirm there.
        estimator_options = {
            "mode": "Compete",
            "explain_level": 0,
            "random_state": self._random_state,
            "validation_strategy": {
                "validation_type": "kfold",
                "k_folds": self._n_folds_validation,
                "shuffle": self._shuffle_data,
            },
        }
        self.estimator = AutoML(**estimator_options)

    def fit_model(self, x_train: DataFrame, y_train: NpArray) -> None:
        """Fit the wrapped estimator on the training data.

        Implements the base class's abstract method (per the original
        author's note).
        """
        self.estimator.fit(x_train, y_train)

    def predict_model(self, x_test: DataFrame) -> tuple:
        """Predict for ``x_test`` and return the predictions as a tuple.

        Implements the base class's abstract method (per the original
        author's note).
        """
        return tuple(self.estimator.predict(x_test))
def test_integration(self):
    """Run fit/predict end to end on a synthetic binary classification set.

    The test passes when ``predict`` returns a NumPy array with one entry
    per input row.
    """
    automl = AutoML(
        results_path=self.automl_dir,
        total_time_limit=1,
        explain_level=0,
        start_random_models=1,
    )

    dataset_options = {
        "n_samples": 100,
        "n_features": 5,
        "n_informative": 4,
        "n_redundant": 1,
        "n_classes": 2,
        "n_clusters_per_class": 3,
        "n_repeated": 0,
        "shuffle": False,
        "random_state": 0,
    }
    X, y = datasets.make_classification(**dataset_options)

    automl.fit(X, y)
    predictions = automl.predict(X)

    self.assertIsInstance(predictions, np.ndarray)
    self.assertEqual(len(predictions), X.shape[0])
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from supervised import AutoML

# Example: train AutoML on the Titanic training set and report the accuracy
# reached on a labelled test set.
TRAIN_URL = "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv"
TEST_URL = "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv"

train = pd.read_csv(TRAIN_URL)

# Everything from the third column onwards is used as a feature.
feature_columns = train.columns[2:]
X = train[feature_columns]
y = train["Survived"]

automl = AutoML(results_path="AutoML_3")
automl.fit(X, y)

test = pd.read_csv(TEST_URL)
predictions = automl.predict(test)

accuracy_percent = accuracy_score(test["Survived"], predictions) * 100.0
print(f"Accuracy: {accuracy_percent:.2f}%")
from db import get_live_data
from db import insert_predictions
from supervised import AutoML

# Fetch the rows waiting for a prediction, score them and store the result.
X_live, ids = get_live_data()

has_new_rows = X_live is not None and bool(X_live.shape[0])
if has_new_rows:
    print("Compute predictions")
    # NOTE(review): no fit() call here - predict() presumably relies on
    # models already saved under "Response_Classifier"; confirm.
    automl = AutoML(results_path="Response_Classifier")
    predictions = automl.predict(X_live)

    print("Insert prediction into DB")
    insert_predictions(predictions, ids)
else:
    print("No new data")
from sheets import get_train_data, get_client, write_out
from supervised import AutoML
import pandas as pd
from sklearn.model_selection import train_test_split

# Settings used to reach the training sheet and to share the results.
df_name = "sheet name"
cred_path = "path/credentials.json"
email = "*****@*****.**"

# Load the training data through the sheets client.
client = get_client(cred_path)
X_train, y_train = get_train_data(client, df_name)

# Hold out 10% of the rows as a test split.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.1
)

automl = AutoML(results_path="Automl_output", total_time_limit=10)
# NOTE(review): training is disabled in the original script, so predict()
# presumably relies on models already saved under "Automl_output" - confirm.
# automl.fit(X_train, y_train)

train_pred = automl.predict(X_train)
test_pred = automl.predict(X_test)

# Collect targets and predictions for both splits and write them out.
data = {
    "train_target": y_train,
    "train_prediction": train_pred,
    "test_target": y_test,
    "test_prediction": test_pred,
}
write_out(client, data, email)
import pandas as pd
from supervised import AutoML

# Train AutoML on train.csv and write its predictions for test.csv into a
# copy of the sample submission.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sub = pd.read_csv("sample_submission.csv")

# Everything from the third column onwards is used as a feature.
x_cols = train.columns[2:]
print(x_cols)

automl_options = {
    "results_path": "AutoML_3",
    "mode": "Compete",
    "total_time_limit": 4 * 3600,  # presumably seconds, i.e. four hours
    "eval_metric": "r2",
}
automl = AutoML(**automl_options)
automl.fit(train[x_cols], train["y"])

# Every submission column after the first receives the predictions.
prediction_columns = sub.columns[1:]
sub[prediction_columns] = automl.predict(test)
sub.to_csv("sub_1.csv", index=False)
import numpy as np
import pandas as pd
from supervised import AutoML

# Train AutoML on the log of the target and write the back-transformed
# predictions for test.csv into a copy of the sample submission.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sub = pd.read_csv("sample_submission.csv")

# Everything from the third column onwards is used as a feature.
x_cols = train.columns[2:]
print(x_cols)
print(train.columns)

# Show the target range before the log transform.
target = train["target"]
print(target.min())
print(target.max())

automl = AutoML(mode="Compete", eval_metric="rmse", total_time_limit=4 * 3600)
automl.fit(train[x_cols], np.log(target))

# The model predicts log(target), so exponentiate to undo the transform.
log_predictions = automl.predict(test)
sub[sub.columns[1]] = np.exp(log_predictions)
sub.to_csv("sub_1.csv", index=False)