class AutoML(AutoMLCore):
    """AutoML wrapper that runs an MLBox hyper-parameter search.

    ``fit`` splits ``self.dataset`` into train/test parts, searches a fixed
    hyper-parameter space with an MLBox ``Optimiser`` and stores the result of
    ``Predictor().fit_predict`` in ``self.automl``.  ``evaluate`` scores the
    fitted object on both parts and stores the scores in ``self.metadata``.
    """

    name = "mlbox"

    def fit(self):
        """Perform the hyper-parameter search and fit the final predictor.

        Side effects: sets ``self.train``, ``self.test``, ``self.data_dict``,
        ``self.optimization_scorer`` and ``self.automl``.
        """
        super().fit()

        self.train, self.test = train_test_split(self.dataset, test_size=0.1)
        # NOTE(review): ``self.train`` is passed as the "train" frame without
        # dropping ``self.target`` -- confirm the target column is not leaking
        # into the features.
        self.data_dict = {
            "train": self.train,
            "test": self.test,
            "target": self.train[self.target],
        }

        # The scorer must be bound on both branches: previously the
        # non-classification branch only set the attribute, so the local name
        # used below was undefined and raised NameError.
        if self.problem_type == "classification":
            optimization_scorer = "accuracy"
        else:
            optimization_scorer = "mean_squared_error"
        # Keep the attribute available for code that already reads it.
        self.optimization_scorer = optimization_scorer

        opt = Optimiser(scoring=optimization_scorer, n_folds=3)
        opt.evaluate(None, self.data_dict)

        space = {
            'ne__numerical_strategy': {
                "search": "choice",
                "space": [0, "mean", "median", "most_frequent"],
            },
            'ce__strategy': {
                "search": "choice",
                "space": ["label_encoding", "random_projection",
                          "entity_embedding", "dummification"],
            },
            'fs__threshold': {
                "search": "uniform",
                "space": [0.01, 0.3],
            },
            'est__max_depth': {
                "search": "choice",
                "space": [3, 4, 5, 6, 7, 10, 15, 20],
            },
            'est__n_estimators': {
                "search": "choice",
                "space": [50, 100, 200, 300, 400, 600, 800, 1000],
            },
        }

        best = opt.optimise(space, self.data_dict, 15)
        self.automl = Predictor().fit_predict(best, self.data_dict)

    def evaluate(self):
        """Evaluate the fitted object on train and test data and store scores.

        Side effects: sets ``self.metadata`` and pretty-prints it.
        """
        print("EVALUATING ESTIMATOR")

        train_preds = self.automl.predict(self.data_dict["train"])
        test_preds = self.automl.predict(self.data_dict["test"])

        # Ground truth and problem type come from the instance state created
        # in ``fit``; the bare names used previously were not bound anywhere
        # in this class.
        y_train = self.train[self.target]
        y_test = self.test[self.target]

        # ``evaluate`` here resolves to the module-level scoring helper, not
        # to this method.
        train_score = evaluate(y_train, train_preds, self.problem_type)
        test_score = evaluate(y_test, test_preds, self.problem_type)

        self.metadata = {
            "metrics": {
                "test_score": test_score,
                "train_score": train_score,
            },
            # NOTE(review): ``experiment_settings`` is not defined in this
            # class; presumably a module-level object -- confirm.
            "experiment_settings": experiment_settings,
        }
        pprint(self.metadata)
def run_mlbox(params: 'ExecutionParams'):
    """Run an MLBox search on the given files and return its test predictions.

    Reads ``train_file``, ``test_file``, ``target_name`` and ``task`` from
    *params*, optimises the search space configured under the ``'MLBox'`` key
    of ``get_models_hyperparameters()``, fits a ``Predictor`` and loads the
    predictions it saved under ``save/<target_name>_predictions.csv`` in the
    current working directory.

    Returns:
        A ``(true_target, predicted)`` tuple: the target values split off the
        test file and the predicted values read from the saved CSV.
    """
    train_file_path = params.train_file
    test_file_path = params.test_file
    target_name = params.target_name
    task = params.task

    config_data = get_models_hyperparameters()['MLBox']
    new_test_file_path, true_target = separate_target_column(
        test_file_path, target_name)

    # Everything after the target-less test file is produced runs under
    # try/finally so the file is removed even when the search or the CSV
    # read fails.
    try:
        paths = [train_file_path, new_test_file_path]

        data = Reader(sep=",").train_test_split(paths, target_name)
        data = Drift_thresholder().fit_transform(data)

        is_classification = task is TaskTypesEnum.classification
        score = 'roc_auc' if is_classification else 'neg_mean_squared_error'

        opt = Optimiser(scoring=score, n_folds=5)
        # Use a dedicated name instead of rebinding the ``params`` argument.
        best_params = opt.optimise(config_data['space'], data,
                                   max_evals=config_data['max_evals'])
        opt.evaluate(best_params, data)

        Predictor(verbose=False).fit_predict(best_params, data)

        cur_work_dir = os.path.abspath(os.curdir)
        predicted_df = pd.read_csv(
            os.path.join(cur_work_dir, f'save/{target_name}_predictions.csv'))

        if is_classification:
            # Column holding the score for the class labelled 1.0.
            predicted = predicted_df['1.0']
        else:
            # A regression output has no per-class columns; MLBox is expected
            # to store the values under "<target>_predicted".
            # NOTE(review): confirm against the installed MLBox version.
            predicted = predicted_df[f'{target_name}_predicted']
    finally:
        os.remove(new_test_file_path)

    return true_target, predicted
def run_mlbox(train_file_path: str, test_file_path: str, target_name: str,
              task: MachineLearningTasksEnum):
    """Run an MLBox search on the given files and return the test ROC AUC.

    Optimises the search space configured under the ``'MLBox'`` key of
    ``get_models_hyperparameters()``, fits a ``Predictor`` and scores the
    predictions it saved under ``save/<target_name>_predictions.csv`` in the
    current working directory against the target split off the test file.

    Args:
        train_file_path: Path to the training CSV.
        test_file_path: Path to the test CSV (still containing the target).
        target_name: Name of the target column.
        task: Task type; selects the scoring used during optimisation.

    Returns:
        The ROC AUC of the ``'1.0'`` prediction column on the test target.
    """
    config_data = get_models_hyperparameters()['MLBox']
    new_test_file_path, test_target = separate_target_column(
        test_file_path, target_name)

    # Run the whole pipeline under try/finally so the target-less test file
    # is removed even when the search, the CSV read or the scoring fails.
    try:
        paths = [train_file_path, new_test_file_path]

        data = Reader(sep=",").train_test_split(paths, target_name)
        data = Drift_thresholder().fit_transform(data)

        if task is MachineLearningTasksEnum.classification:
            score = 'roc_auc'
        else:
            score = 'neg_mean_squared_error'

        opt = Optimiser(scoring=score, n_folds=5)
        params = opt.optimise(config_data['space'], data,
                              max_evals=config_data['max_evals'])
        opt.evaluate(params, data)

        Predictor(verbose=False).fit_predict(params, data)

        cur_work_dir = os.path.abspath(os.curdir)
        predicted_df = pd.read_csv(
            os.path.join(cur_work_dir, f'save/{target_name}_predictions.csv'))
        # Column holding the score for the class labelled 1.0.
        predicted = predicted_df['1.0']

        metric = roc_auc_score(test_target, predicted)
        print(f'ROC_AUC: {metric}')
    finally:
        os.remove(new_test_file_path)

    return metric
def mlbox_counter():
    """Run a fixed MLBox LightGBM search on ``train_egg.csv``/``test_egg.csv``.

    Reads both CSV files with target column ``'601'``, applies the drift
    thresholder, optimises a small LightGBM search space for accuracy with
    10 folds, then fits a ``Predictor`` with the best parameters found.
    """
    from mlbox.preprocessing import Reader, Drift_thresholder
    from mlbox.optimisation import Optimiser
    from mlbox.prediction import Predictor

    def _choice(*options):
        # Search-space entry that picks one of the listed values.
        return {"search": "choice", "space": list(options)}

    def _uniform(low, high):
        # Search-space entry sampled uniformly between the two bounds.
        return {"search": "uniform", "space": [low, high]}

    target_name = '601'

    reader = Reader(sep=",")
    dataset = reader.train_test_split(
        ['train_egg.csv', 'test_egg.csv'], target_name)

    # Drop non-stable features (like ID, ...).
    drift_filter = Drift_thresholder()
    dataset = drift_filter.fit_transform(dataset)

    optimiser = Optimiser(scoring="accuracy", n_folds=10)

    search_space = {
        'est__strategy': _choice("LightGBM"),
        'est__n_estimators': _choice(150),
        'est__colsample_bytree': _uniform(0.8, 0.95),
        'est__subsample': _uniform(0.8, 0.95),
        'est__max_depth': _choice(5, 6, 7, 8, 9),
        'est__learning_rate': _choice(0.07),
    }

    best_params = optimiser.optimise(search_space, dataset, 15)

    predictor = Predictor()
    predictor.fit_predict(best_params, dataset)
def fit(self):
    """Perform the hyper-parameter search and fit the final predictor.

    Splits ``self.dataset`` into train/test parts, searches a fixed
    hyper-parameter space with an MLBox ``Optimiser`` and stores the result
    of ``Predictor().fit_predict`` in ``self.automl``.

    Side effects: sets ``self.train``, ``self.test``, ``self.data_dict``,
    ``self.optimization_scorer`` and ``self.automl``.
    """
    super().fit()

    self.train, self.test = train_test_split(self.dataset, test_size=0.1)
    # NOTE(review): ``self.train`` is passed as the "train" frame without
    # dropping ``self.target`` -- confirm the target column is not leaking
    # into the features.
    self.data_dict = {
        "train": self.train,
        "test": self.test,
        "target": self.train[self.target],
    }

    # The scorer must be bound on both branches: previously the
    # non-classification branch only set the attribute, so the local name
    # used below was undefined and raised NameError.
    if self.problem_type == "classification":
        optimization_scorer = "accuracy"
    else:
        optimization_scorer = "mean_squared_error"
    # Keep the attribute available for code that already reads it.
    self.optimization_scorer = optimization_scorer

    opt = Optimiser(scoring=optimization_scorer, n_folds=3)
    opt.evaluate(None, self.data_dict)

    space = {
        'ne__numerical_strategy': {
            "search": "choice",
            "space": [0, "mean", "median", "most_frequent"],
        },
        'ce__strategy': {
            "search": "choice",
            "space": ["label_encoding", "random_projection",
                      "entity_embedding", "dummification"],
        },
        'fs__threshold': {
            "search": "uniform",
            "space": [0.01, 0.3],
        },
        'est__max_depth': {
            "search": "choice",
            "space": [3, 4, 5, 6, 7, 10, 15, 20],
        },
        'est__n_estimators': {
            "search": "choice",
            "space": [50, 100, 200, 300, 400, 600, 800, 1000],
        },
    }

    best = opt.optimise(space, self.data_dict, 15)
    self.automl = Predictor().fit_predict(best, self.data_dict)
# {"label_encoding", "dummification", "random_projection", entity_embedding"} space = { 'ne__numerical_strategy': { "search": "choice", "space": [0] }, 'ce__strategy': { "search": "choice", "space": ["label_encoding", "random_projection", "entity_embedding"] }, 'fs__threshold': { "search": "uniform", "space": [0.01, 0.3] }, 'est__max_depth': { "search": "choice", "space": [3, 4, 5, 6, 7] } } # Optimises hyper-parameters of the whole Pipeline with a given scoring # function. Algorithm used to optimize : Tree Parzen Estimator. # # IMPORTANT : Try to avoid dependent parameters and to set one feature # selection strategy and one estimator strategy at a time. best = opt.optimise(space, dict, 15) # Make prediction and save the results in save folder. prd = Predictor() prd.fit_predict(best, dict)
# {"label_encoding", "dummification", "random_projection", entity_embedding"} space = { 'ne__numerical_strategy': { "search": "choice", "space": [0] }, 'ce__strategy': { "search": "choice", "space": ["label_encoding", "random_projection", "entity_embedding"] }, 'fs__threshold': { "search": "uniform", "space": [0.01, 0.3] }, 'est__max_depth': { "search": "choice", "space": [3, 4, 5, 6, 7] } } # Optimises hyper-parameters of the whole Pipeline with a given scoring # function. Algorithm used to optimize : Tree Parzen Estimator. # # IMPORTANT : Try to avoid dependent parameters and to set one feature # selection strategy and one estimator strategy at a time. best = opt.optimise(space, data, 15) # Make prediction and save the results in save folder. prd = Predictor() prd.fit_predict(best, data)
def model_auto_mlbox( filepath= [ "train.csv", "test.csv" ], colX=None, coly=None, do="predict", outfolder="aaserialize/", model_type="regressor/classifier", params={ "csv_seprator" : ",", "train_size" : 0.5, "score_metric" : "accuracy", "n_folds": 3, "n_step": 10}, param_space = { 'est__strategy':{"search":"choice", "space":["LightGBM"]}, 'est__n_estimators':{"search":"choice", "space":[150]}, 'est__colsample_bytree':{"search":"uniform", "space":[0.8,0.95]}, 'est__subsample':{"search":"uniform", "space":[0.8,0.95]}, 'est__max_depth':{"search":"choice", "space":[5,6,7,8,9]}, 'est__learning_rate':{"search":"choice", "space":[0.07]} }, generation=1, population_size=5, verbosity=2, ): """ Using mlbox https://www.analyticsvidhya.com/blog/2017/07/mlbox-library-automated-machine-learning/ Parameters ---------- df : TYPE DESCRIPTION. colX : TYPE DESCRIPTION. coly : TYPE DESCRIPTION. outfolder : TYPE, optional DESCRIPTION. The default is "aaserialize/". model_type : TYPE, optional DESCRIPTION. The default is "regressor/classifier". params : TYPE, optional DESCRIPTION. The default is {"train_size" : 0.5}. generation : TYPE, optional DESCRIPTION. The default is 1. population_size : TYPE, optional DESCRIPTION. The default is 5. verbosity : TYPE, optional DESCRIPTION. The default is 2. Returns ------- None. """ from mlbox.preprocessing import Reader,Drift_thresholder from mlbox.optimisation import Optimiser from mlbox.prediction import Predictor p = dict2(params) ## Pre-process """ df (dict, default = None) – Dataset dictionary. Must contain keys and values: ”train”: pandas DataFrame for the train set. ”test” : pandas DataFrame for the test set. ”target” : encoded pandas Serie for the target on train set (with dtype=’float’ for a regression or dtype=’int’ for a classification). Indexes should match the train set. """ rd = Reader(sep = p.csv_separator) df = rd.train_test_split( filepath, coly) # Reading and preprocessing (dates, ...) 
dft = Drift_thresholder() df = dft.fit_transform(df) # Removing non-stable features (like ID,...) ### Optimal parameter # score_rmse = make_scorer(lambda y_true, y_pred: np.sqrt(np.sum((y_true - y_pred)**2)/len(y_true)), greater_is_better=False, needs_proba=False) # opt = Optimiser(scoring = rmse, n_folds = 3) opt = Optimiser(scoring = p.score_metric, n_folds = p.n_folds) param_optim = opt.optimise(param_space, df, p.n_step) if do == "prediction" : clf = Predictor(to_path= outfolder, verbose=True) #Fit and predict and save on disk clf.fit_predict(param_optim, df) # Load the predictions preds = pd.read_csv("save/"+coly+"_predictions.csv") print(preds.shape, preds.head(5)) """