Exemple #1
0
class AutoML(AutoMLCore):
    """AutoML backend that tunes and fits an MLBox pipeline."""

    name = "mlbox"

    def fit(self):
        """
        Performs the search.

        Splits ``self.dataset`` with ``train_test_split(test_size=0.1)``,
        evaluates the default pipeline, searches the hyper-parameter space
        with ``Optimiser`` (``n_folds=3``, 15 evaluations) and stores the
        result of ``Predictor().fit_predict`` in ``self.automl``.
        """
        super().fit()
        self.train, self.test = train_test_split(self.dataset, test_size=0.1)
        self.data_dict = {
            "train": self.train,
            "test": self.test,
            "target": self.train[self.target],
        }

        # The scorer must be bound as a local for BOTH problem types: it is
        # read right below. Previously the non-classification branch only set
        # the attribute, so the Optimiser call failed with an unbound local.
        if self.problem_type == "classification":
            optimization_scorer = "accuracy"
        else:
            # NOTE(review): recent MLBox/scikit-learn releases spell this
            # scorer "neg_mean_squared_error" - confirm against the pinned
            # version before changing the string.
            optimization_scorer = "mean_squared_error"
        # Keep the attribute the original code exposed for regression runs.
        self.optimization_scorer = optimization_scorer

        opt = Optimiser(scoring=optimization_scorer, n_folds=3)
        # Score the default configuration (params=None) before searching.
        opt.evaluate(None, self.data_dict)
        space = {
            'ne__numerical_strategy': {
                "search": "choice",
                "space": [0, "mean", "median", "most_frequent"],
            },
            'ce__strategy': {
                "search": "choice",
                "space": ["label_encoding",
                          "random_projection",
                          "entity_embedding",
                          "dummification"],
            },
            'fs__threshold': {
                "search": "uniform",
                "space": [0.01, 0.3],
            },
            'est__max_depth': {
                "search": "choice",
                "space": [3, 4, 5, 6, 7, 10, 15, 20],
            },
            'est__n_estimators': {
                "search": "choice",
                "space": [50, 100, 200, 300, 400, 600, 800, 1000],
            },
        }
        best = opt.optimise(space, self.data_dict, 15)
        self.automl = Predictor().fit_predict(best, self.data_dict)

    def evaluate(self):
        """
        Evaluates and stores performance.

        Predicts on the train and test splits kept in ``self.data_dict``,
        scores both predictions and stores the result in ``self.metadata``
        (keys ``"metrics"`` and ``"experiment_settings"``), then
        pretty-prints it.
        """
        print("EVALUATING ESTIMATOR")
        train_preds = self.automl.predict(self.data_dict["train"])
        test_preds = self.automl.predict(self.data_dict["test"])
        # Inside a method body the bare name ``evaluate`` resolves to a
        # module-level callable, not to this method.
        # NOTE(review): ``y_train``, ``y_test``, ``problem_type`` and
        # ``experiment_settings`` are not bound anywhere in this class;
        # presumably they are module-level names - confirm, otherwise this
        # method raises NameError.
        train_score = evaluate(y_train, train_preds, problem_type)
        test_score = evaluate(y_test, test_preds, problem_type)
        self.metadata = {
            "metrics": {
                "test_score": test_score,
                "train_score": train_score
            },
            "experiment_settings": experiment_settings
        }
        pprint(self.metadata)
Exemple #2
0
def run_mlbox(params: 'ExecutionParams'):
    """Tune and fit an MLBox pipeline and return its test-set predictions.

    Parameters
    ----------
    params : ExecutionParams
        Object providing ``train_file``, ``test_file``, ``target_name``
        and ``task``.

    Returns
    -------
    tuple
        ``(true_target, predicted)``: the target values split off the test
        file by ``separate_target_column`` and the prediction column read
        back from the CSV that ``Predictor`` writes under ``save/``.
    """
    train_file_path = params.train_file
    test_file_path = params.test_file
    target_name = params.target_name
    task = params.task

    config_data = get_models_hyperparameters()['MLBox']
    new_test_file_path, true_target = separate_target_column(
        test_file_path, target_name)

    # Everything after the temporary test file exists runs under
    # try/finally so the file is removed even when a step fails.
    try:
        paths = [train_file_path, new_test_file_path]

        data = Reader(sep=",").train_test_split(paths, target_name)
        data = Drift_thresholder().fit_transform(data)

        is_classification = task is TaskTypesEnum.classification
        score = 'roc_auc' if is_classification else 'neg_mean_squared_error'

        opt = Optimiser(scoring=score, n_folds=5)
        # Separate name: the ``params`` argument is no longer rebound.
        best_params = opt.optimise(config_data['space'],
                                   data,
                                   max_evals=config_data['max_evals'])
        opt.evaluate(best_params, data)

        Predictor(verbose=False).fit_predict(best_params, data)

        cur_work_dir = os.path.abspath(os.curdir)

        predicted_df = pd.read_csv(
            os.path.join(cur_work_dir, f'save/{target_name}_predictions.csv'))
        if is_classification:
            # Column named after the positive class label.
            predicted = predicted_df['1.0']
        else:
            # Regression output has no per-class columns; MLBox stores the
            # values under "<target>_predicted".
            # NOTE(review): confirm the column name against the pinned
            # MLBox version.
            predicted = predicted_df[f'{target_name}_predicted']
    finally:
        os.remove(new_test_file_path)

    return true_target, predicted
def run_mlbox(train_file_path: str, test_file_path: str, target_name: str,
              task: MachineLearningTasksEnum):
    """Tune and fit an MLBox pipeline and return its ROC AUC on the test file.

    Parameters
    ----------
    train_file_path : str
        Path of the training CSV.
    test_file_path : str
        Path of the test CSV; its target column is split off into a
        temporary file by ``separate_target_column``.
    target_name : str
        Name of the target column.
    task : MachineLearningTasksEnum
        Selects the optimisation scorer (``'roc_auc'`` for classification,
        ``'neg_mean_squared_error'`` otherwise).

    Returns
    -------
    The value of ``roc_auc_score(test_target, predicted)``.
    """
    config_data = get_models_hyperparameters()['MLBox']
    new_test_file_path, test_target = separate_target_column(
        test_file_path, target_name)

    # Everything after the temporary test file exists runs under
    # try/finally so the file is removed even when a step fails.
    try:
        paths = [train_file_path, new_test_file_path]

        data = Reader(sep=",").train_test_split(paths, target_name)
        data = Drift_thresholder().fit_transform(data)

        score = ('roc_auc' if task is MachineLearningTasksEnum.classification
                 else 'neg_mean_squared_error')

        opt = Optimiser(scoring=score, n_folds=5)
        params = opt.optimise(config_data['space'],
                              data,
                              max_evals=config_data['max_evals'])
        opt.evaluate(params, data)

        Predictor(verbose=False).fit_predict(params, data)

        cur_work_dir = os.path.abspath(os.curdir)

        predicted_df = pd.read_csv(
            os.path.join(cur_work_dir, f'save/{target_name}_predictions.csv'))
        # NOTE(review): the '1.0' column and the ROC AUC metric are used for
        # every task value - presumably only classification is exercised
        # through this function; confirm with callers.
        predicted = predicted_df['1.0']
        metric = roc_auc_score(test_target, predicted)

        print(f'ROC_AUC: {metric}')
    finally:
        os.remove(new_test_file_path)

    return metric
Exemple #4
0
def mlbox_counter():
    """Tune and fit an MLBox LightGBM pipeline on the egg CSV files.

    Reads ``train_egg.csv`` / ``test_egg.csv`` with column ``'601'`` as
    the target, applies ``Drift_thresholder``, runs ``Optimiser``
    (accuracy scoring, ``n_folds=10``, 15 evaluations) over the search
    space below and calls ``Predictor.fit_predict`` with the best
    parameters found. Returns nothing.
    """
    from mlbox.preprocessing import Reader, Drift_thresholder
    from mlbox.optimisation import Optimiser
    from mlbox.prediction import Predictor

    def choice(*options):
        # Search-space entry that picks one of the listed values.
        return {"search": "choice", "space": list(options)}

    def uniform(low, high):
        # Search-space entry sampled between the two bounds.
        return {"search": "uniform", "space": [low, high]}

    label_column = '601'
    csv_paths = ['train_egg.csv', 'test_egg.csv']

    dataset = Reader(sep=",").train_test_split(csv_paths, label_column)
    # Drop non-stable features (like ID, ...).
    dataset = Drift_thresholder().fit_transform(dataset)

    search_space = {
        'est__strategy': choice("LightGBM"),
        'est__n_estimators': choice(150),
        'est__colsample_bytree': uniform(0.8, 0.95),
        'est__subsample': uniform(0.8, 0.95),
        'est__max_depth': choice(5, 6, 7, 8, 9),
        'est__learning_rate': choice(0.07),
    }

    tuner = Optimiser(scoring="accuracy", n_folds=10)
    best_params = tuner.optimise(search_space, dataset, 15)

    Predictor().fit_predict(best_params, dataset)
Exemple #5
0
 def fit(self):
     """
     Performs the search.

     Splits ``self.dataset`` with ``train_test_split(test_size=0.1)``,
     evaluates the default pipeline, searches the hyper-parameter space
     with ``Optimiser`` (``n_folds=3``, 15 evaluations) and stores the
     result of ``Predictor().fit_predict`` in ``self.automl``.
     """
     super().fit()
     self.train, self.test = train_test_split(self.dataset, test_size=0.1)
     self.data_dict = {
         "train": self.train,
         "test": self.test,
         "target": self.train[self.target],
     }

     # The scorer must be bound as a local for BOTH problem types: it is
     # read right below. Previously the non-classification branch only set
     # the attribute, so the Optimiser call failed with an unbound local.
     if self.problem_type == "classification":
         optimization_scorer = "accuracy"
     else:
         # NOTE(review): recent MLBox/scikit-learn releases spell this
         # scorer "neg_mean_squared_error" - confirm against the pinned
         # version before changing the string.
         optimization_scorer = "mean_squared_error"
     # Keep the attribute the original code exposed for regression runs.
     self.optimization_scorer = optimization_scorer

     opt = Optimiser(scoring=optimization_scorer, n_folds=3)
     # Score the default configuration (params=None) before searching.
     opt.evaluate(None, self.data_dict)
     space = {
         'ne__numerical_strategy': {
             "search": "choice",
             "space": [0, "mean", "median", "most_frequent"],
         },
         'ce__strategy': {
             "search": "choice",
             "space": ["label_encoding",
                       "random_projection",
                       "entity_embedding",
                       "dummification"],
         },
         'fs__threshold': {
             "search": "uniform",
             "space": [0.01, 0.3],
         },
         'est__max_depth': {
             "search": "choice",
             "space": [3, 4, 5, 6, 7, 10, 15, 20],
         },
         'est__n_estimators': {
             "search": "choice",
             "space": [50, 100, 200, 300, 400, 600, 800, 1000],
         },
     }
     best = opt.optimise(space, self.data_dict, 15)
     self.automl = Predictor().fit_predict(best, self.data_dict)
# Encoding strategies listed for 'ce__strategy':
#   {"label_encoding", "dummification", "random_projection", "entity_embedding"}
space = {
    'ne__numerical_strategy': {"search": "choice", "space": [0]},
    'ce__strategy': {
        "search": "choice",
        "space": ["label_encoding", "random_projection", "entity_embedding"],
    },
    'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]},
    'est__max_depth': {"search": "choice", "space": [3, 4, 5, 6, 7]},
}

# Hyper-parameter search over the whole pipeline, driven by the scoring
# function configured on `opt` (search algorithm: Tree Parzen Estimator),
# limited to 15 evaluations.
#
# IMPORTANT: avoid dependent parameters, and set only one feature
# selection strategy and one estimator strategy at a time.
# NOTE(review): `dict` here is the dataset variable defined earlier in the
# script; it shadows the builtin of the same name.
best = opt.optimise(space, dict, 15)

# Fit with the best parameters, predict, and save the results in the
# save folder.
prd = Predictor()
prd.fit_predict(best, dict)
Exemple #7
0
# Encoding strategies listed for 'ce__strategy':
#   {"label_encoding", "dummification", "random_projection", "entity_embedding"}
space = {
    'ne__numerical_strategy': {"search": "choice", "space": [0]},
    'ce__strategy': {
        "search": "choice",
        "space": ["label_encoding", "random_projection", "entity_embedding"],
    },
    'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]},
    'est__max_depth': {"search": "choice", "space": [3, 4, 5, 6, 7]},
}

# Hyper-parameter search over the whole pipeline, driven by the scoring
# function configured on `opt` (search algorithm: Tree Parzen Estimator),
# limited to 15 evaluations.
#
# IMPORTANT: avoid dependent parameters, and set only one feature
# selection strategy and one estimator strategy at a time.
best = opt.optimise(space, data, 15)

# Fit with the best parameters, predict, and save the results in the
# save folder.
prd = Predictor()
prd.fit_predict(best, data)
Exemple #8
0
def model_auto_mlbox( filepath= [ "train.csv", "test.csv" ],
    colX=None, coly=None,
    do="predict",
    outfolder="aaserialize/",
    model_type="regressor/classifier",
    params={ "csv_seprator" : ",", "train_size" : 0.5, "score_metric" : "accuracy",
             "n_folds": 3, "n_step": 10},
    param_space =  {
        'est__strategy':{"search":"choice",                         "space":["LightGBM"]},
        'est__n_estimators':{"search":"choice",                     "space":[150]},
        'est__colsample_bytree':{"search":"uniform",                "space":[0.8,0.95]},
        'est__subsample':{"search":"uniform",                       "space":[0.8,0.95]},
        'est__max_depth':{"search":"choice",                        "space":[5,6,7,8,9]},
        'est__learning_rate':{"search":"choice",                    "space":[0.07]}
    },
    generation=1,
    population_size=5,
    verbosity=2,
):
    """
      Using mlbox
      https://www.analyticsvidhya.com/blog/2017/07/mlbox-library-automated-machine-learning/


    Parameters
    ----------
    df : TYPE
        DESCRIPTION.
    colX : TYPE
        DESCRIPTION.
    coly : TYPE
        DESCRIPTION.
    outfolder : TYPE, optional
        DESCRIPTION. The default is "aaserialize/".
    model_type : TYPE, optional
        DESCRIPTION. The default is "regressor/classifier".
    params : TYPE, optional
        DESCRIPTION. The default is {"train_size" : 0.5}.
    generation : TYPE, optional
        DESCRIPTION. The default is 1.
    population_size : TYPE, optional
        DESCRIPTION. The default is 5.
    verbosity : TYPE, optional
        DESCRIPTION. The default is 2.

    Returns
    -------
    None.

    """
    from mlbox.preprocessing import Reader,Drift_thresholder
    from mlbox.optimisation import Optimiser
    from mlbox.prediction import Predictor

    p = dict2(params)


    ## Pre-process
    """
    df (dict, default = None) –
    Dataset dictionary. Must contain keys and values:

    ”train”: pandas DataFrame for the train set.
    ”test” : pandas DataFrame for the test set.
    ”target” : encoded pandas Serie for the target on train set (with dtype=’float’ for a regression or dtype=’int’ for a classification). Indexes should match the train set.

    """
    rd = Reader(sep = p.csv_separator)
    df = rd.train_test_split( filepath, coly)   # Reading and preprocessing (dates, ...)
    dft = Drift_thresholder()
    df = dft.fit_transform(df)      # Removing non-stable features (like ID,...)


    ### Optimal parameter
    # score_rmse = make_scorer(lambda y_true, y_pred: np.sqrt(np.sum((y_true - y_pred)**2)/len(y_true)), greater_is_better=False, needs_proba=False)
    #                    opt = Optimiser(scoring = rmse, n_folds = 3)

    opt = Optimiser(scoring = p.score_metric, n_folds = p.n_folds)
    param_optim = opt.optimise(param_space, df, p.n_step)


    if do == "prediction" :
      clf = Predictor(to_path= outfolder, verbose=True)

      #Fit and predict and save on disk
      clf.fit_predict(param_optim, df)

      # Load the predictions
      preds = pd.read_csv("save/"+coly+"_predictions.csv")
      print(preds.shape, preds.head(5))




      """