Example 1
    def params(self):
        """
        retrieves the estimated algorithm's parameters if the algo is supported,
        otherwise raises a KeyError

        :return: dictionary
        """
        if self.algo not in config("supported_algos"):
            raise KeyError(
                f'''{self.algo} not currently supported by this package''')
        return config(self.algo)
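For illustration, the lookup-or-raise pattern used by params() can be reproduced standalone. The SUPPORTED dict below is a hypothetical stand-in for scitime's JSON config, not its actual contents.

# hypothetical stand-in for the config("supported_algos") entries
SUPPORTED = {
    'RandomForestRegressor': {'n_estimators': 'int', 'max_depth': 'int'},
    'SVC': {'C': 'float', 'kernel': 'str'},
}

def lookup_params(algo_name):
    # return the configured parameters for algo_name, or raise KeyError
    if algo_name not in SUPPORTED:
        raise KeyError(f'{algo_name} not currently supported by this package')
    return SUPPORTED[algo_name]

print(lookup_params('RandomForestRegressor'))  # {'n_estimators': 'int', 'max_depth': 'int'}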
Example 2
    def _fetch_algo_metadata(algo):
        """
        retrieves algo name, algo params and meta params from sklearn model

        :param algo: sklearn model
        :return: dictionary
        :rtype: dict
        """
        algo_name = type(algo).__name__
        algo_params = algo.get_params()
        params = config(algo_name)

        param_dic = {'name': algo_name,
                     'params': algo_params, 'config': params}

        return param_dic
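The same metadata extraction can be tried directly on any scikit-learn estimator; only the config lookup is scitime-specific, so it is omitted from this sketch.

from sklearn.ensemble import RandomForestRegressor

algo = RandomForestRegressor(n_estimators=50)
algo_name = type(algo).__name__    # 'RandomForestRegressor'
algo_params = algo.get_params()    # dict of all hyperparameters of the model
print(algo_name, algo_params['n_estimators'])  # RandomForestRegressor 50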
Example 3
    def _random_search(self, inputs, outputs, iterations, save_model=False):
        """
        performs a random search on the NN meta algo to find the best params

        :param inputs: pd.DataFrame chosen as input
        :param outputs: pd.DataFrame chosen as output
        :param iterations: Number of parameter settings that are sampled
        :param save_model: boolean set to True if the model needs to be saved
        :return: best meta_algo with parameters
        :rtype: scikit-learn RandomizedSearchCV object
        """
        X, y, cols, original_cols = self._transform_data(inputs, outputs)

        if self.meta_algo != 'NN':
            raise KeyError(
                f'''meta algo {self.meta_algo} not supported for random search'''
            )

        parameter_space = config("random_search_params")
        meta_algo = MLPRegressor(max_iter=200)

        X_train, X_test, y_train, y_test \
            = train_test_split(X, y, test_size=0.20, random_state=42)

        X_train, X_test = self._scale_data(X_train, X_test, save_model)

        meta_algo = RandomizedSearchCV(meta_algo,
                                       parameter_space,
                                       n_iter=iterations,
                                       n_jobs=2)
        meta_algo.fit(X_train, y_train)

        if self.verbose >= 2:
            self.logger.info(
                f'''Best parameters found: {meta_algo.best_estimator_}''')

        return meta_algo
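The actual search space comes from the package's random_search_params config entry, which is not shown in this excerpt; the dictionary below is an illustrative guess built from standard MLPRegressor hyperparameters.

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RandomizedSearchCV

# illustrative search space only - not the package's real configuration
parameter_space = {
    'hidden_layer_sizes': [(50, 50), (100, 100, 100), (150, 100, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

search = RandomizedSearchCV(MLPRegressor(max_iter=200),
                            parameter_space, n_iter=10, n_jobs=2)
# search.fit(X_train, y_train) would then sample 10 random combinations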
Example 4
    def model_fit(self,
                  generate_data=True,
                  inputs=None,
                  outputs=None,
                  csv_name=None,
                  save_model=False,
                  meta_algo_params=None,
                  compress=3):
        """
        builds the actual training time estimator
        (currently we only support NN or RF).
        The data is either generated from scratch or taken as input.
        If save_model is set to True, the meta algo is saved as a pkl
        file along with associated metadata (column names, mse per bin)

        :param generate_data: bool (if set to True, calls _generate_data)
        :param inputs: pd.DataFrame chosen as input
        :param outputs: pd.DataFrame chosen as output
        :param csv_name: name of the csv in case we fetch data from csv
        :param save_model: boolean set to True if the model needs to be saved
        :param meta_algo_params: params of the meta algo
        :param compress: value between 1 and 9 to compress the pkl model (the higher the more compressed)
        :return: meta_algo
        :rtype: scikit-learn model
        """
        if meta_algo_params is None:
            if self.meta_algo == 'NN':
                meta_algo_params = \
                    {'max_iter': 200, 'hidden_layer_sizes': [100, 100, 100]}

            elif self.meta_algo == 'RF':
                meta_algo_params = \
                    {'criterion': 'mse', 'max_depth': 50, 'max_features': 10}

        if generate_data:
            inputs, outputs, _ = self._generate_data()
        else:
            if csv_name is not None:
                inputs, outputs = self._transform_from_csv(csv_name=csv_name)

        if inputs is None or outputs is None:
            raise NameError(
                '''no inputs / outputs found: please enter a csv name or set generate_data to True'''
            )

        X, y, cols, original_cols = self._transform_data(inputs, outputs)

        # we decide on a meta-algorithm
        if self.meta_algo not in config('supported_meta_algos'):
            raise KeyError(
                f'''meta algo {self.meta_algo} currently not supported''')

        if self.meta_algo == 'RF':
            meta_algo = RandomForestRegressor(**meta_algo_params)
        elif self.meta_algo == 'NN':
            meta_algo = MLPRegressor(**meta_algo_params)

        if self.verbose >= 2:
            self.logger.info(
                f'''Fitting {self.meta_algo} to estimate training durations for model {self.algo}'''
            )

        # dividing into train/test
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.20, random_state=42)

        if self.meta_algo == 'NN':
            X_train_scaled, X_test_scaled = \
                self._scale_data(X_train, X_test, save_model)

            meta_algo.fit(X_train_scaled, y_train)

        else:
            meta_algo.fit(X_train, y_train)

        if save_model:
            if self.verbose >= 2:
                self.logger.info(
                    f'''Saving {self.meta_algo} to {self.meta_algo}_{self.algo}_estimator.pkl'''
                )

            model_path = f'''{get_path("models")}/{self.meta_algo}_{self.algo}_estimator.pkl'''
            json_path = f'''{get_path("models")}/{self.meta_algo}_{self.algo}_estimator.json'''

            joblib.dump(meta_algo, model_path, compress=compress)

            with open(json_path, 'w') as outfile:
                json.dump(
                    {
                        "dummy": list(cols),
                        "original": list(original_cols)
                    }, outfile)

        if self.meta_algo == 'NN':
            if self.verbose >= 2:
                self.logger.info(
                    f'''R squared on train set is {r2_score(y_train, meta_algo.predict(X_train_scaled))}'''
                )

            # MAPE is the mean absolute percentage error
            test_relu = [max(i, 0) for i in meta_algo.predict(X_test_scaled)]
            train_relu = [max(i, 0) for i in meta_algo.predict(X_train_scaled)]
            y_pred_test = np.array(test_relu)
            y_pred_train = np.array(train_relu)

        else:
            if self.verbose >= 2:
                self.logger.info(
                    f'''R squared on train set is {r2_score(y_train, meta_algo.predict(X_train))}'''
                )

            y_pred_test = meta_algo.predict(X_test)
            y_pred_train = meta_algo.predict(X_train)

        mape_test = np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100
        mape_train = np.mean(np.abs((y_train - y_pred_train) / y_train)) * 100

        bins, mape_index_list = self.bins
        mid_bins = [(y_pred_test >= i[0]) & (y_pred_test < i[1]) for i in bins]

        bins_values = [y_pred_test < 1] + mid_bins + [y_pred_test >= 10 * 60]

        if save_model:
            mse_tests = [
                mean_squared_error(y_test[bin], y_pred_test[bin])
                for bin in bins_values
            ]

            observation_tests = [y_test[bin].shape[0] for bin in bins_values]

            mse_test_dic = dict(
                zip(mape_index_list, zip(observation_tests, mse_tests)))

            if self.verbose >= 2:
                self.logger.info(
                    f'''Computed mse on test set (with number of observations): {mse_test_dic}'''
                )

        if self.meta_algo == 'NN':
            if save_model:
                json_conf_path = f'''{get_path("models")}/{self.meta_algo}_{self.algo}_confint.json'''

                self.logger.info(
                    f'''Saving confint to {self.meta_algo}_{self.algo}_confint.json'''
                )

                with open(json_conf_path, 'w') as outfile:
                    json.dump(mse_test_dic, outfile)

        if self.verbose >= 2:
            rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
            rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

            self.logger.info(f'''
            MAPE on train set is: {mape_train}
            MAPE on test set is: {mape_test}
            RMSE on train set is {rmse_train}
            RMSE on test set is {rmse_test} ''')

        return meta_algo
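A hypothetical end-to-end call, assuming model_fit belongs to the Model class imported in the next example; the constructor arguments are guesses inferred from the attributes referenced above (self.algo, self.meta_algo, self.verbose), not a documented signature.

from scitime._model import Model

# assumed constructor arguments - inferred, not documented
trainer = Model(algo='RandomForestRegressor', meta_algo='RF', verbose=2)

# generate training data from scratch, fit the runtime estimator and
# persist it (e.g. as RF_RandomForestRegressor_estimator.pkl)
meta_algo = trainer.model_fit(generate_data=True, save_model=True)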
Example 5
import argparse
import numpy as np

from scitime._model import Model
from scitime._utils import config

SUPPORTED_META_ALGOS = config('supported_meta_algos')
SUPPORTED_ALGOS = config('supported_algos')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='''Gather & Persist
    Data of model training runtimes''')

    parser.add_argument('--drop_rate',
                        required=False,
                        default=0.999,
                        help='''drop rate of number of data generated
                         (from all param combinations taken from _config.json).
                         Default is 0.999''')

    parser.add_argument('--meta_algo',
                        required=False,
                        choices=SUPPORTED_META_ALGOS,
                        help='''meta algo used to
                         fit the meta model (NN or RF) - default is RF''')

    parser.add_argument('--verbose',
                        required=False,
                        default=1,
                        help='verbose mode (0, 1, 2 or 3)')
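The excerpt stops before the arguments are consumed; below is a hedged sketch of how the script could continue, reusing the same assumed Model constructor as above.

    args = parser.parse_args()

    # assumed continuation: build the meta-model trainer and fit it
    m = Model(drop_rate=float(args.drop_rate),
              meta_algo=args.meta_algo or 'RF',
              verbose=int(args.verbose))
    m.model_fit(generate_data=True, save_model=True)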
Example 6
    def _estimate(self, algo, X, y=None):
        """
        estimates the model's training time before the fit is actually run

        :param algo: algo whose runtime the user wants to predict
        :param X: np.array of inputs to be trained
        :param y: np.array of outputs to be trained
        (set to None if unsupervised algo)
        :return: predicted runtime,
        low and high values of the confidence interval
        :rtype: tuple
        """
        # fetching sklearn model of the end user
        param_dic = self._fetch_algo_metadata(algo)
        algo_name = param_dic['name']

        if algo_name not in config("supported_algos"):
            raise NotImplementedError(f'''{algo_name} not currently supported by this package''')

        if self.meta_algo not in config('supported_meta_algos'):
            raise KeyError(f'''meta algo {self.meta_algo} currently not supported''')

        if self.verbose >= 3:
            self.logger.debug(f'''Fetching estimator: {self.meta_algo}_{algo_name}_estimator.pkl''')

        model_path = f'''{get_path("models")}/{self.meta_algo}_{algo_name}_estimator.pkl'''

        meta_estimator = joblib.load(model_path)

        # retrieving all parameters of interest:
        df = self._fetch_params(algo, X, y)

        # Transforming the inputs:
        if self.meta_algo == 'NN':
            meta_X = self._transform_params(algo, df, scaled=True)
        else:
            meta_X = self._transform_params(algo, df)

        prediction = max(np.float64(0), meta_estimator.predict(meta_X)[0])

        # if prediction from NN is too low, let's go back to RF
        if prediction < 1 and self.meta_algo == 'NN':
            if self.verbose >= 3:
                self.logger.debug('''NN prediction too low - fetching rf meta algo instead''')

                self.logger.debug(f'''Fetching estimator: RF_{algo_name}_estimator.pkl''')

            model_path = f'{get_path("models")}/RF_{algo_name}_estimator.pkl'
            meta_estimator = joblib.load(model_path)
            meta_X = self._transform_params(algo, df)
            prediction = meta_estimator.predict(meta_X)[0]

        lower_bound, upper_bound = \
            self._estimate_interval(meta_estimator,
                                    meta_X, algo_name, self.confidence)

        cleaned_prediction = self._clean_output(round(prediction))
        cleaned_lower_bound = self._clean_output(round(lower_bound))
        cleaned_upper_bound = self._clean_output(round(upper_bound))

        if self.verbose >= 1 and prediction < 1:
            self.logger.warning('''Your model predicted training runtime is very low - no need to use this package''')

        if prediction < 1:
            cleaned_prediction = f'{prediction} seconds'
            cleaned_lower_bound = f'{lower_bound} seconds'
            cleaned_upper_bound = f'{upper_bound} seconds'

        if self.verbose >= 2:
            self.logger.info(f'''Training your {algo_name} model should take ~ {cleaned_prediction}''')

            self.logger.info(f'''The {100 * self.confidence}% prediction interval is [{cleaned_lower_bound}, {cleaned_upper_bound}]''')

        return prediction, lower_bound, upper_bound
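For context, a hedged sketch of how an end user reaches this code path through the public API; the Estimator class name and its time() wrapper around _estimate are assumptions based on scitime's documented usage and may differ between versions.

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from scitime import Estimator  # assumed public class name

# assumed wrapper around _estimate: predict the fit duration of an
# untrained sklearn model on a given dataset before calling .fit()
estimator = Estimator(meta_algo='RF', verbose=2)
rf = RandomForestRegressor()
X, y = np.random.rand(100000, 10), np.random.rand(100000)
duration, lower_bound, upper_bound = estimator.time(rf, X, y)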