Ejemplo n.º 1
0
    def __init__(self,
                 allowed_pipelines=None,
                 max_iterations=None,
                 tuner_class=None,
                 random_state=0):
        """Base class for automated machine learning (AutoML) search algorithms.

        An AutoML algorithm owns the decision-making logic of a search: it chooses which pipelines are evaluated next and which parameters each of them is configured with.

        Subclasses must define a next_batch method returning the next group of pipelines to evaluate on the training data. That method may rely on state and results recorded from earlier batches; this base class does not track such information in a general way, so overriding add_result is a convenient place to record pipeline evaluation info when needed.

        Arguments:
            allowed_pipelines (list(class)): PipelineBase subclasses that the search may use. The default of None indicates all pipelines for this problem type are allowed.
            max_iterations (int): The maximum number of iterations to be evaluated.
            tuner_class (class): A Tuner subclass used to propose parameters for each pipeline. The default of None indicates the SKOptTuner will be used.
            random_state (int): Seed for the random number generator. Defaults to 0.
        """
        self.random_state = get_random_seed(random_state)
        self.allowed_pipelines = allowed_pipelines if allowed_pipelines else []
        self.max_iterations = max_iterations
        self._tuner_class = tuner_class if tuner_class else SKOptTuner
        # One tuner per allowed pipeline, keyed by the pipeline's name and
        # seeded with the same random state as the algorithm itself.
        self._tuners = {
            pipeline.name: self._tuner_class(pipeline.hyperparameters,
                                             random_state=self.random_state)
            for pipeline in self.allowed_pipelines
        }
        # Counters start at zero; nothing has been proposed yet.
        self._pipeline_number = 0
        self._batch_number = 0
Ejemplo n.º 2
0
    def __init__(self,
                 n_estimators=10,
                 eta=0.03,
                 max_depth=6,
                 bootstrap_type=None,
                 silent=False,
                 allow_writing_files=False,
                 random_state=0,
                 **kwargs):
        """Create the component and the underlying catboost.CatBoostRegressor.

        All named arguments except random_state are stored in the component's
        parameters and forwarded to CatBoostRegressor under the same names.

        Arguments:
            n_estimators: Forwarded to CatBoostRegressor. Defaults to 10.
            eta: Forwarded to CatBoostRegressor. Defaults to 0.03.
            max_depth: Forwarded to CatBoostRegressor. Defaults to 6.
            bootstrap_type: Forwarded to CatBoostRegressor only when it is not
                None; it is still recorded (as None) in the component parameters.
            silent: Forwarded to CatBoostRegressor. Defaults to False.
            allow_writing_files: Forwarded to CatBoostRegressor. Defaults to False.
            random_state: Passed to get_random_seed together with self.SEED_MIN
                and self.SEED_MAX to derive the random_seed given to CatBoost.
                Defaults to 0.
            **kwargs: Extra parameters, merged into the component parameters and
                forwarded to CatBoostRegressor.
        """
        random_seed = get_random_seed(random_state, self.SEED_MIN,
                                      self.SEED_MAX)
        parameters = {
            "n_estimators": n_estimators,
            "eta": eta,
            "max_depth": max_depth,
            'bootstrap_type': bootstrap_type,
            'silent': silent,
            'allow_writing_files': allow_writing_files
        }
        parameters.update(kwargs)

        # The sentence-ending period sits outside the backticks so that the
        # quoted command can be copied verbatim (`catboost.` is not a valid
        # package name for pip).
        cb_error_msg = "catboost is not installed. Please install using `pip install catboost`."
        catboost = import_or_raise("catboost", error_msg=cb_error_msg)
        # catboost will choose an intelligent default for bootstrap_type, so only set if provided
        cb_parameters = copy.copy(parameters)
        if bootstrap_type is None:
            cb_parameters.pop('bootstrap_type')
        cb_regressor = catboost.CatBoostRegressor(**cb_parameters,
                                                  random_seed=random_seed)
        super().__init__(parameters=parameters,
                         component_obj=cb_regressor,
                         random_state=random_state)
Ejemplo n.º 3
0
def test_more_top_n_unique_values_large():
    """Check the encoded column names when col_1 has more unique values than top_n.

    col_1 holds nine distinct values, each occurring once, so which three are
    kept depends on the seeded shuffle. The test reproduces that shuffle and
    compares the resulting column names with the encoder's output.
    """
    frame = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
        "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"],
        "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"],
        "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1]
    })

    seed = 2
    expected_random_state = get_random_seed(seed)

    encoder = OneHotEncoder(top_n=3, random_state=seed)
    encoder.fit(frame)
    transformed = encoder.transform(frame)

    # Conversion changes the resulting dataframe dtype, resulting in a different random state, so we need to make the conversion here too
    frame = _convert_woodwork_types_wrapper(
        _convert_to_woodwork_structure(frame).to_dataframe())

    # Shuffle the value counts with the seed, then stable-sort them by count
    # (mergesort) so that ties keep their shuffled order.
    shuffled_counts = (frame["col_1"].value_counts(dropna=False)
                       .to_frame()
                       .sample(frac=1, random_state=expected_random_state)
                       .sort_values(["col_1"], ascending=False, kind='mergesort'))
    top_n = encoder.parameters['top_n']
    kept_values = shuffled_counts.head(top_n).index.tolist()

    expected_col_names = {
        "col_2_a", "col_2_b", "col_2_c", "col_3_a", "col_3_b", "col_3_c",
        "col_4"
    }
    expected_col_names.update("col_1_" + value for value in kept_values)

    assert set(transformed.columns) == expected_col_names
Ejemplo n.º 4
0
    def __init__(self, parameters, random_state=0):
        """Machine learning pipeline made out of transformers and an estimator.

        Required Class Variables:
            component_graph (list): List of components in order. Accepts strings or ComponentBase subclasses in the list

        Arguments:
            parameters (dict): Maps each component name to a dictionary of that component's parameters.
                 An empty dictionary {} implies using all default values for component parameters.
            random_state (int): Seed for the random number generator. Defaults to 0.
        """
        self.random_state = get_random_seed(random_state)

        graph_spec = self.component_graph
        if not isinstance(graph_spec, list):
            self._component_graph = ComponentGraph(component_dict=graph_spec, random_state=self.random_state)
        else:
            # Backwards compatibility: a plain list is turned into a graph
            self._component_graph = ComponentGraph().from_list(graph_spec, random_state=self.random_state)
        self._component_graph.instantiate(parameters)

        self.input_feature_names = {}
        self.input_target_name = None

        # The pipeline only exposes an estimator when the graph's last
        # component is one; otherwise both the estimator and its name are None.
        last_component = self._component_graph.get_last_component()
        if isinstance(last_component, Estimator):
            self.estimator = last_component
            self._estimator_name = self._component_graph.compute_order[-1]
        else:
            self.estimator = None
            self._estimator_name = None

        self._validate_estimator_problem_type()
        self._is_fitted = False
        # Pipeline-level settings live under the "pipeline" key, if present
        self._pipeline_params = parameters.get("pipeline", {})
Ejemplo n.º 5
0
    def __init__(self, boosting_type="gbdt", learning_rate=0.1, n_estimators=100, max_depth=0, num_leaves=31, min_child_samples=20, n_jobs=-1, random_state=0, bagging_fraction=0.9, bagging_freq=0, **kwargs):
        """Create the component and the underlying lightgbm LGBMClassifier.

        The named arguments (except random_state) and any extra kwargs are kept
        as the component's parameters. A copy of them, adjusted for LightGBM's
        bagging constraints, is what gets forwarded to LGBMClassifier.

        Arguments:
            boosting_type: Forwarded to LGBMClassifier; "rf" and "goss" trigger
                the bagging adjustments described in the body. Defaults to "gbdt".
            learning_rate, n_estimators, max_depth, num_leaves,
            min_child_samples, n_jobs, bagging_fraction, bagging_freq:
                Forwarded to LGBMClassifier under the same names.
            random_state: Passed to get_random_seed together with self.SEED_MIN
                and self.SEED_MAX; the resulting seed is given both to
                LGBMClassifier and to the parent constructor. Defaults to 0.
            **kwargs: Extra parameters merged in after the named ones.
        """
        random_seed = get_random_seed(random_state, self.SEED_MIN, self.SEED_MAX)

        parameters = {
            "boosting_type": boosting_type,
            "learning_rate": learning_rate,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "num_leaves": num_leaves,
            "min_child_samples": min_child_samples,
            "n_jobs": n_jobs,
            "bagging_freq": bagging_freq,
            "bagging_fraction": bagging_fraction,
        }
        parameters.update(kwargs)

        # Adjust a copy so the recorded component parameters stay as given.
        lg_parameters = copy.copy(parameters)
        if boosting_type == "rf":
            # when boosting type is random forest (rf), LightGBM requires bagging_freq == 1 and 0 < bagging_fraction < 1.0
            lg_parameters['bagging_freq'] = 1
        elif boosting_type == "goss":
            # when boosting type is goss, LightGBM requires bagging_fraction == 1
            lg_parameters['bagging_fraction'] = 1

        # avoid lightgbm warnings having to do with parameter aliases
        bagging_unset = (lg_parameters['bagging_freq'] is None
                         and lg_parameters['bagging_fraction'] is None)
        if not bagging_unset:
            lg_parameters['subsample'] = None
            lg_parameters['subsample_freq'] = None

        lgbm_error_msg = "LightGBM is not installed. Please install using `pip install lightgbm`."
        lgbm = import_or_raise("lightgbm", error_msg=lgbm_error_msg)
        self._ordinal_encoder = None
        self._label_encoder = None

        lgbm_classifier = lgbm.sklearn.LGBMClassifier(random_state=random_seed, **lg_parameters)

        super().__init__(parameters=parameters,
                         component_obj=lgbm_classifier,
                         random_state=random_seed)
Ejemplo n.º 6
0
 def __init__(self, parameters=None, component_obj=None, random_state=0, **kwargs):
     """Store the component's seed, wrapped object and parameters.

     Arguments:
         parameters (dict): Parameters of the component. A falsy value
             (None or an empty dict) is stored as a new empty dict.
         component_obj: Object wrapped by this component; stored as given.
         random_state: Passed through get_random_seed and stored as
             self.random_state. Defaults to 0.
         **kwargs: Accepted but not used by this constructor.
     """
     self.random_state = get_random_seed(random_state)
     self._component_obj = component_obj
     self._parameters = parameters if parameters else {}
     # A freshly constructed component has not been fitted yet
     self._is_fitted = False
Ejemplo n.º 7
0
    def __init__(self, eta=0.1, max_depth=6, min_child_weight=1, n_estimators=100, random_state=0, **kwargs):
        """Create the component and the underlying xgboost.XGBClassifier.

        The named arguments (except random_state) and any extra kwargs become
        the component's parameters and are forwarded to XGBClassifier as-is.

        Arguments:
            eta: Forwarded to XGBClassifier. Defaults to 0.1.
            max_depth: Forwarded to XGBClassifier. Defaults to 6.
            min_child_weight: Forwarded to XGBClassifier. Defaults to 1.
            n_estimators: Forwarded to XGBClassifier. Defaults to 100.
            random_state: Passed to get_random_seed together with self.SEED_MIN
                and self.SEED_MAX to derive the seed given to XGBClassifier;
                the original value is handed to the parent constructor.
                Defaults to 0.
            **kwargs: Extra parameters merged in after the named ones.
        """
        random_seed = get_random_seed(random_state, self.SEED_MIN, self.SEED_MAX)
        parameters = {"eta": eta,
                      "max_depth": max_depth,
                      "min_child_weight": min_child_weight,
                      "n_estimators": n_estimators}
        parameters.update(kwargs)
        # The sentence-ending period sits outside the backticks so that the
        # quoted command can be copied verbatim (`xgboost.` is not a valid
        # package name for pip).
        xgb_error_msg = "XGBoost is not installed. Please install using `pip install xgboost`."
        xgb = import_or_raise("xgboost", error_msg=xgb_error_msg)
        xgb_classifier = xgb.XGBClassifier(**parameters,
                                           random_state=random_seed)

        super().__init__(parameters=parameters,
                         component_obj=xgb_classifier,
                         random_state=random_state)
Ejemplo n.º 8
0
def test_ohe_top_n_categories_always_the_same():
    """Check that repeated fit_transform calls on one encoder give equal frames.

    Three of the five categories occur equally often (three times each), so
    with top_n=4 the choice among them is a tie. Two consecutive
    fit_transform calls on the same encoder must still produce equal output.
    """
    category_counts = [("cat_1", 5), ("cat_2", 4), ("cat_3", 3),
                       ("cat_4", 3), ("cat_5", 3)]
    df = pd.DataFrame({
        "categories": [name for name, count in category_counts
                       for _ in range(count)],
        "numbers": range(18)
    })

    def assert_repeatable(random_seed):
        # Fit and transform twice with the same encoder and compare results
        encoder = OneHotEncoder(top_n=4, random_seed=random_seed)
        first = encoder.fit_transform(df).to_dataframe()
        second = encoder.fit_transform(df).to_dataframe()
        assert_frame_equal(first, second)

    # Once with a raw seed, once with the value returned by get_random_seed
    assert_repeatable(5)
    assert_repeatable(get_random_seed(5))
Ejemplo n.º 9
0
    def __init__(self, component_dict=None, random_state=0):
        """Initializes a component graph for a pipeline as a directed acyclic graph (DAG).

        Arguments:
            component_dict (dict): Maps each component name to a list whose first
                element identifies the component class (resolved through
                handle_component_class). A falsy value is treated as an empty graph.
            random_state (int): Passed through get_random_seed and stored as
                self.random_state. Defaults to 0.

        Raises:
            ValueError: If the information for any component is not a list.

        Example:
            >>> component_dict = {'imputer': ['Imputer'], 'ohe': ['One Hot Encoder', 'imputer.x'], 'estimator_1': ['Random Forest Classifier', 'ohe.x'], 'estimator_2': ['Decision Tree Classifier', 'ohe.x'], 'final': ['Logistic Regression Classifier', 'estimator_1', 'estimator_2']}
            >>> component_graph = ComponentGraph(component_dict)
        """
        self.random_state = get_random_seed(random_state)
        self.component_dict = component_dict if component_dict else {}
        self.component_instances = {}
        self._is_instantiated = False
        # Until the graph is instantiated, component_instances holds the
        # resolved component classes rather than instances.
        for name, info in self.component_dict.items():
            if not isinstance(info, list):
                raise ValueError(
                    'All component information should be passed in as a list')
            self.component_instances[name] = handle_component_class(info[0])
        self.compute_order = self.generate_order(self.component_dict)
        self.input_feature_names = {}
        self._i = 0