def test_outliers_data_check_init():
    """Check that OutliersDataCheck derives its random state from the given seed.

    For the default constructor and for ``random_state=2``, the first element
    of the check's ``random_state.get_state()`` must equal the first element
    of the state returned by ``get_random_state`` for seeds 0 and 2.
    """
    # (constructor keyword arguments, seed used to build the reference state)
    cases = [({}, 0), ({"random_state": 2}, 2)]
    for init_kwargs, reference_seed in cases:
        check = OutliersDataCheck(**init_kwargs)
        actual_first = check.random_state.get_state()[0]
        expected_first = get_random_state(reference_seed).get_state()[0]
        assert actual_first == expected_first
def predict(self, X):
    """Produce baseline predictions for every row of X.

    The behaviour is selected by ``self.parameters["strategy"]``:

    * ``"mode"``: every prediction is ``self._mode``.
    * ``"random"``: predictions are drawn from ``self._classes`` by a random
      state seeded with ``self.random_seed``.
    * anything else: same draw, weighted by ``self._percentage_freq``.

    Arguments:
        X: Input features; only its length is used after type inference.

    Returns:
        The predictions passed through ``infer_feature_types``.
    """
    X = infer_feature_types(X)
    strategy = self.parameters["strategy"]

    # The constant strategy never touches the random number generator.
    if strategy == "mode":
        return infer_feature_types(pd.Series([self._mode] * len(X)))

    rng = get_random_state(self.random_seed)
    if strategy == "random":
        drawn = rng.choice(self._classes, len(X))
    else:
        drawn = rng.choice(self._classes, len(X), p=self._percentage_freq)
    return infer_feature_types(drawn)
def __init__(self, random_state=0):
    """Checks if there are any outliers in the input data.

    Arguments:
        random_state (int, np.random.RandomState): The random seed/state,
            normalised through ``get_random_state``. Defaults to 0.
    """
    resolved_state = get_random_state(random_state)
    self.random_state = resolved_state
def test_baseline_binary_random_weighted(X_y_binary):
    """Check the "random_weighted" baseline strategy on binary data.

    Predictions must match a seed-0 weighted draw over the observed classes,
    predicted probabilities must equal the class frequencies on every row,
    and feature importance must be zero for every feature.
    """
    X, y = X_y_binary
    values, counts = np.unique(y, return_counts=True)
    percent_freq = counts.astype(float) / len(y)
    # The ratios are floats, so their sum is compared with a tolerance rather
    # than with exact equality, which can fail from rounding alone.
    assert np.isclose(percent_freq.sum(), 1.0)

    clf = BaselineClassifier(strategy="random_weighted", random_state=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1]

    expected_predictions = pd.Series(
        get_random_state(0).choice(np.unique(y), len(X), p=percent_freq),
        dtype="Int64",
    )
    predictions = clf.predict(X)
    assert_series_equal(expected_predictions, predictions.to_series())

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 2)
    # One row of class frequencies, repeated once per sample.
    expected_predictions_proba = pd.DataFrame(
        np.array([[percent_freq[i] for i in range(len(values))]] * len(X))
    )
    assert_frame_equal(expected_predictions_proba, predicted_proba.to_dataframe())

    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1]))
def test_more_top_n_unique_values_large():
    """Check which columns OneHotEncoder(top_n=3) emits when a column has many unique values.

    ``col_1`` has nine distinct values, so the expected ``col_1_*`` columns are
    rebuilt here with the same seeded shuffle, stable sort and ``head(top_n)``.
    """
    X = pd.DataFrame({
        "col_1": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
        "col_2": ["a", "a", "a", "b", "b", "c", "c", "d", "e"],
        "col_3": ["a", "a", "a", "b", "b", "b", "c", "c", "d"],
        "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1],
    })

    random_seed = 2
    test_random_state = get_random_state(random_seed)

    encoder = OneHotEncoder(top_n=3, random_state=random_seed)
    encoder.fit(X)
    X_t = encoder.transform(X)

    # Mirror the woodwork round-trip: per the original author, the conversion
    # changes the dataframe dtype, which in turn changes the outcome of the
    # seeded shuffle below, so the same conversion has to be applied here.
    X = _convert_to_woodwork_structure(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())

    col_1_samples = (
        X["col_1"]
        .value_counts(dropna=False)
        .to_frame()
        .sample(frac=1, random_state=test_random_state)
        .sort_values(["col_1"], ascending=False, kind='mergesort')
        .head(encoder.parameters['top_n'])
        .index
        .tolist()
    )

    expected_col_names = {
        "col_2_a", "col_2_b", "col_2_c",
        "col_3_a", "col_3_b", "col_3_c",
        "col_4",
    } | {"col_1_" + val for val in col_1_samples}

    assert set(X_t.columns) == expected_col_names
def test_xgboost_feature_name_with_random_ascii(problem_type, X_y_binary, X_y_multi):
    """Fit XGBoostClassifier on columns named after every printable ASCII character.

    The feature matrix is replaced by random values with one column per
    character in ``string.printable``. Predictions, predicted probabilities
    and feature importance must have the expected sizes and must not be
    entirely NaN.
    """
    clf = XGBoostClassifier()
    if problem_type == ProblemTypes.BINARY:
        (X, y), expected_cols = X_y_binary, 2
    elif problem_type == ProblemTypes.MULTICLASS:
        (X, y), expected_cols = X_y_multi, 3

    n_rows = X.shape[0]
    random_values = get_random_state(clf.random_state).random((n_rows, len(string.printable)))
    col_names = list(map('column_{}'.format, string.printable))
    X = pd.DataFrame(random_values, columns=col_names)

    clf.fit(X, y)

    predictions = clf.predict(X)
    assert len(predictions) == len(y)
    assert not np.isnan(predictions.to_series()).all()

    predictions = clf.predict_proba(X)
    assert predictions.shape == (len(y), expected_cols)
    assert not np.isnan(predictions.to_dataframe()).all().all()

    importance = clf.feature_importance
    assert len(importance) == len(X.columns)
    assert not np.isnan(clf.feature_importance).all().all()
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi, X_y_regression, helper_functions):
    """Fit every search estimator on columns named after each printable ASCII character.

    For each estimator class and each problem type it supports, the features
    are replaced by random values with one column per character in
    ``string.printable``. Feature importance and predictions must have the
    expected lengths and must not be entirely NaN.
    """
    # The column labels do not depend on the estimator, so build them once.
    col_names = list(map('column_{}'.format, string.printable))

    for estimator_class in _all_estimators_used_in_search():
        supported_problem_types = list(
            map(handle_problem_types, estimator_class.supported_problem_types)
        )
        for problem_type in supported_problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(estimator_class)

            if problem_type == ProblemTypes.BINARY:
                X, y = X_y_binary
            elif problem_type == ProblemTypes.MULTICLASS:
                X, y = X_y_multi
            elif problem_type == ProblemTypes.REGRESSION:
                X, y = X_y_regression

            shape = (X.shape[0], len(string.printable))
            X = pd.DataFrame(get_random_state(clf.random_state).random(shape), columns=col_names)

            clf.fit(X, y)
            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()

            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()
def __init__(self, pipeline_hyperparameter_ranges, random_seed=0, with_replacement=False, replacement_max_attempts=10):
    """Sets up the search space and the bookkeeping used to check for duplicates.

    Arguments:
        pipeline_hyperparameter_ranges (dict): a set of hyperparameter ranges corresponding to a pipeline's parameters
        random_seed (int): Seed for random number generator. Defaults to 0.
        with_replacement (bool): If false, only unique hyperparameters will be shown
        replacement_max_attempts (int): The maximum number of tries to get a unique set of random parameters.
            It is only stored here; the retry logic lives outside this constructor.
    """
    super().__init__(pipeline_hyperparameter_ranges, random_seed=random_seed)

    # Sampling machinery: the search space built from the ranges prepared by
    # the parent constructor, plus a seeded random state.
    self._space = Space(self._search_space_ranges)
    self._random_state = get_random_state(random_seed)

    # Replacement policy.
    self._with_replacement = with_replacement
    self._replacement_max_attempts = replacement_max_attempts

    # The set of used parameters starts out holding the empty tuple.
    self._used_parameters = {()}
    self.curr_params = None
def __init__(self, parameters, random_state=0):
    """Machine learning pipeline made out of transformers and a estimator.

    Required Class Variables:
        component_graph (list): List of components in order. Accepts strings or ComponentBase subclasses in the list.
            A non-list value is handed to ComponentGraph as ``component_dict``.

    Arguments:
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
             An empty dictionary {} implies using all default values for component parameters.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
    """
    self.random_state = get_random_state(random_state)

    if not isinstance(self.component_graph, list):
        self._component_graph = ComponentGraph(component_dict=self.component_graph,
                                               random_state=self.random_state)
    else:
        # Backwards compatibility: a plain list of components is still accepted.
        self._component_graph = ComponentGraph().from_list(self.component_graph,
                                                           random_state=self.random_state)
    self._component_graph.instantiate(parameters)

    self.input_feature_names = {}
    self.input_target_name = None

    # The pipeline only has an estimator (and an estimator name) when the
    # last component of the graph is an Estimator instance.
    final_component = self._component_graph.get_last_component()
    if isinstance(final_component, Estimator):
        self.estimator = final_component
        self._estimator_name = self._component_graph.compute_order[-1]
    else:
        self.estimator = None
        self._estimator_name = None

    self._validate_estimator_problem_type()
    self._is_fitted = False
    self._pipeline_params = parameters.get("pipeline", {})
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi, X_y_regression, ts_data, helper_functions):
    """Fit every search estimator on columns named after each printable ASCII character.

    ``ARIMARegressor`` is skipped by class name. For every other estimator and
    each supported problem type, the features are replaced by random values
    with one column per character in ``string.printable``. The estimator must
    report no input feature names before fitting and exactly the generated
    names afterwards. Feature importance and predictions must have the
    expected lengths and must not be entirely NaN.
    """
    for estimator_class in _all_estimators_used_in_search():
        if estimator_class.__name__ == 'ARIMARegressor':
            continue

        supported_problem_types = list(
            map(handle_problem_types, estimator_class.supported_problem_types)
        )
        for problem_type in supported_problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(estimator_class)

            if is_binary(problem_type):
                X, y = X_y_binary
            elif is_multiclass(problem_type):
                X, y = X_y_multi
            elif is_regression(problem_type):
                X, y = X_y_regression

            shape = (X.shape[0], len(string.printable))
            random_values = get_random_state(clf.random_seed).random(shape)
            col_names = list(map('column_{}'.format, string.printable))
            X = pd.DataFrame(random_values, columns=col_names)

            assert clf.input_feature_names is None
            clf.fit(X, y)

            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()

            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()

            assert clf.input_feature_names == col_names
def __init__(self, allowed_pipelines=None, max_iterations=None, tuner_class=None, random_state=0):
    """This class represents an automated machine learning (AutoML) algorithm.

    It encapsulates the decision-making logic behind an automl search: which
    pipelines to evaluate next, and which parameters to configure each
    pipeline with.

    To use this interface, define a next_batch method which returns the next
    group of pipelines to evaluate on the training data. That method may use
    state and results recorded from previous batches, although this base
    class does not track that information in a general way. Overriding
    add_result is a convenient way to record pipeline evaluation info if
    necessary.

    Arguments:
        allowed_pipelines (list(class)): A list of PipelineBase subclasses indicating the pipelines allowed in the search.
            A value of None is stored as an empty list by this constructor.
        max_iterations (int): The maximum number of iterations to be evaluated.
        tuner_class (class): A subclass of Tuner, to be used to find parameters for each pipeline. The default of None indicates the SKOptTuner will be used.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.
    """
    self.random_state = get_random_state(random_state)
    self.allowed_pipelines = allowed_pipelines or []
    self.max_iterations = max_iterations
    self._tuner_class = tuner_class or SKOptTuner

    # One tuner per allowed pipeline, keyed by pipeline name, all sharing
    # this algorithm's random state.
    self._tuners = {
        pipeline.name: self._tuner_class(pipeline.hyperparameters, random_state=self.random_state)
        for pipeline in self.allowed_pipelines
    }

    self._pipeline_number = 0
    self._batch_number = 0
def test_ohe_top_n_categories_always_the_same():
    """Check that refitting OneHotEncoder(top_n=4) on the same data gives the same frame.

    Three categories are tied at count 3, so only some of them fit in the top
    four. The check runs once with an integer seed and once with a random
    state object.
    """
    category_counts = [("cat_1", 5), ("cat_2", 4), ("cat_3", 3), ("cat_4", 3), ("cat_5", 3)]
    categories = [label for label, count in category_counts for _ in range(count)]
    df = pd.DataFrame({"categories": categories, "numbers": range(18)})

    def assert_refit_is_stable(random_state):
        # Two fit_transform calls on the same encoder must agree exactly.
        ohe = OneHotEncoder(top_n=4, random_state=random_state)
        first = ohe.fit_transform(df)
        second = ohe.fit_transform(df)
        pd.testing.assert_frame_equal(first, second)

    assert_refit_is_stable(5)
    assert_refit_is_stable(get_random_state(5))
def test_baseline_multiclass_random(X_y_multi):
    """Check the "random" baseline strategy on three-class data.

    Predictions must match a seed-0 uniform draw over the observed classes,
    every predicted probability must be 1/3, and feature importance must be
    zero for every feature.
    """
    X, y = X_y_multi
    values = np.unique(y)
    n_rows = len(X)

    clf = BaselineClassifier(strategy="random", random_seed=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1, 2]

    reference_draw = get_random_state(0).choice(np.unique(y), n_rows)
    expected_predictions = pd.Series(reference_draw, dtype="Int64")
    predictions = clf.predict(X)
    assert_series_equal(expected_predictions, predictions.to_series())

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (n_rows, 3)
    # One row of uniform probabilities, repeated once per sample.
    uniform_row = [1. / 3] * len(values)
    expected_proba = pd.DataFrame(np.array([uniform_row] * n_rows))
    assert_frame_equal(expected_proba, predicted_proba.to_dataframe())

    np.testing.assert_allclose(clf.feature_importance, np.zeros(X.shape[1]))
def test_xgboost_feature_name_with_random_ascii(X_y_regression):
    """Fit XGBoostRegressor on columns named after every printable ASCII character.

    The feature matrix is replaced by random values with one column per
    character in ``string.printable``. Predictions and feature importance
    must have the expected lengths and must not be entirely NaN.
    """
    X, y = X_y_regression
    clf = XGBoostRegressor()

    shape = (X.shape[0], len(string.printable))
    random_values = get_random_state(clf.random_state).random(shape)
    col_names = list(map('column_{}'.format, string.printable))
    X = pd.DataFrame(random_values, columns=col_names)

    clf.fit(X, y)

    predictions = clf.predict(X)
    assert len(predictions) == len(y)
    assert not np.isnan(predictions.to_series()).all()

    assert len(clf.feature_importance) == len(X.columns)
    assert not np.isnan(clf.feature_importance).all().all()
def test_baseline_binary_random(X_y_binary):
    """Check the "random" baseline strategy on binary data.

    Predictions must match a seed-0 uniform draw over the observed classes,
    every predicted probability must be 0.5, and feature importance must be
    zero for every feature.
    """
    X, y = X_y_binary
    values = np.unique(y)
    n_rows = len(X)

    clf = BaselineClassifier(strategy="random", random_state=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1]

    predictions = clf.predict(X)
    reference_draw = get_random_state(0).choice(np.unique(y), n_rows)
    np.testing.assert_allclose(predictions, reference_draw)

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (n_rows, 2)
    # One row of uniform probabilities, repeated once per sample.
    uniform_row = [0.5] * len(values)
    np.testing.assert_allclose(predicted_proba, np.array([uniform_row] * n_rows))

    np.testing.assert_allclose(clf.feature_importance, np.zeros(X.shape[1]))
def test_baseline_multi_random(X_y_multi):
    """Check the multiclass baseline pipeline with the "random" strategy.

    Predictions must match a seed-0 uniform draw over the observed classes,
    every predicted probability must be 1/3, and the second column of the
    feature importance frame must be all zeros.
    """
    X, y = X_y_multi
    values = np.unique(y)
    n_rows = len(X)

    clf = BaselineMulticlassPipeline(
        parameters={"Baseline Classifier": {"strategy": "random"}}
    )
    clf.fit(X, y)

    predicted_proba = clf.predict_proba(X)

    predictions = clf.predict(X)
    reference_draw = get_random_state(0).choice(np.unique(y), n_rows)
    np.testing.assert_allclose(predictions, reference_draw)

    assert predicted_proba.shape == (n_rows, 3)
    # One row of uniform probabilities, repeated once per sample.
    uniform_row = [1. / 3] * len(values)
    np.testing.assert_allclose(predicted_proba, np.array([uniform_row] * n_rows))

    np.testing.assert_allclose(clf.feature_importance.iloc[:, 1], np.zeros(X.shape[1]))
def test_baseline_binary_random_weighted(X_y_binary):
    """Check the "random_weighted" baseline strategy on binary data.

    Predictions must match a seed-0 weighted draw over the observed classes,
    predicted probabilities must equal the class frequencies on every row,
    and feature importance must be zero for every feature.
    """
    X, y = X_y_binary
    values, counts = np.unique(y, return_counts=True)
    percent_freq = counts.astype(float) / len(y)
    # The ratios are floats, so their sum is compared with a tolerance rather
    # than with exact equality, which can fail from rounding alone.
    assert np.isclose(percent_freq.sum(), 1.0)

    clf = BaselineClassifier(strategy="random_weighted", random_state=0)
    clf.fit(X, y)
    assert clf.classes_ == [0, 1]

    np.testing.assert_allclose(
        clf.predict(X),
        get_random_state(0).choice(np.unique(y), len(X), p=percent_freq),
    )

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (len(X), 2)
    # One row of class frequencies, repeated once per sample.
    np.testing.assert_allclose(
        predicted_proba,
        np.array([[percent_freq[i] for i in range(len(values))]] * len(X)),
    )

    np.testing.assert_allclose(clf.feature_importance, np.array([0.0] * X.shape[1]))
def __init__(self, component_dict=None, random_state=0):
    """ Initializes a component graph for a pipeline as a directed acyclic graph (DAG).

    Arguments:
        component_dict (dict): Maps each component name to a list whose first
            item identifies the component class. None is treated as an empty dict.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

    Raises:
        ValueError: If any value in component_dict is not a list.

    Example:
        >>> component_dict = {'imputer': ['Imputer'], 'ohe': ['One Hot Encoder', 'imputer.x'], 'estimator_1': ['Random Forest Classifier', 'ohe.x'], 'estimator_2': ['Decision Tree Classifier', 'ohe.x'], 'final': ['Logistic Regression Classifier', 'estimator_1', 'estimator_2']}
        >>> component_graph = ComponentGraph(component_dict)
    """
    self.random_state = get_random_state(random_state)
    self.component_dict = component_dict or {}
    self.component_instances = {}
    self._is_instantiated = False

    # Until the graph is instantiated, each entry holds the resolved
    # component class rather than an instance.
    for name, info in self.component_dict.items():
        if not isinstance(info, list):
            raise ValueError('All component information should be passed in as a list')
        self.component_instances[name] = handle_component_class(info[0])

    self.compute_order = self.generate_order(self.component_dict)
    self.input_feature_names = {}
def test_baseline_multi_random_weighted(X_y_multi):
    """Check the multiclass baseline pipeline with the "random_weighted" strategy.

    Predictions must match a seed-0 weighted draw over the observed classes,
    predicted probabilities must equal the class frequencies on every row,
    and the second column of the feature importance frame must be all zeros.
    """
    X, y = X_y_multi
    values, counts = np.unique(y, return_counts=True)
    percent_freq = counts.astype(float) / len(y)
    # The ratios are floats (thirds are not exactly representable), so their
    # sum is compared with a tolerance rather than with exact equality.
    assert np.isclose(percent_freq.sum(), 1.0)

    parameters = {"Baseline Classifier": {"strategy": "random_weighted"}}
    clf = BaselineMulticlassPipeline(parameters=parameters)
    clf.fit(X, y)

    predicted_proba = clf.predict_proba(X)
    np.testing.assert_allclose(
        clf.predict(X),
        get_random_state(0).choice(np.unique(y), len(X), p=percent_freq),
    )

    assert predicted_proba.shape == (len(X), 3)
    # One row of class frequencies, repeated once per sample.
    np.testing.assert_allclose(
        predicted_proba,
        np.array([[percent_freq[i] for i in range(len(values))]] * len(X)),
    )

    np.testing.assert_allclose(clf.feature_importance.iloc[:, 1], np.array([0.0] * X.shape[1]))
def test_baseline_multi_random(X_y_multi):
    """Check the multiclass baseline pipeline with the "random" strategy.

    Predictions must match a seed-0 uniform draw over the observed classes,
    every predicted probability must be 1/3, and the second column of the
    feature importance frame must be all zeros.
    """
    X, y = X_y_multi
    values = np.unique(y)
    n_rows = len(X)

    clf = BaselineMulticlassPipeline(
        parameters={"Baseline Classifier": {"strategy": "random"}}
    )
    clf.fit(X, y)

    reference_draw = get_random_state(0).choice(np.unique(y), n_rows)
    expected_predictions = pd.Series(reference_draw, dtype="Int64")
    assert_series_equal(expected_predictions, clf.predict(X).to_series())

    predicted_proba = clf.predict_proba(X)
    assert predicted_proba.shape == (n_rows, 3)
    # One row of uniform probabilities, repeated once per sample.
    uniform_row = [1. / 3] * len(values)
    expected_predictions_proba = pd.DataFrame(np.array([uniform_row] * n_rows))
    assert_frame_equal(expected_predictions_proba, predicted_proba.to_dataframe())

    np.testing.assert_allclose(clf.feature_importance.iloc[:, 1], np.zeros(X.shape[1]))
def __init__(self, parameters=None, component_obj=None, random_state=0, **kwargs):
    """Store the component's parameters, wrapped object and random state.

    Arguments:
        parameters (dict): Parameters for this component. A falsy value
            (including None) is stored as an empty dict.
        component_obj: The object this component wraps; stored as given.
        random_state (int, np.random.RandomState): The random seed/state,
            normalised through ``get_random_state``. Defaults to 0.
        **kwargs: Accepted but not used by this constructor.
    """
    self.random_state = get_random_state(random_state)
    self._parameters = parameters if parameters else {}
    self._component_obj = component_obj
    # A freshly constructed component has not been fitted yet.
    self._is_fitted = False