def test_component_graph_dataset_with_target_imputer():
    """Fit and predict through a graph whose root component is a TargetImputer.

    The target series ends with a NaN and every downstream component takes its
    ``y`` from 'Target Imputer.y'. The test checks the recorded parents of the
    first four components and that the final predictions contain no nulls.
    """
    features = pd.DataFrame({
        'column_1': ['a', 'b', 'c', 'd', 'a', 'a', 'b', 'c', 'b'],
        'column_2': [1, 2, 3, 4, 5, 6, 5, 4, 3]
    })
    target = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, np.nan])

    graph = {
        'Target Imputer': [TargetImputer],
        'OneHot': [OneHotEncoder, 'Target Imputer.x', 'Target Imputer.y'],
        'Random Forest': [RandomForestClassifier, 'OneHot.x', 'Target Imputer.y'],
        'Elastic Net': [ElasticNetClassifier, 'OneHot.x', 'Target Imputer.y'],
        'Logistic Regression': [
            LogisticRegressionClassifier, 'Random Forest', 'Elastic Net',
            'Target Imputer.y'
        ],
    }
    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})

    # Parents are reported exactly as the edge strings given in the graph definition.
    expected_parents = {
        'Target Imputer': [],
        'OneHot': ['Target Imputer.x', 'Target Imputer.y'],
        'Random Forest': ['OneHot.x', 'Target Imputer.y'],
        'Elastic Net': ['OneHot.x', 'Target Imputer.y'],
    }
    for component_name, parents in expected_parents.items():
        assert component_graph.get_parents(component_name) == parents

    component_graph.fit(features, target)
    predicted = component_graph.predict(features).to_series()
    assert not pd.isnull(predicted).any()
Esempio n. 2
0
def test_component_graph_dataset_with_different_types():
    """Check the feature names each component receives when fitting on mixed-type data."""
    # Checks that types are converted correctly by Woodwork. Specifically, the standard scaler
    # should convert column_3 to float, so our code to try to convert back to the original boolean type
    # will catch the TypeError thrown and not convert the column.
    graph = {
        'Imputer': [Imputer],
        'OneHot': [OneHotEncoder, 'Imputer.x'],
        'DateTime': [DateTimeFeaturizer, 'OneHot.x'],
        'Scaler': [StandardScaler, 'DateTime.x'],
        'Random Forest': [RandomForestClassifier, 'Scaler.x'],
        'Elastic Net': [ElasticNetClassifier, 'Scaler.x'],
        'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net'],
    }

    raw_data = pd.DataFrame({
        'column_1': ['a', 'b', 'c', 'd', 'a', 'a', 'b', 'c', 'b'],
        'column_2': [1, 2, 3, 4, 5, 6, 5, 4, 3],
        'column_3': [True, False, True, False, True, False, True, False, False],
    })
    y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
    # column_2 is explicitly typed as categorical, so the expected names below
    # include one-hot columns for it as well.
    X = infer_feature_types(raw_data, {"column_2": "categorical"})

    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})
    # Nothing is recorded until the graph has been fitted.
    assert component_graph.input_feature_names == {}
    component_graph.fit(X, y)

    raw_columns = ['column_1', 'column_2', 'column_3']
    encoded_columns = [
        'column_3',
        'column_1_a', 'column_1_b', 'column_1_c', 'column_1_d',
        'column_2_1', 'column_2_2', 'column_2_3', 'column_2_4', 'column_2_5', 'column_2_6',
    ]
    expected_inputs = {
        'Imputer': raw_columns,
        'OneHot': raw_columns,
        'DateTime': encoded_columns,
        'Scaler': encoded_columns,
        'Random Forest': encoded_columns,
        'Elastic Net': encoded_columns,
        'Logistic Regression': ['Random Forest', 'Elastic Net'],
    }
    fitted_names = component_graph.input_feature_names
    for component_name, expected_columns in expected_inputs.items():
        assert fitted_names[component_name] == expected_columns
def test_get_last_component(example_graph):
    """Exercise get_last_component on empty, full, single-node, chained and edgeless graphs."""
    edgeless_error = 'Cannot get last component from edgeless graph'

    # A graph with no components cannot report a last component.
    empty_graph = ComponentGraph()
    with pytest.raises(ValueError, match=edgeless_error):
        empty_graph.get_last_component()

    # Before instantiation the class is returned; afterwards an equal instance.
    full_graph = ComponentGraph(example_graph)
    assert full_graph.get_last_component() == LogisticRegressionClassifier
    full_graph.instantiate({})
    assert full_graph.get_last_component() == LogisticRegressionClassifier()

    single_node_graph = ComponentGraph({'Imputer': [Imputer]})
    assert single_node_graph.get_last_component() == Imputer

    chained_graph = ComponentGraph({
        'Imputer': [Imputer],
        'OneHot': [OneHotEncoder, 'Imputer'],
    })
    assert chained_graph.get_last_component() == OneHotEncoder

    # Two components with no edge between them are rejected as well.
    disconnected_graph = ComponentGraph({
        'Imputer': [Imputer],
        'OneHot': [OneHotEncoder],
    })
    with pytest.raises(ValueError, match=edgeless_error):
        disconnected_graph.get_last_component()
def test_get_component(example_graph):
    """Check get_component before and after instantiation, and for an unknown name."""
    component_graph = ComponentGraph(example_graph)

    # Un-instantiated graphs hand back the component classes themselves.
    one_hot = component_graph.get_component('OneHot_ElasticNet')
    assert one_hot == OneHotEncoder
    final_estimator = component_graph.get_component('Logistic Regression')
    assert final_estimator == LogisticRegressionClassifier

    with pytest.raises(ValueError, match='not in the graph'):
        component_graph.get_component('Fake Component')

    parameters = {
        'OneHot_RandomForest': {'top_n': 3},
        'Random Forest': {'max_depth': 4, 'n_estimators': 50},
    }
    component_graph.instantiate(parameters)

    # After instantiation each lookup compares equal to an instance built
    # with the same parameters.
    default_one_hot = component_graph.get_component('OneHot_ElasticNet')
    assert default_one_hot == OneHotEncoder()
    tuned_one_hot = component_graph.get_component('OneHot_RandomForest')
    assert tuned_one_hot == OneHotEncoder(top_n=3)
    tuned_forest = component_graph.get_component('Random Forest')
    assert tuned_forest == RandomForestClassifier(n_estimators=50, max_depth=4)
def test_predict_repeat_estimator(mock_predict, mock_fit, X_y_binary):
    """Check that the same estimator class can appear twice in one graph.

    'Logistic Regression' and 'Final Estimator' both use
    LogisticRegressionClassifier; after fitting, their underlying
    ``_component_obj`` values must not compare equal. The expected call counts
    refer to the injected mocks (their patch targets are not visible in this
    function).
    """
    X, y = X_y_binary
    mock_predict.return_value = ww.DataColumn(pd.Series(y))

    graph = {
        'Imputer': [Imputer],
        'OneHot_RandomForest': [OneHotEncoder, 'Imputer.x'],
        'OneHot_Logistic': [OneHotEncoder, 'Imputer.x'],
        'Random Forest': [RandomForestClassifier, 'OneHot_RandomForest.x'],
        'Logistic Regression': [LogisticRegressionClassifier, 'OneHot_Logistic.x'],
        'Final Estimator': [LogisticRegressionClassifier, 'Random Forest', 'Logistic Regression'],
    }
    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})
    component_graph.fit(X, y)

    intermediate_obj = component_graph.get_component('Logistic Regression')._component_obj
    final_obj = component_graph.get_component('Final Estimator')._component_obj
    assert not intermediate_obj == final_obj

    component_graph.predict(X)
    assert mock_predict.call_count == 5
    assert mock_fit.call_count == 3
Esempio n. 6
0
def test_bad_instantiate_can_reinstantiate(example_graph):
    """A failed instantiate call must not prevent a later, valid instantiate call."""
    component_graph = ComponentGraph(example_graph)

    # 'fake_param' is not accepted by the component, so instantiation fails.
    invalid_parameters = {'Elastic Net': {'max_iter': 100, 'fake_param': None}}
    with pytest.raises(ValueError, match='Error received when instantiating component'):
        component_graph.instantiate(parameters=invalid_parameters)

    valid_parameters = {'Elastic Net': {'max_iter': 22}}
    component_graph.instantiate(valid_parameters)
    elastic_net = component_graph.get_component('Elastic Net')
    assert elastic_net.parameters['max_iter'] == 22
def test_component_graph_sampler():
    """Check recorded parents in a graph where an Undersampler supplies both x and y."""
    graph = {
        'Imputer': [Imputer],
        'OneHot': [OneHotEncoder, 'Imputer.x'],
        'Undersampler': [Undersampler, 'OneHot.x'],
        'Random Forest': [RandomForestClassifier, 'Undersampler.x', 'Undersampler.y'],
        'Elastic Net': [ElasticNetClassifier, 'Undersampler.x', 'Undersampler.y'],
        'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net'],
    }
    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})

    # Each component's parents are the edge strings from its graph entry.
    expected_parents = {
        'Imputer': [],
        'OneHot': ['Imputer.x'],
        'Undersampler': ['OneHot.x'],
        'Random Forest': ['Undersampler.x', 'Undersampler.y'],
        'Elastic Net': ['Undersampler.x', 'Undersampler.y'],
        'Logistic Regression': ['Random Forest', 'Elastic Net'],
    }
    for component_name, parents in expected_parents.items():
        assert component_graph.get_parents(component_name) == parents
def test_input_feature_names(example_graph):
    """Check input_feature_names is empty before fit and per-component after fit.

    The two one-hot encoders are given different ``top_n`` values, so the
    estimators downstream of them are expected to see a different number of
    encoded 'column_1' columns.
    """
    features = pd.DataFrame({
        'column_1': ['a', 'b', 'c', 'd', 'a', 'a', 'b', 'c', 'b'],
        'column_2': [1, 2, 3, 4, 5, 6, 5, 4, 3]
    })
    target = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])

    component_graph = ComponentGraph(example_graph)
    encoder_parameters = {
        'OneHot_RandomForest': {'top_n': 2},
        'OneHot_ElasticNet': {'top_n': 3},
    }
    component_graph.instantiate(encoder_parameters)
    assert component_graph.input_feature_names == {}

    component_graph.fit(features, target)

    raw_columns = ['column_1', 'column_2']
    expected_inputs = {
        'Imputer': raw_columns,
        'OneHot_RandomForest': raw_columns,
        'OneHot_ElasticNet': raw_columns,
        'Random Forest': ['column_2', 'column_1_a', 'column_1_b'],
        'Elastic Net': ['column_2', 'column_1_a', 'column_1_b', 'column_1_c'],
        'Logistic Regression': ['Random Forest', 'Elastic Net'],
    }
    fitted_names = component_graph.input_feature_names
    for component_name, expected_columns in expected_inputs.items():
        assert fitted_names[component_name] == expected_columns
def test_predict_empty_graph(X_y_binary):
    """An empty graph's predict output, as a dataframe, must equal the input features."""
    raw_features, target = X_y_binary
    features = pd.DataFrame(raw_features)

    empty_graph = ComponentGraph()
    empty_graph.instantiate({})
    empty_graph.fit(features, target)

    output = empty_graph.predict(features)
    assert_frame_equal(features, output.to_dataframe())
Esempio n. 10
0
def test_instantiate_from_list():
    """Build a graph from a list of component names and instantiate it with parameters."""
    component_names = ['Imputer', 'One Hot Encoder', 'Random Forest Classifier']
    component_graph = ComponentGraph().from_list(component_names)
    component_graph.instantiate({'One Hot Encoder': {'top_n': 7}})

    # Names resolve to instances of the matching component classes.
    imputer = component_graph.get_component('Imputer')
    assert isinstance(imputer, Imputer)
    forest = component_graph.get_component('Random Forest Classifier')
    assert isinstance(forest, RandomForestClassifier)

    # The supplied parameter reaches the encoder it was keyed to.
    encoder = component_graph.get_component('One Hot Encoder')
    assert encoder.parameters['top_n'] == 7
Esempio n. 11
0
def test_multiple_y_parents(mock_fit_transform, X_y_binary):
    """Fitting must fail when a single component lists two ``.y`` parents."""
    X, y = X_y_binary
    graph = {
        'Imputer': [Imputer],
        'OHE': [OneHotEncoder, 'Imputer.x'],
        # Both 'Imputer.y' and 'OHE.y' feed the estimator's target.
        'Estimator': [RandomForestClassifier, 'Imputer.y', 'OHE.y'],
    }
    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})

    # The mocked fit_transform yields an (X, y) pair.
    mock_fit_transform.return_value = (pd.DataFrame(X), pd.Series(y))
    expected_error = 'Cannot have multiple `y` parents for a single component'
    with pytest.raises(ValueError, match=expected_error):
        component_graph.fit(X, y)
Esempio n. 12
0
def test_instantiate_without_parameters(example_graph):
    """Instantiating with {} must give default parameters, distinct instances and a fixed order."""
    component_graph = ComponentGraph(example_graph)
    component_graph.instantiate({})

    forest_encoder = component_graph.get_component('OneHot_RandomForest')
    elastic_encoder = component_graph.get_component('OneHot_ElasticNet')
    assert forest_encoder.parameters['top_n'] == 10
    assert elastic_encoder.parameters['top_n'] == 10
    # Same class, but each graph node gets its own instance.
    assert forest_encoder is not elastic_encoder

    assert component_graph.compute_order == [
        'Imputer',
        'OneHot_ElasticNet',
        'Elastic Net',
        'OneHot_RandomForest',
        'Random Forest',
        'Logistic Regression',
    ]
Esempio n. 13
0
def test_get_estimators(example_graph):
    """get_estimators must fail before instantiation and list estimator instances after."""
    component_graph = ComponentGraph(example_graph)
    with pytest.raises(ValueError, match='Cannot get estimators until'):
        component_graph.get_estimators()

    component_graph.instantiate({})
    expected_estimators = [
        RandomForestClassifier(),
        ElasticNetClassifier(),
        LogisticRegressionClassifier(),
    ]
    assert component_graph.get_estimators() == expected_estimators

    # A graph made only of transformers has no estimators to report.
    transformer_graph = ComponentGraph.from_list(['Imputer', 'One Hot Encoder'])
    transformer_graph.instantiate({})
    assert transformer_graph.get_estimators() == []
Esempio n. 14
0
def test_iteration(example_graph):
    """Iterating a graph yields classes before instantiation and equal instances after."""
    component_graph = ComponentGraph(example_graph)

    expected_classes = [
        Imputer,
        OneHotEncoder,
        ElasticNetClassifier,
        OneHotEncoder,
        RandomForestClassifier,
        LogisticRegressionClassifier,
    ]
    assert list(component_graph) == expected_classes

    component_graph.instantiate({'OneHot_RandomForest': {'top_n': 32}})
    expected_instances = [
        Imputer(),
        OneHotEncoder(),
        ElasticNetClassifier(),
        OneHotEncoder(top_n=32),
        RandomForestClassifier(),
        LogisticRegressionClassifier(),
    ]
    assert list(component_graph) == expected_instances
Esempio n. 15
0
def test_fit_features_nonlinear(mock_predict, mock_fit, mock_fit_transform, example_graph, X_y_binary):
    """Check how often the mocked fit_transform/fit/predict run during fit_features.

    The patch targets of the three mocks are not visible in this function; only
    their call counts after ``fit_features`` on the example graph are asserted.
    """
    X, y = X_y_binary
    component_graph = ComponentGraph(example_graph)
    component_graph.instantiate({})

    # Transformed output: an all-ones table with the same shape as X.
    ones_like_X = np.ones(pd.DataFrame(X).shape)
    mock_fit_transform.return_value = ww.DataTable(ones_like_X)
    mock_fit.return_value = Estimator
    mock_predict.return_value = ww.DataColumn(pd.Series(y))

    component_graph.fit_features(X, y)

    observed_calls = (mock_fit_transform.call_count, mock_fit.call_count, mock_predict.call_count)
    assert observed_calls[0] == 3
    assert observed_calls[1] == 2
    assert observed_calls[2] == 2
Esempio n. 16
0
def test_compute_final_component_features_linear(mock_ohe, mock_imputer, X_y_binary):
    """compute_final_component_features on a linear graph returns the last transformer's output.

    Both mocks are expected to have been called twice by the end: the test runs
    ``fit`` once and ``compute_final_component_features`` once.
    """
    raw_X, y = X_y_binary
    features = pd.DataFrame(raw_X)
    expected_features = features.fillna(0)
    mock_imputer.return_value = ww.DataTable(features)
    mock_ohe.return_value = ww.DataTable(expected_features)

    component_names = ['Imputer', 'One Hot Encoder', 'Random Forest Classifier']
    component_graph = ComponentGraph().from_list(component_names)
    component_graph.instantiate({})
    component_graph.fit(features, y)

    final_features = component_graph.compute_final_component_features(features)
    assert_frame_equal(expected_features, final_features.to_dataframe())
    assert mock_imputer.call_count == 2
    assert mock_ohe.call_count == 2
Esempio n. 17
0
def test_computation_input_custom_index(index):
    """Predictions on custom-indexed input must carry a 0..4 RangeIndex and no NaNs."""
    graph = {
        'OneHot': [OneHotEncoder],
        'Random Forest': [RandomForestClassifier, 'OneHot.x'],
        'Elastic Net': [ElasticNetClassifier, 'OneHot.x'],
        'Logistic Regression': [LogisticRegressionClassifier, 'Random Forest', 'Elastic Net'],
    }

    columns = {
        "categories": [f"cat_{i}" for i in range(5)],
        "numbers": np.arange(5),
    }
    # The features use the supplied index; the target keeps pandas' default one.
    X = pd.DataFrame(columns, index=index)
    y = pd.Series([1, 2, 1, 2, 1])

    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})
    component_graph.fit(X, y)

    predicted = component_graph.predict(X).to_series()
    expected_index = pd.RangeIndex(start=0, stop=5, step=1)
    assert_index_equal(predicted.index, expected_index)
    assert not predicted.isna().any(axis=None)
Esempio n. 18
0
def test_component_graph_evaluation_plumbing(mock_transa, mock_transb, mock_transc, mock_preda, mock_predb, mock_predc, dummy_components):
    """Check which columns each component receives when predicting through a mixed graph.

    Every transformer and estimator output is mocked, so the frames captured
    from each mock's last call show how parent outputs are combined: parents'
    columns are expected in the order the parents are listed, with estimator
    predictions appearing as a column named after the estimator.
    """
    TransformerA, TransformerB, TransformerC, EstimatorA, EstimatorB, EstimatorC = dummy_components

    def as_int64(values):
        # Integer columns are expected with the nullable "Int64" dtype.
        return pd.Series(values, dtype="Int64")

    trans_flags = [1, 0, 0, 0, 0, 0]
    preds_a = [0, 0, 0, 1, 0, 0]
    preds_b = [0, 0, 0, 0, 1, 0]
    preds_c = [0, 0, 0, 0, 0, 1]

    mock_transa.return_value = ww.DataTable(pd.DataFrame({'feature trans': trans_flags, 'feature a': np.ones(6)}))
    mock_transb.return_value = ww.DataTable(pd.DataFrame({'feature b': np.ones(6) * 2}))
    mock_transc.return_value = ww.DataTable(pd.DataFrame({'feature c': np.ones(6) * 3}))
    mock_preda.return_value = ww.DataColumn(pd.Series(preds_a))
    mock_predb.return_value = ww.DataColumn(pd.Series(preds_b))
    mock_predc.return_value = ww.DataColumn(pd.Series(preds_c))

    graph = {
        'transformer a': [TransformerA],
        'transformer b': [TransformerB, 'transformer a'],
        'transformer c': [TransformerC, 'transformer a', 'transformer b'],
        'estimator a': [EstimatorA],
        'estimator b': [EstimatorB, 'transformer a'],
        'estimator c': [EstimatorC, 'transformer a', 'estimator a', 'transformer b', 'estimator b', 'transformer c']
    }
    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})
    X = pd.DataFrame({'feature1': np.zeros(6), 'feature2': np.zeros(6)})
    y = pd.Series(np.zeros(6))
    component_graph.fit(X, y)
    predict_out = component_graph.predict(X)

    def captured_input(mock):
        # First positional argument of the mock's most recent call, as a dataframe.
        return mock.call_args[0][0].to_dataframe()

    # Output of 'transformer a' alone.
    after_trans_a = pd.DataFrame(
        {'feature trans': as_int64(trans_flags), 'feature a': np.ones(6)},
        columns=['feature trans', 'feature a'])
    # Outputs of 'transformer a' followed by 'transformer b'.
    after_trans_a_b = pd.DataFrame(
        {'feature trans': as_int64(trans_flags),
         'feature a': np.ones(6),
         'feature b': np.ones(6) * 2},
        columns=['feature trans', 'feature a', 'feature b'])
    # All five parents of 'estimator c', in listed order.
    final_estimator_input = pd.DataFrame(
        {'feature trans': as_int64(trans_flags),
         'feature a': np.ones(6),
         'estimator a': as_int64(preds_a),
         'feature b': np.ones(6) * 2,
         'estimator b': as_int64(preds_b),
         'feature c': np.ones(6) * 3},
        columns=['feature trans', 'feature a', 'estimator a', 'feature b', 'estimator b', 'feature c'])

    # Root components receive the raw input.
    assert_frame_equal(captured_input(mock_transa), X)
    assert_frame_equal(captured_input(mock_transb), after_trans_a)
    assert_frame_equal(captured_input(mock_transc), after_trans_a_b)
    assert_frame_equal(captured_input(mock_preda), X)
    assert_frame_equal(captured_input(mock_predb), after_trans_a)
    assert_frame_equal(captured_input(mock_predc), final_estimator_input)
    assert_series_equal(as_int64(preds_c), predict_out.to_series())
Esempio n. 19
0
def test_instantiate_with_parameters(example_graph):
    """instantiate must turn classes into instances and route parameters by component name."""
    component_graph = ComponentGraph(example_graph)

    # Before instantiation the graph does not hold component instances.
    for component_name, component_class in (('Imputer', Imputer), ('Elastic Net', ElasticNetClassifier)):
        assert not isinstance(component_graph.get_component(component_name), component_class)

    component_graph.instantiate({
        'OneHot_RandomForest': {'top_n': 3},
        'OneHot_ElasticNet': {'top_n': 5},
        'Elastic Net': {'max_iter': 100},
    })

    assert component_graph.compute_order == [
        'Imputer',
        'OneHot_ElasticNet',
        'Elastic Net',
        'OneHot_RandomForest',
        'Random Forest',
        'Logistic Regression',
    ]

    expected_types = (
        ('Imputer', Imputer),
        ('Random Forest', RandomForestClassifier),
        ('Logistic Regression', LogisticRegressionClassifier),
    )
    for component_name, component_class in expected_types:
        assert isinstance(component_graph.get_component(component_name), component_class)

    expected_values = (
        ('OneHot_RandomForest', 'top_n', 3),
        ('OneHot_ElasticNet', 'top_n', 5),
        ('Elastic Net', 'max_iter', 100),
    )
    for component_name, parameter_name, value in expected_values:
        assert component_graph.get_component(component_name).parameters[parameter_name] == value
def test_parents(example_graph):
    """get_parents must give the same answers before and after instantiation."""
    component_graph = ComponentGraph(example_graph)
    expected_parents = {
        'Imputer': [],
        'OneHot_RandomForest': ['Imputer.x'],
        'OneHot_ElasticNet': ['Imputer.x'],
        'Random Forest': ['OneHot_RandomForest.x'],
        'Elastic Net': ['OneHot_ElasticNet.x'],
        'Logistic Regression': ['Random Forest', 'Elastic Net'],
    }

    def check_parent_lookups():
        # Known components report their parents; unknown names are rejected.
        for component_name, parents in expected_parents.items():
            assert component_graph.get_parents(component_name) == parents
        with pytest.raises(ValueError, match='not in the graph'):
            component_graph.get_parents('Fake component')

    check_parent_lookups()
    component_graph.instantiate({})
    check_parent_lookups()
Esempio n. 21
0
class PipelineBase(ABC, metaclass=PipelineBaseMeta):
    """Base class for all pipelines."""

    @property
    @classmethod
    @abstractmethod
    def component_graph(cls):
        """Returns list or dictionary of components representing pipeline graph structure.

        Abstract: pipeline classes are required to provide this. `__init__` accepts
        either form. A list is converted with `ComponentGraph().from_list`;
        anything else is passed to `ComponentGraph` as `component_dict`.
        In the dictionary form each value is a sequence whose first element is
        the component (see `linearized_component_graph`).

        Returns:
            list(str / ComponentBase subclass) or dict: List of ComponentBase subclasses or strings, or a dictionary
                keyed by component name, that denotes the graph structure of this pipeline
        """

    custom_hyperparameters = None
    custom_name = None
    problem_type = None

    def __init__(self, parameters, random_state=0):
        """Machine learning pipeline made out of transformers and an estimator.

        Builds a ComponentGraph from the class-level `component_graph`, instantiates
        it with `parameters`, records the final component as `self.estimator` when it
        is an Estimator, and validates the pipeline's problem type against it.

        Required Class Variables:
            component_graph (list or dict): Components of the pipeline. A list accepts strings or ComponentBase
                subclasses; any other value is handed to ComponentGraph as `component_dict`.

        Arguments:
            parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
                 An empty dictionary {} implies using all default values for component parameters.
                 The value under the "pipeline" key, if any, is stored as pipeline-level parameters.
            random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

        Raises:
            ValueError: If the final estimator does not support this pipeline's problem type.
        """
        self.random_state = get_random_state(random_state)

        graph_definition = self.component_graph
        if not isinstance(graph_definition, list):
            graph = ComponentGraph(component_dict=graph_definition, random_state=self.random_state)
        else:  # Backwards compatibility
            graph = ComponentGraph().from_list(graph_definition, random_state=self.random_state)
        self._component_graph = graph
        self._component_graph.instantiate(parameters)

        self.input_feature_names = {}
        self.input_target_name = None

        # Only an Estimator in final position counts as the pipeline's estimator.
        last_component = self._component_graph.get_last_component()
        if isinstance(last_component, Estimator):
            self.estimator = last_component
            self._estimator_name = self._component_graph.compute_order[-1]
        else:
            self.estimator = None
            self._estimator_name = None

        self._validate_estimator_problem_type()
        self._is_fitted = False
        self._pipeline_params = parameters.get("pipeline", {})

    @classproperty
    def name(cls):
        """Returns a name describing the pipeline.

        If the pipeline has a truthy `custom_name` attribute, that is returned. Otherwise a space is
        inserted wherever a lowercase letter is directly followed by an uppercase letter in the class
        name (class name should be in Pascal Case).

        Raises:
            IllFormattedClassNameError: If no `custom_name` is set and the class name contains no such
                lowercase-to-uppercase boundary.
        """
        if cls.custom_name:
            return cls.custom_name

        class_name = cls.__name__
        spaced_name = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', class_name)
        if spaced_name == class_name:
            # Nothing was inserted, so the class name has no word boundary to split on.
            raise IllFormattedClassNameError("Pipeline Class {} needs to follow Pascal Case standards or `custom_name` must be defined.".format(class_name))
        return spaced_name

    @classproperty
    def summary(cls):
        """Returns a short summary of the pipeline structure, describing the list of components used.

        The components come from `linearized_component_graph`. When the last one is an Estimator
        class its name leads the summary; otherwise the summary starts with "Pipeline".

        Example: Logistic Regression Classifier w/ Simple Imputer + One Hot Encoder

        Returns:
            str: "Empty Pipeline" when there are no components, the leading name alone when the
                estimator is the only component, else "<leading name> w/ <component names joined by ' + '>".
        """
        # The comprehension already builds a fresh list, so popping below
        # cannot affect the class-level component graph.
        components = [handle_component_class(component_class) for component_class in cls.linearized_component_graph]
        if not components:
            return "Empty Pipeline"

        summary = "Pipeline"
        last_component = components[-1]
        if inspect.isclass(last_component) and issubclass(last_component, Estimator):
            summary = components.pop(-1).name
        if not components:
            return summary

        component_names = [component_class.name for component_class in components]
        return '{} w/ {}'.format(summary, ' + '.join(component_names))

    @classproperty
    def linearized_component_graph(cls):
        """Returns a component graph in list form. Note: this is not guaranteed to be in proper component computation order.

        A list-valued `component_graph` is returned as-is (the same object, not a copy). Otherwise
        the first element of each value of `component_graph` is collected, in `.values()` order.
        """
        graph = cls.component_graph
        if not isinstance(graph, list):
            return [component_info[0] for component_info in graph.values()]
        return graph

    def _validate_estimator_problem_type(self):
        """Validates this pipeline's problem_type against that of the estimator from `self.component_graph`.

        Raises:
            ValueError: If the estimator's `supported_problem_types` does not contain `self.problem_type`.
        """
        estimator = self.estimator
        if estimator is None:  # Allow for pipelines that do not end with an estimator
            return

        supported_problem_types = estimator.supported_problem_types
        if self.problem_type in supported_problem_types:
            return
        raise ValueError("Problem type {} not valid for this component graph. Valid problem types include {}."
                         .format(self.problem_type, supported_problem_types))

    def __getitem__(self, index):
        """Look up a component by position or by name.

        Arguments:
            index (int or str): An int indexes into `self.component_graph` and the entry found there
                (or its `.name`, when it is not a string) is used as the component name. Any other
                non-slice value is passed straight to `get_component`.

        Returns:
            Component: The matching component from the instantiated graph.

        Raises:
            NotImplementedError: If `index` is a slice.
        """
        if isinstance(index, slice):
            raise NotImplementedError('Slicing pipelines is currently not supported.')
        if not isinstance(index, int):
            return self.get_component(index)

        entry = self.component_graph[index]
        component_name = entry if isinstance(entry, str) else entry.name
        return self.get_component(component_name)

    def __setitem__(self, index, value):
        """Reject item assignment on a pipeline.

        Raises:
            NotImplementedError: Always.
        """
        message = 'Setting pipeline components is not supported.'
        raise NotImplementedError(message)

    def get_component(self, name):
        """Returns component by name, delegating to the pipeline's component graph.

        Arguments:
            name (str): Name of component

        Returns:
            Component: Component to return

        """
        graph = self._component_graph
        return graph.get_component(name)

    def describe(self):
        """Logs pipeline details including component details.

        Logs the pipeline name, problem type and model family. If `input_feature_names` has an
        entry under the final estimator's name, the number of features in that entry is logged too.
        Each component in the graph is then logged as a numbered step and its own
        `describe(print_name=False)` is called.

        Returns:
            None
        """
        log_title(logger, self.name)
        logger.info("Problem Type: {}".format(self.problem_type))
        logger.info("Model Family: {}".format(str(self.model_family)))

        estimator_name = self._estimator_name
        if estimator_name in self.input_feature_names:
            feature_count = len(self.input_feature_names[estimator_name])
            logger.info("Number of features: {}".format(feature_count))

        # Summary of steps
        log_subtitle(logger, "Pipeline Steps")
        for step_number, component in enumerate(self._component_graph, start=1):
            logger.info(str(step_number) + ". " + component.name)
            component.describe(print_name=False)

    def compute_estimator_features(self, X, y=None):
        """Transforms the data by applying all pre-processing components.

        Delegates to the component graph's `compute_final_component_features`.

        Arguments:
            X (pd.DataFrame): Input data to the pipeline to transform.
            y (optional): Target data, forwarded to the component graph as `y`. Defaults to None.

        Returns:
            New transformed features, as returned by the component graph.
        """
        return self._component_graph.compute_final_component_features(X, y=y)

    def _compute_features_during_fit(self, X, y):
        """Run the graph's `fit_features` on (X, y) and return its result.

        Also records `y.name` as the target name and copies the graph's
        per-component input feature names onto the pipeline.
        """
        self.input_target_name = y.name
        graph = self._component_graph
        transformed_features = graph.fit_features(X, y)
        self.input_feature_names = graph.input_feature_names
        return transformed_features

    def _fit(self, X, y):
        """Fit the component graph on (X, y).

        Records `y.name` as the target name before fitting and copies the graph's
        per-component input feature names onto the pipeline afterwards.
        """
        self.input_target_name = y.name
        graph = self._component_graph
        graph.fit(X, y)
        self.input_feature_names = graph.input_feature_names

    @abstractmethod
    def fit(self, X, y):
        """Build a model.

        Abstract: this base class provides no implementation, so concrete
        pipeline classes must define it.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (ww.DataColumn, pd.Series, np.ndarray): The target training data of length [n_samples]

        Returns:
            self

        """

    def predict(self, X, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (ww.DataTable, pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            objective (Object or string): The objective to use to make predictions.
                Not used by this base implementation.

        Returns:
            Predicted values from the component graph, renamed to the stored `input_target_name`.
        """
        X_ww = _convert_to_woodwork_structure(X)
        graph_predictions = self._component_graph.predict(X_ww)
        return graph_predictions.rename(self.input_target_name)

    @abstractmethod
    def score(self, X, y, objectives):
        """Evaluate model performance on current and additional objectives.

        Abstract: this base class provides no implementation, so concrete
        pipeline classes must define it.

        Arguments:
            X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            y (pd.Series, ww.DataColumn, or np.ndarray): True labels of length [n_samples]
            objectives (list): Non-empty list of objectives to score on

        Returns:
            dict: Ordered dictionary of objective scores
        """

    @staticmethod
    def _score(X, y, predictions, objective):
        """Score `predictions` against `y` with a single objective.

        Calls `objective.score(y, predictions, X)` and returns its result.
        """
        objective_score = objective.score(y, predictions, X)
        return objective_score

    def _score_all_objectives(self, X, y, y_pred, y_pred_proba, objectives):
        """Given data, model predictions or predicted probabilities computed on the data, and objectives, evaluate and return the objective scores.

        Every objective is attempted even if an earlier one fails; failures are collected and
        reported together at the end.

        Arguments:
            X (pd.DataFrame): The feature matrix.
            y (pd.Series): The target data.
            y_pred (pd.Series): The pipeline predictions.
            y_pred_proba (pd.Dataframe, pd.Series, None): The predicted probabilities for classification problems.
                Will be a DataFrame for multiclass problems and Series otherwise. Will be None for regression problems.
            objectives (list): List of objectives to score.

        Returns:
            OrderedDict: Objective name mapped to its score, in the order the objectives were given.

        Raises:
            PipelineScoreError: If any objective fails. It is constructed with the mapping of
                failed objective name to (exception, formatted traceback) and the scores that succeeded.
        """
        scores = OrderedDict()
        failures = OrderedDict()
        for objective in objectives:
            try:
                if not objective.is_defined_for_problem_type(self.problem_type):
                    raise ValueError(f'Invalid objective {objective.name} specified for problem type {self.problem_type}')
                # Objectives that need probabilities are scored on y_pred_proba instead of y_pred.
                if objective.score_needs_proba:
                    predictions = y_pred_proba
                else:
                    predictions = y_pred
                score = self._score(X, y, predictions, objective)
                scores[objective.name] = score
            except Exception as e:
                _, _, exc_traceback = sys.exc_info()
                failures[objective.name] = (e, traceback.format_tb(exc_traceback))

        if failures:
            # If any objective failed, throw an PipelineScoreError
            raise PipelineScoreError(failures, scores)
        # No objectives failed, return the scores
        return scores

    @classproperty
    def model_family(cls):
        """Returns model family of this pipeline template.

        The family is taken from the pipeline's final component: the last entry of a list-form
        `component_graph`, or the component named last by `ComponentGraph.generate_order`
        for the dictionary form.
        """
        graph = copy.copy(cls.component_graph)
        if not isinstance(graph, list):
            final_name = ComponentGraph.generate_order(graph)[-1]
            final_component = graph[final_name][0]
        else:
            final_component = graph[-1]
        return handle_component_class(final_component).model_family

    @classproperty
    def hyperparameters(cls):
        """Returns hyperparameter ranges from all components as a dictionary.

        Entries are keyed by the component class's name for list graphs and by the graph
        key otherwise. Ranges found under the same key in ``cls.custom_hyperparameters``
        are merged over a copy of the component's own ``hyperparameter_ranges``.
        """
        graph = copy.copy(cls.component_graph)
        if isinstance(graph, list):
            resolved = (handle_component_class(entry) for entry in graph)
            named_classes = ((klass.name, klass) for klass in resolved)
        else:
            named_classes = ((name, handle_component_class(info[0])) for name, info in graph.items())

        ranges = dict()
        for name, klass in named_classes:
            # Copy so the component class's own ranges are never mutated by the update below
            merged = copy.copy(klass.hyperparameter_ranges)
            custom = cls.custom_hyperparameters
            if custom and name in custom:
                merged.update(custom.get(name, {}))
            ranges[name] = merged
        return ranges

    @property
    def parameters(self):
        """Returns parameter dictionary for this pipeline

        Components whose parameters are empty are left out. When the pipeline has
        pipeline-level parameters, they are included under the 'pipeline' key.

        Returns:
            dict: Dictionary of all component parameters
        """
        instances = self._component_graph.component_instances
        collected = {}
        for name, component in instances.items():
            if component.parameters:
                # Shallow copy so callers cannot replace keys on the component's own dict
                collected[name] = copy.copy(component.parameters)
        if self._pipeline_params:
            collected['pipeline'] = self._pipeline_params
        return collected

    @classproperty
    def default_parameters(cls):
        """Returns the default parameter dictionary for this pipeline.

        Supports both forms of ``component_graph`` handled elsewhere in this class:
        a list of components, or a dict mapping a graph name to a list whose first
        item is the component. Iterating a dict directly would yield its keys (the
        graph names) rather than the components, so dict graphs are unpacked explicitly.

        Returns:
            dict: Dictionary of all component default parameters. Keys are the component
                class names for list graphs and the graph names for dict graphs.
                Components without default parameters are omitted.
        """
        component_graph = cls.component_graph
        if isinstance(component_graph, dict):
            # The component class is the first item of each graph entry
            entries = [(graph_name, info[0]) for graph_name, info in component_graph.items()]
        else:
            entries = [(None, entry) for entry in component_graph]

        defaults = {}
        for graph_name, entry in entries:
            component = handle_component_class(entry)
            if component.default_parameters:
                key = component.name if graph_name is None else graph_name
                defaults[key] = component.default_parameters
        return defaults

    @property
    def feature_importance(self):
        """Return importance associated with each feature, ordered by decreasing absolute importance.

        Feature names are the input feature names recorded for the pipeline's estimator.

        Returns:
            pd.DataFrame including feature names and their corresponding importance
        """
        names = self.input_feature_names[self._estimator_name]
        scores = self.estimator.feature_importance  # note: this only works for binary
        # Negating the absolute value sorts the largest magnitudes first
        ranked = sorted(zip(names, scores), key=lambda pair: -abs(pair[1]))
        return pd.DataFrame(ranked, columns=["feature", "importance"])

    def graph(self, filepath=None):
        """Generate an image representing the pipeline graph

        Arguments:
            filepath (str, optional): Path to where the graph should be saved. If set to None (as by default), the graph will not be saved.

        Returns:
            graphviz.Digraph: Graph object that can be directly displayed in Jupyter notebooks.

        Raises:
            RuntimeError: If no working graphviz backend executable is found.
            ValueError: If filepath has an extension graphviz does not support, or if
                filepath cannot be opened for writing.
        """
        graphviz = import_or_raise('graphviz', error_msg='Please install graphviz to visualize pipelines.')

        # Try rendering a dummy graph to see if a working backend is installed
        try:
            graphviz.Digraph().pipe()
        except graphviz.backend.ExecutableNotFound:
            raise RuntimeError(
                "To graph pipelines, a graphviz backend is required.\n" +
                "Install the backend using one of the following commands:\n" +
                "  Mac OS: brew install graphviz\n" +
                "  Linux (Ubuntu): sudo apt-get install graphviz\n" +
                "  Windows: conda install python-graphviz\n"
            )

        graph_format = None
        path_and_name = None
        if filepath:
            # Explicitly cast to str in case a Path object was passed in
            filepath = str(filepath)
            path_and_name, extension = os.path.splitext(filepath)
            graph_format = extension[1:].lower()  # ignore the dot

            # Validate the format before touching the filesystem, so that an unsupported
            # extension does not leave an empty file behind.
            supported_filetypes = graphviz.backend.FORMATS
            if graph_format not in supported_filetypes:
                raise ValueError(("Unknown format '{}'. Make sure your format is one of the " +
                                  "following: {}").format(graph_format, supported_filetypes))

            # Probe writability. Opening in write mode creates (or truncates) the file.
            try:
                with open(filepath, 'w'):
                    pass
            except OSError as err:
                raise ValueError(('Specified filepath is not writeable: {}'.format(filepath))) from err

        graph = self._component_graph.graph(path_and_name, graph_format)

        if filepath:
            graph.render(path_and_name, cleanup=True)

        return graph

    def graph_feature_importance(self, importance_threshold=0):
        """Generate a bar graph of the pipeline's feature importance

        Arguments:
            importance_threshold (float, optional): If provided, graph only features whose absolute importance is greater than or equal to importance_threshold. Defaults to zero.

        Returns:
            plotly.Figure, a bar graph showing features and their corresponding importance

        Raises:
            ValueError: If importance_threshold is negative.
        """
        # Validate the argument first so an invalid call fails before any plotting
        # dependency is imported or any importance is computed.
        if importance_threshold < 0:
            raise ValueError(f'Provided importance threshold of {importance_threshold} must be greater than or equal to 0')

        go = import_or_raise("plotly.graph_objects", error_msg="Cannot find dependency plotly.graph_objects")
        if jupyter_check():
            import_or_raise("ipywidgets", warning=True)

        feat_imp = self.feature_importance
        feat_imp['importance'] = abs(feat_imp['importance'])

        # Keep features whose absolute importance is at least the importance threshold
        feat_imp = feat_imp[feat_imp['importance'] >= importance_threshold]

        # feature_importance is ordered by descending absolute importance; reverse it to ascending
        # (presumably so the largest bar ends up at the top of the horizontal chart - confirm)
        feat_imp = feat_imp.iloc[::-1]

        title = 'Feature Importance'
        subtitle = 'May display fewer features due to feature selection'
        data = [go.Bar(
            x=feat_imp['importance'],
            y=feat_imp['feature'],
            orientation='h'
        )]

        layout = {
            'title': '{0}<br><sub>{1}</sub>'.format(title, subtitle),
            'height': 800,
            'xaxis_title': 'Feature Importance',
            'yaxis_title': 'Feature',
            'yaxis': {
                'type': 'category'
            }
        }

        fig = go.Figure(data=data, layout=layout)
        return fig

    def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL):
        """Serialize this pipeline with cloudpickle and write it to file_path.

        Arguments:
            file_path (str): location to save file
            pickle_protocol (int): the pickle data stream format.

        Returns:
            None
        """
        # Binary mode: the file receives the pickle byte stream
        with open(file_path, 'wb') as stream:
            cloudpickle.dump(self, stream, protocol=pickle_protocol)

    @staticmethod
    def load(file_path):
        """Read a cloudpickle-serialized pipeline back from file_path.

        NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.

        Arguments:
            file_path (str): location to load file

        Returns:
            PipelineBase object
        """
        with open(file_path, 'rb') as stream:
            loaded = cloudpickle.load(stream)
        return loaded

    def clone(self, random_state=0):
        """Build a new pipeline of the same class from this pipeline's parameters.

        Arguments:
            random_state (int): the value to seed the random state with. Can also be a RandomState instance. Defaults to 0.

        Returns:
            A new instance of this pipeline's class, constructed with this pipeline's
            ``parameters`` and the given random_state
        """
        pipeline_class = type(self)
        current_parameters = self.parameters
        return pipeline_class(current_parameters, random_state=random_state)

    def __eq__(self, other):
        """Compare two pipelines for equality.

        Pipelines are equal when ``other`` is an instance of this pipeline's class,
        their random states are equal according to check_random_state_equality, and
        none of the compared attributes differ.
        """
        if not isinstance(other, self.__class__):
            return False
        if not check_random_state_equality(self.random_state, other.random_state):
            return False
        compared_attributes = ('parameters', '_is_fitted', 'component_graph', 'input_feature_names', 'input_target_name')
        # any() stops at the first differing attribute, just like an early return would
        return not any(getattr(self, name) != getattr(other, name) for name in compared_attributes)

    def __str__(self):
        return self.name

    def __repr__(self):
        """Return a constructor-like representation listing every component's parameters.

        The result has the form ``ClassName(parameters={'Component':{'param': value, ...}, ...})``,
        with each value rendered through safe_repr.
        """

        def format_component_params(params):
            return ', '.join(f"'{name}': {safe_repr(val)}" for name, val in params.items())

        rendered = []
        for component, params in self.parameters.items():
            # Doubled braces emit literal '{' and '}' around each component's parameters
            rendered.append(f"'{component}':{{{format_component_params(params)}}},")
        body = ' '.join(rendered)
        return f'{type(self).__name__}(parameters={{{body}}})'
def test_reinstantiate(example_graph):
    """Instantiating an already-instantiated component graph must raise a ValueError."""
    graph_under_test = ComponentGraph(example_graph)
    graph_under_test.instantiate({})

    new_parameters = {'OneHot': {'top_n': 7}}
    expected_message = 'Cannot reinstantiate a component graph'
    # The second instantiate call is the one expected to be rejected
    with pytest.raises(ValueError, match=expected_message):
        graph_under_test.instantiate(new_parameters)