def test_graphpipeline_no_concat():
    """Check the output of a pipeline whose final node is listed in ``no_concat_nodes``.

    Two pipelines are built with parents "A" and "B" feeding "C":
    * with a ``DebugPassThrough`` as "C", the result must be a dict keyed by
      the parent names, each entry equal to the input ``X``;
    * with a ``TransformToBlockManager`` as "C", the result must be a
      ``BlockManager`` whose "A" and "B" entries both equal ``X``.
    """
    parent_names = ("A", "B")

    # Case 1: the final node is a plain pass-through.
    passthrough_nodes = {name: DebugPassThrough(debug=True) for name in ("A", "B", "C")}
    dict_pipeline = GraphPipeline(
        passthrough_nodes,
        edges=[("A", "C"), ("B", "C")],
        no_concat_nodes={"C"},
    )
    dict_result = dict_pipeline.fit_transform(X)

    assert isinstance(dict_result, dict)
    assert set(dict_result.keys()) == {"A", "B"}
    for parent in parent_names:
        assert (dict_result[parent] == X).all().all()

    # Case 2: the final node converts what it receives into a BlockManager.
    block_nodes = {
        "A": DebugPassThrough(debug=True),
        "B": DebugPassThrough(debug=True),
        "C": TransformToBlockManager(),
    }
    block_pipeline = GraphPipeline(
        block_nodes,
        edges=[("A", "C"), ("B", "C")],
        no_concat_nodes={"C"},
    )
    block_result = block_pipeline.fit_transform(X)

    assert isinstance(block_result, BlockManager)
    for parent in parent_names:
        assert (block_result[parent] == X).all().all()
def get_pipeline():
    """Return a new four-node ``GraphPipeline`` made of prefixing pass-through nodes.

    The nodes are "pt1".."pt4", each a ``DebugPassThrough`` with its own
    column prefix ("PT1_".."PT4_"). The edges are declared as the two chains
    ("pt1", "pt3", "pt4") and ("pt2", "pt3", "pt4"). ``fit`` is not called here.
    """
    models = {
        "pt1": DebugPassThrough(column_prefix="PT1_", debug=True),
        "pt2": DebugPassThrough(column_prefix="PT2_", debug=True),
        "pt3": DebugPassThrough(column_prefix="PT3_", debug=True),
        "pt4": DebugPassThrough(column_prefix="PT4_", debug=True),
    }
    edges = [("pt1", "pt3", "pt4"), ("pt2", "pt3", "pt4")]
    return GraphPipeline(models, edges=edges)
def test_graphpipeline_merging_node():
    """Check a node fed by two column selectors, for both edge declaration orders.

    The merging node "Pt" must see the selected columns in the order in which
    the edges were declared, and the pipeline's feature-name helpers must
    report the same order.
    """
    num_cols = ["num1", "num2", "num3"]
    cat_cols = ["cat1", "cat2"]
    input_cols = list(dfX.columns)

    # --- numerical selector declared first -------------------------------
    num_then_cat = num_cols + cat_cols
    gpipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "Pt": DebugPassThrough(debug=True),
        },
        edges=[("ColNum", "Pt"), ("ColCat", "Pt")],
    )
    gpipeline.fit(dfX, y)

    merging_node = gpipeline.models["Pt"]
    assert merging_node._expected_columns == num_then_cat
    assert merging_node._expected_type == DataTypes.DataFrame
    assert merging_node._expected_nbcols == 5

    dfX_transformed = gpipeline.transform(dfX)
    assert (dfX_transformed == dfX.loc[:, num_then_cat]).all().all()

    assert gpipeline.get_feature_names() == num_then_cat
    assert gpipeline.get_feature_names_at_node("Pt") == num_then_cat
    assert gpipeline.get_feature_names_at_node("ColNum") == num_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == cat_cols

    assert gpipeline.get_input_features_at_node("ColNum") == input_cols
    assert gpipeline.get_input_features_at_node("ColCat") == input_cols
    assert gpipeline.get_input_features_at_node("Pt") == num_then_cat

    # --- concatenation in the other order: categorical selector first ----
    cat_then_num = cat_cols + num_cols
    gpipeline = GraphPipeline(
        {
            "ColNum": ColumnsSelector(columns_to_use=["num1", "num2", "num3"]),
            "ColCat": ColumnsSelector(columns_to_use=["cat1", "cat2"]),
            "Pt": DebugPassThrough(debug=True),
        },
        edges=[("ColCat", "Pt"), ("ColNum", "Pt")],
    )
    gpipeline.fit(dfX, y)

    merging_node = gpipeline.models["Pt"]
    # Concatenation follows the order of the edges.
    assert merging_node._expected_columns == cat_then_num
    assert merging_node._expected_type == DataTypes.DataFrame
    assert merging_node._expected_nbcols == 5

    assert gpipeline.get_feature_names() == cat_then_num
    assert gpipeline.get_feature_names_at_node("Pt") == cat_then_num
    assert gpipeline.get_feature_names_at_node("ColNum") == num_cols
    assert gpipeline.get_feature_names_at_node("ColCat") == cat_cols

    assert gpipeline.get_input_features_at_node("ColNum") == input_cols
    assert gpipeline.get_input_features_at_node("ColCat") == input_cols
    assert gpipeline.get_input_features_at_node("Pt") == cat_then_num

    dfX_transformed = gpipeline.transform(dfX)
    assert (dfX_transformed == dfX.loc[:, cat_then_num]).all().all()
def test_approx_cross_validation_transformer(x_data_type, shuffle, graph_pipeline, with_groups):
    """Cross-validate a pass-through transformer and check the out-of-fold output equals the input.

    Parameters (supplied by the test parametrization):
        x_data_type: container type ``X`` is converted to via ``convert_generic``.
        shuffle: if truthy, rows of ``X`` and ``y`` are permuted with a fixed seed.
        graph_pipeline: if truthy, the estimator is a two-node ``GraphPipeline``
            of ``DebugPassThrough``; otherwise a bare ``DebugPassThrough``.
        with_groups: if truthy, four groups of 25 rows are passed to ``cross_validation``.
    """
    if graph_pipeline:
        estimator = GraphPipeline({"ptA": DebugPassThrough(), "ptB": DebugPassThrough()}, edges=[("ptA", "ptB")])
    else:
        estimator = DebugPassThrough()

    X, y = make_classification(n_samples=100, random_state=123)

    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    scoring = ["accuracy", "neg_log_loss"]

    # Both branches below must use the same tight tolerance. The previous
    # DataFrame branch used ``10 ** (10 - 10)`` (== 1), which let almost any
    # difference through.
    tolerance = 10 ** (-10)

    ##################
    ### Score only ###
    ##################
    # Shouldn't work since DebugPassThrough can't be scored.
    with pytest.raises(Exception):
        cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)

    #################
    ### Transform ###
    #################
    cv_res, Xhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    assert type(Xhat) == type(X)
    assert cv_res is None
    assert Xhat.shape == X.shape

    if isinstance(X, pd.DataFrame):
        assert (Xhat.index == X.index).all()
        assert (Xhat.columns == X.columns).all()
        assert np.abs(Xhat - X).max().max() <= tolerance
    else:
        assert np.max(np.abs(Xhat - X)) <= tolerance
def test_GraphPipeline_add_nodes():
    """Check ``GraphPipeline.add_nodes``.

    Adding a new "pt0" node in front of "pt1" must return a ``GraphPipeline``
    whose first output column carries the extra "PT0_" prefix, while the
    pipeline it was derived from still produces its original column name.
    Adding a node under the existing name "pt1" must raise ``ValueError``.
    """
    pipeline = get_pipeline()

    extra_node = {"pt0": DebugPassThrough(column_prefix="PT0_", debug=True)}
    extended = pipeline.add_nodes(extra_node, [("pt0", "pt1")])
    assert isinstance(extended, GraphPipeline)

    extended_output = extended.fit_transform(dfX, y)
    assert extended_output.columns[0] == "PT4__PT3__PT1__PT0__text1"

    # The pipeline we started from still yields the un-extended prefix chain.
    original_output = pipeline.fit_transform(dfX)
    assert original_output.columns[0] == "PT4__PT3__PT1__text1"

    duplicate_node = {"pt1": DebugPassThrough(column_prefix="newPT1_", debug=True)}
    with pytest.raises(ValueError):
        pipeline.add_nodes(duplicate_node, new_edges=[("pt0", "pt1")])
def test_graphpipeline_fit_params():
    """Check that ``<node>__<name>`` fit parameters reach only the named node.

    After a plain ``fit`` every node must have recorded empty ``fit_params``;
    after ``fit(..., A__fitparam_A="paramA")`` only node "A" must have
    recorded ``{"fitparam_A": "paramA"}``.
    """
    nodes = {
        "A": DebugPassThrough(debug=True),
        "B": DebugPassThrough(debug=True),
        "C": DebugPassThrough(debug=True),
    }
    gpipeline = GraphPipeline(nodes, edges=[("A", "B", "C")])

    # No fit parameter at all.
    gpipeline.fit(X, y)
    for node_name in ("A", "B", "C"):
        assert gpipeline.models[node_name].fit_params == {}

    # One fit parameter addressed to node "A".
    gpipeline.fit(X, y, A__fitparam_A="paramA")
    expected_by_node = (
        ("A", {"fitparam_A": "paramA"}),
        ("B", {}),
        ("C", {}),
    )
    for node_name, expected in expected_by_node:
        assert gpipeline.models[node_name].fit_params == expected
def test_approx_cross_validation_cv(approximate_cv):
    """Run ``cross_validation`` in "transform" mode with an explicit CV splitter.

    With ``no_scoring=True`` the score table must be ``None`` and the
    out-of-fold output must be 2-D with the same shape as ``X``.
    ``approximate_cv`` is forwarded unchanged to ``cross_validation``.
    """
    X, y = make_classification()
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)
    passthrough = DebugPassThrough()

    options = dict(
        groups=None,
        cv=folds,
        verbose=1,
        fit_params={},
        return_predict=True,
        method="transform",
        no_scoring=True,
        stopping_round=None,
        stopping_threshold=None,
        approximate_cv=approximate_cv,
    )
    cv_res, yhat = cross_validation(passthrough, X, y, **options)

    assert cv_res is None
    assert yhat.ndim == 2
    assert yhat.shape == X.shape
def test_graphpipeline_set_params():
    """Check that ``set_params(C__debug=...)`` updates the ``debug`` attribute of node "C"."""
    nodes = {
        "A": PassThrough(),
        "B": PassThrough(),
        "C": DebugPassThrough(debug=True),
    }
    chain = [("A", "B", "C")]
    gpipeline = GraphPipeline(nodes, edges=chain)

    # Value given at construction time.
    assert gpipeline.models["C"].debug is True

    gpipeline.set_params(C__debug=False)

    # Value after the nested-parameter update.
    assert gpipeline.models["C"].debug is False
def test_graphpipeline_passing_of_groups():
    """Check that ``groups`` given to ``GraphPipeline.fit`` allows a groups-requiring node to fit.

    Fitting without groups must raise ``ValueError``; fitting with a groups
    array must complete without raising.
    """
    nodes = {"A": TransformerFailNoGroups(), "B": DebugPassThrough(debug=True)}
    gpipeline = GraphPipeline(nodes, edges=[("A", "B")])

    # No groups supplied: the fit is expected to be rejected.
    with pytest.raises(ValueError):
        gpipeline.fit(X, y)

    # Groups supplied: the only check is that this call does not fail.
    all_zero_groups = np.zeros(len(y))
    gpipeline.fit(X, y, all_zero_groups)
def test_GraphPipeline_substitute_nodes():
    """Check ``GraphPipeline.substitute_nodes``.

    Replacing "pt1" must return a ``GraphPipeline`` whose output uses the new
    prefix. The pipeline it was derived from must still be unfitted afterwards
    and keep its own prefix once fitted. Substituting an unknown node name, or
    substituting with something that is not a model, must raise ``ValueError``.
    """
    pipeline = get_pipeline()

    replacement = {"pt1": DebugPassThrough(column_prefix="newPT1_", debug=True)}
    substituted = pipeline.substitute_nodes(replacement)
    assert isinstance(substituted, GraphPipeline)

    substituted.fit(dfX, y)
    substituted_output = substituted.transform(dfX)
    assert substituted_output.columns[0] == "PT4__PT3__newPT1__text1"

    # Fitting the substituted pipeline must not have fitted the original one.
    with pytest.raises(NotFittedError):
        pipeline.transform(dfX)

    original_output = pipeline.fit_transform(dfX)
    assert original_output.columns[0] == "PT4__PT3__PT1__text1"

    # Invalid substitutions.
    unknown_node = {"doesntexist": DebugPassThrough(column_prefix="newPT1_", debug=True)}
    with pytest.raises(ValueError):
        pipeline.substitute_nodes(unknown_node)

    with pytest.raises(ValueError):
        pipeline.substitute_nodes({"pt1": "this_is_not_a_model"})
def test_cross_val_predict():
    """Run ``cross_val_predict`` in "transform" mode on a pass-through and compare output/input types.

    The original author's note on the final assertion says it fails because
    ``cross_val_predict`` changes the type of its output.
    """
    features, y = make_classification(n_samples=100)
    column_names = ["col_%d" % i for i in range(features.shape[1])]
    X = pd.DataFrame(features, columns=column_names)

    # A shuffled row index is drawn with a fixed seed; it is not used below,
    # but the draw is kept so the global NumPy random state is unchanged.
    row_order = np.arange(X.shape[0])
    np.random.seed(123)
    np.random.shuffle(row_order)

    passthrough = DebugPassThrough()
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

    Xhat = cross_val_predict(passthrough, X, y, cv=folds, method="transform")

    # Fail: cross_val_predict changes the type.
    assert type(Xhat) == type(X)
def test_fit_and_predict_transfrom():
    """Call scikit-learn's private ``_fit_and_predict`` with ``method="transform"`` on each fold.

    For every train/test split, the output of a fresh ``DebugPassThrough``
    must have one row per test sample, as many columns as ``X``, and the same
    type as ``X``.
    """
    features, y = make_classification(n_samples=100)
    X = pd.DataFrame(features, columns=["col_%d" % i for i in range(features.shape[1])])

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)
    fit_and_predict = sklearn.model_selection._validation._fit_and_predict

    for train_idx, test_idx in folds.split(X, y):
        passthrough = DebugPassThrough()
        predictions, _ = fit_and_predict(
            passthrough, X, y, train_idx, test_idx, verbose=1, fit_params=None, method="transform"
        )

        assert predictions.shape[0] == test_idx.shape[0]
        assert predictions.shape[1] == X.shape[1]
        assert type(predictions) == type(X)
def test_score_from_params(x_data_type, shuffle, graph_pipeline):
    """Exercise ``score_from_params_clustering`` in its three modes with a 3-cluster KMeans.

    Parameters (supplied by the test parametrization):
        x_data_type: container type ``X`` is converted to via ``convert_generic``.
        shuffle: if truthy, rows of ``X`` are permuted.
        graph_pipeline: if truthy, KMeans is wrapped in a ``GraphPipeline``
            behind a ``DebugPassThrough``; otherwise it is used directly.

    After each call, the estimator that was passed in must still raise
    ``NotFittedError`` on ``predict``.
    """
    np.random.seed(123)
    X = np.random.randn(100, 10)
    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        order = np.arange(X.shape[0])
        np.random.shuffle(order)
        X = X.loc[order, :] if isinstance(X, pd.DataFrame) else X[order, :]

    scoring = ["silhouette", "davies_bouldin", "calinski_harabasz"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": KMeans(n_clusters=3, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = KMeans(n_clusters=3, random_state=123)

    def _assert_score_table(table):
        # A single-row DataFrame with one "test_<metric>" column per metric.
        assert isinstance(table, pd.DataFrame)
        assert table.shape[0] == 1
        for metric in scoring:
            assert ("test_" + metric) in set(table.columns)

    def _assert_estimator_unfitted():
        with pytest.raises(NotFittedError):
            estimator.predict(X)

    # --- Only score ---
    res = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0)
    _assert_score_table(res)
    _assert_estimator_unfitted()

    # --- Score + prediction ---
    res, label = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0, return_predict=True)
    _assert_score_table(res)
    assert isinstance(label, np.ndarray)
    assert len(np.unique(label)) == 3
    _assert_estimator_unfitted()

    # --- Predict only ---
    res, label = score_from_params_clustering(
        estimator, X, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )
    assert len(np.unique(label)) == 3
    assert res is None
    _assert_estimator_unfitted()
def test_approx_cross_validation_early_stop(
    add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups
):
    """Check the ``stopping_round`` / ``stopping_threshold`` options of ``cross_validation``.

    With a threshold of 1.01 (which accuracy cannot reach) the score table must
    have 2 rows and no prediction is returned. With a threshold of 0.0 the
    table must have all 10 rows and predictions must be returned.

    Parameters (supplied by the test parametrization):
        add_third_class: if truthy, the first two targets are set to class 2
            and only "accuracy" is scored.
        x_data_type: container type ``X`` is converted to via ``convert_generic``.
        y_string_class: if truthy, targets are turned into "CL_<n>" strings.
        shuffle: if truthy, rows of ``X`` and ``y`` are permuted with a fixed seed.
        graph_pipeline: if truthy, the classifier is wrapped in a ``GraphPipeline``.
        with_groups: if truthy, four groups of 25 rows are passed along.
    """
    X, y = make_classification(n_samples=100, random_state=123)

    groups = np.repeat([0, 1, 2, 3], 25) if with_groups else None

    if add_third_class:
        y[0:2] = 2

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        order = np.arange(X.shape[0])
        np.random.shuffle(order)
        y = y[order]
        X = X.loc[order, :] if isinstance(X, pd.DataFrame) else X[order, :]

    if y_string_class:
        y = np.array(["CL_%d" % label for label in y])

    scoring = ["accuracy"] if add_third_class else ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": LogisticRegression(C=1, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = LogisticRegression(C=1, random_state=123)

    def _run(threshold):
        # Same call both times; only the stopping threshold differs.
        return cross_validation(
            estimator,
            X,
            y,
            groups,
            cv=10,
            scoring=scoring,
            verbose=0,
            return_predict=True,
            method="predict",
            stopping_round=1,
            stopping_threshold=threshold,
        )

    def _assert_score_table(table, n_rows):
        assert isinstance(table, pd.DataFrame)
        assert table.shape[0] == n_rows
        for metric in scoring:
            assert ("test_" + metric) in set(table.columns)
            assert ("train_" + metric) in set(table.columns)

    # Threshold chosen so that accuracy is sure to be below it.
    cv_res, yhat = _run(1.01)
    _assert_score_table(cv_res, 2)
    assert yhat is None

    cv_res, yhat = _run(0.0)
    _assert_score_table(cv_res, 10)
    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0
def test_cross_validation(add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups):
    """Exercise ``cross_validation`` with a logistic regression in four modes.

    Modes: score only, score + default prediction (checked as a probability
    table), score + ``method="predict"``, and predict only (``no_scoring=True``).
    After each call, the estimator that was passed in must still raise
    ``NotFittedError`` on ``predict``.

    Parameters (supplied by the test parametrization):
        add_third_class: if truthy, the first two targets are set to class 2
            and only "accuracy" is scored.
        x_data_type: container type ``X`` is converted to via ``convert_generic``.
        y_string_class: if truthy, targets are turned into "CL_<n>" strings.
        shuffle: if truthy, rows of ``X`` and ``y`` are permuted with a fixed seed.
        graph_pipeline: if truthy, the classifier is wrapped in a ``GraphPipeline``.
        with_groups: if truthy, four groups of 25 rows are passed along.
    """
    X, y = make_classification(n_samples=100, random_state=123)

    groups = np.repeat([0, 1, 2, 3], 25) if with_groups else None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if add_third_class:
        y[0:2] = 2

    if shuffle:
        np.random.seed(123)
        order = np.arange(X.shape[0])
        np.random.shuffle(order)
        y = y[order]
        X = X.loc[order, :] if isinstance(X, pd.DataFrame) else X[order, :]

    if y_string_class:
        y = np.array(["CL_%d" % label for label in y])

    scoring = ["accuracy"] if add_third_class else ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline({"pt": DebugPassThrough(), "lg": LogisticRegression()}, edges=[("pt", "lg")])
    else:
        estimator = LogisticRegression()

    def _assert_score_table(table):
        # Ten rows, with train and test columns for each metric.
        assert isinstance(table, pd.DataFrame)
        assert table.shape[0] == 10
        for metric in scoring:
            assert ("test_" + metric) in set(table.columns)
            assert ("train_" + metric) in set(table.columns)

    def _assert_estimator_unfitted():
        with pytest.raises(NotFittedError):
            estimator.predict(X)

    # --- Only score ---
    cv_res = cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)
    _assert_score_table(cv_res)
    _assert_estimator_unfitted()

    # --- Score + proba ---
    cv_res, yhat_proba = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True
    )
    _assert_score_table(cv_res)

    assert isinstance(yhat_proba, pd.DataFrame)
    if isinstance(X, pd.DataFrame):
        assert (yhat_proba.index == X.index).all()

    n_classes = 2 + 1 * add_third_class
    assert yhat_proba.shape == (y.shape[0], n_classes)
    assert yhat_proba.min().min() >= 0
    assert yhat_proba.max().max() <= 1
    assert list(yhat_proba.columns) == list(np.sort(np.unique(y)))
    _assert_estimator_unfitted()

    # --- Score + predict ---
    cv_res, yhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, method="predict"
    )
    _assert_score_table(cv_res)

    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0
    assert yhat.shape[0] == y.shape[0]
    _assert_estimator_unfitted()

    # --- Predict only ---
    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        no_scoring=True,
    )
    assert yhat.shape[0] == y.shape[0]
    assert cv_res is None
    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0
    _assert_estimator_unfitted()
def test_graphpipeline_blockselector():
    """Check ``BlockSelector`` nodes inside a ``GraphPipeline`` for several input containers.

    First a full model (text block -> char count vectorizer, plus numeric
    block -> decision tree) is fitted on a dict input and must predict one
    label per row. Then, for a dict, a list and a ``BlockManager`` input, the
    two selected blocks are merged by a pass-through node and the result must
    contain the text columns unchanged followed by the numeric values.
    """
    Xnum, y = make_classification(n_samples=100)

    text1 = get_random_strings(100)
    text2 = get_random_strings(100)
    dfX_text = pd.DataFrame({"text1": text1, "text2": text2})

    # --- full model on a dict input ---
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )
    graphpipeline.fit(X, y)
    yhat = graphpipeline.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    def _assert_blocks_are_merged(blocks, text_key, num_key):
        # Select both blocks by key, merge them in a pass-through node and
        # compare the merged frame against the two original blocks.
        merging_pipeline = GraphPipeline(
            models={
                "BS_text": BlockSelector(text_key),
                "BS_num": BlockSelector(num_key),
                "PT": DebugPassThrough(),
            },
            edges=[("BS_text", "PT"), ("BS_num", "PT")],
        )
        Xhat = merging_pipeline.fit_transform(blocks)

        assert Xhat.shape[0] == dfX_text.shape[0]
        assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

        assert "text1" in Xhat.columns
        assert "text2" in Xhat.columns
        assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

        other_cols = diff(list(Xhat.columns), ["text1", "text2"])
        assert (Xhat.loc[:, other_cols].values == Xnum).all()

    # --- X is a dict ---
    X = {"text": dfX_text, "num": Xnum}
    _assert_blocks_are_merged(X, "text", "num")

    # --- X is a list (blocks selected by position) ---
    X = [dfX_text, Xnum]
    _assert_blocks_are_merged(X, 0, 1)

    # --- X is a BlockManager ---
    X = BlockManager({"text": dfX_text, "num": Xnum})
    _assert_blocks_are_merged(X, "text", "num")
def test_graphpipeline_nodes_concat_order():
    """Check that columns are concatenated in the order in which the edges are declared.

    Each node is a ``DebugPassThrough`` with its own column prefix, so the
    output column names record which branch each column went through. Both
    the transformed columns and ``get_feature_names()`` are compared against
    the expected order.
    """
    cols = list(dfX.columns)

    def _three_nodes():
        return {
            "pt1": DebugPassThrough(column_prefix="PT1_", debug=True),
            "pt2": DebugPassThrough(column_prefix="PT2_", debug=True),
            "pt3": DebugPassThrough(column_prefix="PT3_", debug=True),
        }

    def _four_nodes():
        return {
            "pt1": DebugPassThrough(column_prefix="PT1_", debug=True),
            "pt2": DebugPassThrough(column_prefix="PT2_", debug=True),
            "pt3": DebugPassThrough(column_prefix="PT3_", debug=True),
            "pt4": DebugPassThrough(column_prefix="PT4_", debug=True),
        }

    def _assert_column_order(models, edges, expected_columns):
        pipeline = GraphPipeline(models, edges=edges)
        Xres = pipeline.fit_transform(dfX)
        assert list(Xres.columns) == expected_columns
        assert list(Xres.columns) == pipeline.get_feature_names()

    # 1: three nodes -- PT1 on the left, PT2 on the right.
    _assert_column_order(
        _three_nodes(),
        [("pt1", "pt3"), ("pt2", "pt3")],
        ["PT3__PT1__" + c for c in cols] + ["PT3__PT2__" + c for c in cols],
    )

    # 2: reverse order -- PT2 on the left, PT1 on the right.
    _assert_column_order(
        _three_nodes(),
        [("pt2", "pt3"), ("pt1", "pt3")],
        ["PT3__PT2__" + c for c in cols] + ["PT3__PT1__" + c for c in cols],
    )

    # 3: four nodes, two equivalent edge declarations -- PT1 on the left, PT2 on the right.
    for edges in ([("pt1", "pt3", "pt4"), ("pt2", "pt3", "pt4")], [("pt1", "pt3", "pt4"), ("pt2", "pt3")]):
        _assert_column_order(
            _four_nodes(),
            edges,
            ["PT4__PT3__PT1__" + c for c in cols] + ["PT4__PT3__PT2__" + c for c in cols],
        )

    # 4: four nodes, reverse order -- PT2 on the left, PT1 on the right.
    for edges in ([("pt2", "pt3", "pt4"), ("pt1", "pt3", "pt4")], [("pt2", "pt3", "pt4"), ("pt1", "pt3")]):
        _assert_column_order(
            _four_nodes(),
            edges,
            ["PT4__PT3__PT2__" + c for c in cols] + ["PT4__PT3__PT1__" + c for c in cols],
        )
def test_try_to_find_features_names():
    """Check ``try_to_find_features_names`` on sklearn objects and on minimal dummy models.

    Covered cases: a fitted ``CountVectorizer``, a ``Pipeline`` ending with
    one, a ``FeatureUnion`` of vectorizers, a ``FeatureUnion`` of pipelines,
    and three dummy classes whose ``get_feature_names`` accepts
    ``input_features``, takes no argument, or does not exist.
    """
    list_of_words = ["aa bb", "bb bb cc", "dd aa cc", "ee"]

    word_features = ["aa", "bb", "cc", "dd", "ee"]
    union_features = [
        "bagword__aa",
        "bagword__bb",
        "bagword__cc",
        "bagword__dd",
        "bagword__ee",
        "bagchar__ ",
        "bagchar__a",
        "bagchar__b",
        "bagchar__c",
        "bagchar__d",
        "bagchar__e",
    ]

    # Plain vectorizer.
    vec = CountVectorizer()
    vec.fit_transform(list_of_words)
    assert try_to_find_features_names(vec) == word_features

    # Pipeline whose last step is a vectorizer.
    pipe = Pipeline([("nothing", DebugPassThrough()), ("vec", CountVectorizer())])
    pipe.fit_transform(list_of_words)
    assert try_to_find_features_names(pipe) == word_features

    # Union of two vectorizers.
    union = FeatureUnion(
        transformer_list=[("bagword", CountVectorizer()), ("bagchar", CountVectorizer(analyzer="char"))]
    )
    union.fit_transform(list_of_words)
    assert try_to_find_features_names(union) == union_features

    # Union of two pipelines.
    pipe1 = Pipeline([("nothing", DebugPassThrough()), ("vec", CountVectorizer())])
    pipe2 = Pipeline([("nothing", DebugPassThrough()), ("vec", CountVectorizer(analyzer="char"))])
    union = FeatureUnion(transformer_list=[("bagword", pipe1), ("bagchar", pipe2)])
    union.fit_transform(list_of_words)
    assert try_to_find_features_names(union) == union_features

    class DummyModelAcceptInputFeature(object):
        def get_feature_names(self, input_features=None):
            return [0, 1, 2, 3] if input_features is None else input_features

    class DummyModelDontInputFeature(object):
        def get_feature_names(self):
            return [0, 1, 2, 3]

    class DummyModelDoesntHaveGetFeatures(object):
        pass

    letters = ["a", "b", "c", "d"]

    # get_feature_names accepts input_features: they are passed through.
    accepting = DummyModelAcceptInputFeature()
    assert try_to_find_features_names(accepting) == [0, 1, 2, 3]
    assert try_to_find_features_names(accepting, input_features=list(letters)) == letters

    # get_feature_names takes no argument: input_features is ignored.
    non_accepting = DummyModelDontInputFeature()
    assert try_to_find_features_names(non_accepting) == [0, 1, 2, 3]
    assert try_to_find_features_names(non_accepting, input_features=list(letters)) == [0, 1, 2, 3]

    # No get_feature_names at all: nothing can be found.
    featureless = DummyModelDoesntHaveGetFeatures()
    assert try_to_find_features_names(featureless) is None
    assert try_to_find_features_names(featureless, input_features=list(letters)) is None