예제 #1
0
def test_RandomModelGenerator_default():
    """Check every item yielded by ``iterator_default_models()``.

    Each item must be a ``(Graph, all_models_params, block_to_use)`` tuple:

    * ``Graph`` exposes ``edges`` and ``nodes`` and every node has an entry
      in ``all_models_params``;
    * every element of ``block_to_use`` belongs to ``TypeOfVariables.alls``;
    * ``convert_graph_to_code(..., also_returns_mapping=True)`` returns a dict
      holding ``"name_mapping"`` and ``"json_code"``;
    * ``sklearn_model_from_param`` turns that json code into an object with a
      ``fit`` attribute.
    """
    # Only the configuration is needed here, the data itself is not used.
    _, _, auto_ml_config = get_automl_config()

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    # Verify each item produced by the iterator
    for model in random_model_generator.iterator_default_models():

        assert isinstance(model, tuple)
        assert len(model) == 3
        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # ``Graph.nodes`` (not the ``Graph.node`` alias, removed in
        # networkx 2.4) is the accessor already checked just above.
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # Separate name so the drawn tuple ``model`` is not shadowed.
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")
예제 #2
0
def test_RandomModelGenerator_iterator(type_of_iterator, num_only):
    """Check the items produced by the different model iterators.

    Parameters
    ----------
    type_of_iterator : str
        Which iterator to exercise: ``"default"`` uses
        ``iterator_default_models()``, ``"block_search"`` uses
        ``iterate_block_search(random_order=False)`` and
        ``"block_search_random"`` uses
        ``iterate_block_search(random_order=True)``.
    num_only : object
        Forwarded as-is to ``get_automl_config``.
        NOTE(review): presumably a bool supplied by a pytest parametrization
        that is not visible here -- confirm.

    Raises
    ------
    ValueError
        If ``type_of_iterator`` is not one of the three supported values.
    """
    # Only the configuration is needed here, the data itself is not used.
    _, _, auto_ml_config = get_automl_config(num_only)

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    if type_of_iterator == "default":
        iterator = random_model_generator.iterator_default_models()

    elif type_of_iterator == "block_search":
        iterator = random_model_generator.iterate_block_search(
            random_order=False)

    elif type_of_iterator == "block_search_random":
        iterator = random_model_generator.iterate_block_search(
            random_order=True)

    else:
        # Fail with an explicit message instead of an unbound-variable error
        # a few lines below.
        raise ValueError("Unknown type_of_iterator : %r" % (type_of_iterator,))

    assert hasattr(iterator, "__iter__")

    # Verify each item produced by the iterator
    for model in iterator:

        assert isinstance(model, tuple)
        assert len(model) == 3
        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # ``Graph.nodes`` (not the ``Graph.node`` alias, removed in
        # networkx 2.4) is the accessor already checked just above.
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # Separate name so the drawn tuple ``model`` is not shadowed.
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")
예제 #3
0
def test_RandomModelGenerator_random(num_only, specific_hyper,
                                     only_random_forest):
    """Check ``draw_random_graph()`` and the reproducibility of its draws.

    Ten graphs are drawn with ``random_state=123`` and each of them is
    validated (structure, parameters, conversion to json code, creation of a
    model exposing ``fit``). The same ten draws must then be obtained again:

    * from a fresh generator created with the same seed,
    * after resetting ``random_state`` on the existing generator,
    * from a generator receiving the object returned by
      ``check_random_state(123)``.

    Parameters
    ----------
    num_only : object
        Forwarded as-is to ``get_automl_config``; when falsy the drawn
        blocks are additionally required not to be all identical.
    specific_hyper : object
        When truthy, ``n_estimators`` of the RandomForestClassifier is
        restricted to ``[10, 20]`` and drawn values are checked against it.
    only_random_forest : object
        When truthy, models are filtered down to RandomForestClassifier and
        every draw must contain it; otherwise at least one draw must not.
    """
    # Only the configuration is needed here, the data itself is not used.
    _, _, auto_ml_config = get_automl_config(num_only)

    if specific_hyper:
        auto_ml_config.specific_hyper = {
            ('Model', 'RandomForestClassifier'): {
                "n_estimators": [10, 20]
            }
        }

    if only_random_forest:
        auto_ml_config.filter_models(Model='RandomForestClassifier')

    # Defined before the loop because it is also used once the loop is over.
    rf_key = ('Model', ('Model', 'RandomForestClassifier'))

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    all_gen = []
    for _ in range(10):
        model = random_model_generator.draw_random_graph()
        all_gen.append(model)

        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # ``Graph.nodes`` (not the ``Graph.node`` alias, removed in
        # networkx 2.4) is the accessor already checked just above.
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # Separate name so the drawn tuple ``model`` is not shadowed.
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

        if only_random_forest:
            assert rf_key in all_models_params

        if specific_hyper and rf_key in all_models_params:
            assert all_models_params[rf_key]["n_estimators"] in (10, 20)

    if not only_random_forest:
        # Check that RandomForest wasn't drawn every time
        assert any(rf_key not in drawn[1] for drawn in all_gen)

    ### re-draw the same thing with a fresh generator and the SAME seed ###
    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)
    all_gen2 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs1, all_params1, all_blocks1 = zip(*all_gen)
    all_graphs2, all_params2, all_blocks2 = zip(*all_gen2)

    assert not _all_same(all_params1)
    assert not _all_same(all_graphs1)
    if not num_only:
        # Skipped when num_only is set: there is only one block in that case
        assert not _all_same(all_blocks1)

    # Equality is tested on nodes and edges because a direct == between
    # networkx graphs doesn't work.
    all_graphs1_node_edges = [(g.nodes, g.edges) for g in all_graphs1]
    all_graphs2_node_edges = [(g.nodes, g.edges) for g in all_graphs2]

    # separate asserts to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs2_node_edges
    assert all_params1 == all_params2
    assert all_blocks1 == all_blocks2

    ### re-draw by resetting the seed of the existing generator ###
    random_model_generator.random_state = 123
    all_gen3 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs3, all_params3, all_blocks3 = zip(*all_gen3)
    all_graphs3_node_edges = [(g.nodes, g.edges) for g in all_graphs3]

    # separate asserts to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs3_node_edges
    assert all_params1 == all_params3
    assert all_blocks1 == all_blocks3

    ### re-draw by passing a random state object instead of a seed ###
    random_state = check_random_state(123)
    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=random_state)
    all_gen4 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs4, all_params4, all_blocks4 = zip(*all_gen4)
    all_graphs4_node_edges = [(g.nodes, g.edges) for g in all_graphs4]

    # separate asserts to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs4_node_edges
    assert all_params1 == all_params4
    assert all_blocks1 == all_blocks4
예제 #4
0
def test_RandomModelGenerator_random():
    """Check ``draw_random_graph()`` and the reproducibility of its draws.

    Ten graphs are drawn with ``random_state=123`` and each of them is
    validated (structure, parameters, conversion to json code, creation of a
    model exposing ``fit``). The same ten draws must then be obtained again:

    * from a fresh generator created with the same seed,
    * after resetting ``random_state`` on the existing generator,
    * from a generator receiving the object returned by
      ``check_random_state(123)``.
    """
    # Only the configuration is needed here, the data itself is not used.
    _, _, auto_ml_config = get_automl_config()

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    all_gen = []
    for _ in range(10):
        model = random_model_generator.draw_random_graph()
        all_gen.append(model)

        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # ``Graph.nodes`` (not the ``Graph.node`` alias, removed in
        # networkx 2.4) is the accessor already checked just above.
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # Separate name so the drawn tuple ``model`` is not shadowed.
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

    ### re-draw the same thing with a fresh generator and the SAME seed ###
    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)
    all_gen2 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs1, all_params1, all_blocks1 = zip(*all_gen)
    all_graphs2, all_params2, all_blocks2 = zip(*all_gen2)

    # The ten draws must not all be identical to each other
    assert not _all_same(all_params1)
    assert not _all_same(all_graphs1)
    assert not _all_same(all_blocks1)

    # Equality is tested on nodes and edges because a direct == between
    # networkx graphs doesn't work.
    all_graphs1_node_edges = [(g.nodes, g.edges) for g in all_graphs1]
    all_graphs2_node_edges = [(g.nodes, g.edges) for g in all_graphs2]

    # separate asserts to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs2_node_edges
    assert all_params1 == all_params2
    assert all_blocks1 == all_blocks2

    ### re-draw by resetting the seed of the existing generator ###
    random_model_generator.random_state = 123
    all_gen3 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs3, all_params3, all_blocks3 = zip(*all_gen3)
    all_graphs3_node_edges = [(g.nodes, g.edges) for g in all_graphs3]

    # separate asserts to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs3_node_edges
    assert all_params1 == all_params3
    assert all_blocks1 == all_blocks3

    ### re-draw by passing a random state object instead of a seed ###
    random_state = check_random_state(123)
    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=random_state)
    all_gen4 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs4, all_params4, all_blocks4 = zip(*all_gen4)
    all_graphs4_node_edges = [(g.nodes, g.edges) for g in all_graphs4]

    # separate asserts to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs4_node_edges
    assert all_params1 == all_params4
    assert all_blocks1 == all_blocks4
예제 #5
0
def test_create_graphical_representation():
    """Check the graph built by ``create_graphical_representation``.

    A nine-step pipeline description (text, categorical and numerical
    branches merging into scaling, feature selection and a model) is turned
    into a graph. The test verifies the returned graph type, that no step is
    left over, the exact sets of nodes and edges, and finally that
    ``convert_graph_to_code`` produces a ``("GraphPipeline", {...})`` tuple
    holding ``"edges"`` and ``"models"``.
    """
    # Node identifiers, named once so they can be reused below.
    text_prep = ("TextPreprocessing", ("TextPreprocessing", "CountVectorizerWrapper"))
    dim_reduction = ("DimensionReduction", ("DimensionReduction", "TruncatedSVDWrapper"))
    cat_encoder = ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder"))
    cat_imputer = ("CategoryImputer", ("CategoryImputer", "CatImputer"))
    num_imputer = ("MissingValueImputer", ("MissingValueImputer", "NumImputer"))
    feature_extraction = ("FeatureExtraction", ("FeatureExtraction", "PolynomialExtractor"))
    scaling = ("Scaling", ("Scaling", "StandardScaler"))
    feature_selection = ("FeatureSelection", ("FeatureSelection", "FeaturesSelectorClassifier"))
    final_model = ("Model", ("Model", "LightGBMClassifier"))

    cat_and_num = (TypeOfVariables.CAT, TypeOfVariables.NUM)
    every_type = (TypeOfVariables.CAT, TypeOfVariables.NUM, TypeOfVariables.TEXT)

    # Step -> type(s) of variables it applies to, in pipeline order.
    steps = OrderedDict()
    steps[text_prep] = TypeOfVariables.TEXT
    steps[dim_reduction] = TypeOfVariables.TEXT
    steps[cat_encoder] = TypeOfVariables.CAT
    steps[cat_imputer] = TypeOfVariables.CAT
    steps[num_imputer] = TypeOfVariables.NUM
    steps[feature_extraction] = TypeOfVariables.NUM
    steps[scaling] = cat_and_num
    steps[feature_selection] = every_type
    steps[final_model] = every_type

    G, new_steps = create_graphical_representation(steps)

    assert isinstance(G, nx.DiGraph)
    assert len(new_steps) == 0

    # graphviz_modelgraph(G)

    expected_edges = [
        # text branch
        (text_prep, dim_reduction),
        (dim_reduction, feature_selection),
        # categorical branch
        (cat_encoder, cat_imputer),
        (cat_imputer, scaling),
        # numerical branch
        (num_imputer, feature_extraction),
        (feature_extraction, scaling),
        # common tail
        (scaling, feature_selection),
        (feature_selection, final_model),
    ]

    expected_nodes = [
        text_prep,
        dim_reduction,
        cat_encoder,
        cat_imputer,
        num_imputer,
        feature_extraction,
        scaling,
        feature_selection,
        final_model,
    ]

    assert set(expected_edges) == set(G.edges)
    assert set(expected_nodes) == set(G.nodes)

    # One dummy parameter per step, keyed by a name derived from the node.
    params = {step: {"__%s_%s__" % step[1]: "param"} for step in steps}

    res1 = convert_graph_to_code(G, params)
    assert isinstance(res1, tuple)
    pipeline_name, pipeline_spec = res1[0], res1[1]
    assert pipeline_name == "GraphPipeline"
    assert isinstance(pipeline_spec, dict)
    assert "edges" in pipeline_spec
    assert "models" in pipeline_spec
예제 #6
0
def test_convert_graph_to_code():

    ###################################
    ### ** Only one Simple Model ** ###
    ###################################

    Graph = nx.DiGraph()
    Graph.add_node(("Model", ("Model", "LogisticRegression")))

    assert _find_first_composition_node(Graph) is None

    ## a) no params
    all_models_params = {("Model", ("Model", "LogisticRegression")): {}}
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    assert model_json_code == ("LogisticRegression", {})

    ## b) params
    all_models_params = {("Model", ("Model", "LogisticRegression")): {"C": 10}}
    model_json_code = convert_graph_to_code(Graph, all_models_params)

    assert model_json_code == ("LogisticRegression", {"C": 10})

    #####################
    ### ** 2 steps ** ###
    #####################
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    assert _find_first_composition_node(Graph) is None

    ## a) no params
    all_models_params = {
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")): {},
        ("Model", ("Model", "RandomForestClassifier")): {},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    assert model_json_code == (
        "GraphPipeline",
        {
            "edges": [("KMeansTransformer", "RandomForestClassifier")],
            "models": {
                "KMeansTransformer": ("KMeansTransformer", {}),
                "RandomForestClassifier": ("RandomForestClassifier", {}),
            },
        },
    )

    ## b) no params
    all_models_params = {
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")): {"n_clusters": 5},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    assert model_json_code == (
        "GraphPipeline",
        {
            "edges": [("KMeansTransformer", "RandomForestClassifier")],
            "models": {
                "KMeansTransformer": ("KMeansTransformer", {"n_clusters": 5}),
                "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 100}),
            },
        },
    )

    ################################
    ### ** 1 composition step ** ###
    ################################

    ## a) no params
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    assert _find_first_composition_node(Graph) == (
        "TargetTransformer",
        ("TargetTransformer", "BoxCoxTargetTransformer"),
    )

    all_models_params = {
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")): {},
        ("Model", ("Model", "RandomForestClassifier")): {},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)
    expected_json_code = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {}), {})

    assert model_json_code == expected_json_code

    ## b) params

    Graph = nx.DiGraph()
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    assert _find_first_composition_node(Graph) == (
        "TargetTransformer",
        ("TargetTransformer", "BoxCoxTargetTransformer"),
    )

    all_models_params = {
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")): {"ll": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 10},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)
    expected_json_code = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {"n_estimators": 10}), {"ll": 10})

    assert model_json_code == expected_json_code

    ##########################################
    ## ** 1 composition above a pipeline ** ##
    ##########################################

    ## a) no param
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
    )

    Graph.add_edge(
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    assert _find_first_composition_node(Graph) == (
        "TargetTransformer",
        ("TargetTransformer", "BoxCoxTargetTransformer"),
    )

    all_models_params = {
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")): {},
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")): {},
        ("Model", ("Model", "RandomForestClassifier")): {},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json_code = (
        "BoxCoxTargetTransformer",
        (
            "GraphPipeline",
            {
                "edges": [("KMeansTransformer", "RandomForestClassifier")],
                "models": {
                    "KMeansTransformer": ("KMeansTransformer", {}),
                    "RandomForestClassifier": ("RandomForestClassifier", {}),
                },
            },
        ),
        {},
    )
    assert model_json_code == expected_json_code

    ## b) params
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
    )

    Graph.add_edge(
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    all_models_params = {
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")): {"ll": 10},
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")): {"n_clusters": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 10},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json_code = (
        "BoxCoxTargetTransformer",
        (
            "GraphPipeline",
            {
                "edges": [("KMeansTransformer", "RandomForestClassifier")],
                "models": {
                    "KMeansTransformer": ("KMeansTransformer", {"n_clusters": 10}),
                    "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 10}),
                },
            },
        ),
        {"ll": 10},
    )

    assert model_json_code == expected_json_code

    #########################################################
    ## ** 1 composition node in the middle of the Graph ** ##
    #########################################################
    Graph = nx.DiGraph()

    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")),
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
    )
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
    )

    Graph.add_edge(
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")),
        ("Model", ("Model", "RandomForestClassifier")),
    )

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")): {"ll": 10},
        ("DimensionReduction", ("DimensionReduction", "KMeansTransformer")): {"n_clusters": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 10},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json_code = (
        "GraphPipeline",
        {
            "edges": [("NumericalEncoder", "BoxCoxTargetTransformer")],
            "models": {
                "BoxCoxTargetTransformer": (
                    "BoxCoxTargetTransformer",
                    (
                        "GraphPipeline",
                        {
                            "edges": [("KMeansTransformer", "RandomForestClassifier")],
                            "models": {
                                "KMeansTransformer": ("KMeansTransformer", {"n_clusters": 10}),
                                "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 10}),
                            },
                        },
                    ),
                    {"ll": 10},
                ),
                "NumericalEncoder": ("NumericalEncoder", {}),
            },
        },
    )

    assert model_json_code == expected_json_code

    ###################################################
    ## ** 1 composition with several nodes bellow ** ##
    ###################################################
    Graph = nx.DiGraph()
    # TODO : try to do a stacking with several things bellow
    Graph.add_edge(
        ("Stacking", ("Stacking", "StackingClassifierRegressor")), ("Model", ("Model", "RandomForestClassifier"))
    )
    Graph.add_edge(
        ("Stacking", ("Stacking", "StackingClassifierRegressor")), ("Model", ("Model", "LogisticRegression"))
    )

    assert _find_first_composition_node(Graph) == ("Stacking", ("Stacking", "StackingClassifierRegressor"))

    # Rmk : Blending specification is missing, BUT it is enough to test the function
    all_models_params = {
        ("Stacking", ("Stacking", "StackingClassifierRegressor")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
    }

    with pytest.raises(ValueError):
        model_json_code = convert_graph_to_code(Graph, all_models_params)
        # Unsuported for now : more than one terminal node

    model_json_code = convert_graph_to_code(Graph, all_models_params, _check_structure=False)

    expected_json_code1 = (
        "StackingClassifierRegressor",
        [("RandomForestClassifier", {"n_estimators": 100}), ("LogisticRegression", {"C": 10})],
        {"cv": 10},
    )
    #    expected_json_code2 = (
    #        "StackingClassifierRegressor",
    #        [("LogisticRegression", {"C": 10}), ("RandomForestClassifier", {"n_estimators": 100})],
    #        {"cv": 10},
    #    )
    assert expected_json_code1 == model_json_code  # or (expected_json_code2 == model_json_code)

    #######################################
    ## ** 2 nested compositions steps ** ##
    #######################################

    Graph = nx.DiGraph()
    Graph.add_edge(
        ("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer")),
        ("UnderOverSampler", ("UnderOverSampler", "TargetUnderSampler")),
    )

    Graph.add_edge(
        ("UnderOverSampler", ("UnderOverSampler", "TargetUnderSampler")), ("Model", ("Model", "RandomForestClassifier"))
    )

    all_models_params = {}
    all_models_params[("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer"))] = {"ll": 10}
    all_models_params[("Model", ("Model", "RandomForestClassifier"))] = {"n_estimators": 100}
    all_models_params[("UnderOverSampler", ("UnderOverSampler", "TargetUnderSampler"))] = {"target_ratio": "balanced"}

    assert _find_first_composition_node(Graph) == (
        "TargetTransformer",
        ("TargetTransformer", "BoxCoxTargetTransformer"),
    )

    assert _find_first_composition_node(
        Graph, composition_already_done={("TargetTransformer", ("TargetTransformer", "BoxCoxTargetTransformer"))}
    ) == ("UnderOverSampler", ("UnderOverSampler", "TargetUnderSampler"))

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json_code = (
        "BoxCoxTargetTransformer",
        ("TargetUnderSampler", ("RandomForestClassifier", {"n_estimators": 100}), {"target_ratio": "balanced"}),
        {"ll": 10},
    )

    assert model_json_code == expected_json_code

    ###################################################
    ## ** 1 composition with several nodes bellow ** ##
    ###################################################
    Graph = nx.DiGraph()
    # TODO : essayer de faire un stacking avec plusieurs trucs en dessous

    ## 1) with one node above

    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")),
        ("Stacking", ("Stacking", "StackingClassifierRegressor")),
    )
    Graph.add_edge(
        ("Stacking", ("Stacking", "StackingClassifierRegressor")), ("Model", ("Model", "RandomForestClassifier"))
    )
    Graph.add_edge(
        ("Stacking", ("Stacking", "StackingClassifierRegressor")), ("Model", ("Model", "LogisticRegression"))
    )

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("Stacking", ("Stacking", "StackingClassifierRegressor")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params, _check_structure=False)
    expected_json_code = (
        "GraphPipeline",
        {
            "edges": [("NumericalEncoder", "StackingClassifierRegressor")],
            "models": {
                "NumericalEncoder": ("NumericalEncoder", {}),
                "StackingClassifierRegressor": (
                    "StackingClassifierRegressor",
                    [("RandomForestClassifier", {"n_estimators": 100}), ("LogisticRegression", {"C": 10})],
                    {"cv": 10},
                ),
            },
        },
    )

    # Rmk : the Stacker is missing the blender, that I can't enter into the graph..
    assert expected_json_code == model_json_code

    ### With a node above, and a blender bellow
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Stacking", ("Stacking", "OutSampler"))
    )
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "RandomForestClassifier")))
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "LogisticRegression")))

    Graph.add_edge(("Model", ("Model", "LogisticRegression")), ("Blender", ("Blender", "LogisticRegression")))

    Graph.add_edge(("Model", ("Model", "RandomForestClassifier")), ("Blender", ("Blender", "LogisticRegression")))

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("Stacking", ("Stacking", "OutSampler")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
        ("Blender", ("Blender", "LogisticRegression")): {"C": 100},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json = (
        "GraphPipeline",
        {
            "edges": [("NumericalEncoder", "OutSampler", "Blender_LogisticRegression")],
            "models": {
                "Blender_LogisticRegression": ("LogisticRegression", {"C": 100}),
                "NumericalEncoder": ("NumericalEncoder", {}),
                "OutSampler": (
                    "OutSampler",
                    [("RandomForestClassifier", {"n_estimators": 100}), ("LogisticRegression", {"C": 10})],
                    {"cv": 10},
                ),
            },
        },
    )

    assert expected_json == model_json_code

    ### With encoder feature going back into the Blender
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Stacking", ("Stacking", "OutSampler"))
    )
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "RandomForestClassifier")))
    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "LogisticRegression")))

    Graph.add_edge(("Model", ("Model", "LogisticRegression")), ("Blender", ("Blender", "LogisticRegression")))

    Graph.add_edge(("Model", ("Model", "RandomForestClassifier")), ("Blender", ("Blender", "LogisticRegression")))

    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Blender", ("Blender", "LogisticRegression"))
    )

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("Stacking", ("Stacking", "OutSampler")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
        ("Blender", ("Blender", "LogisticRegression")): {"C": 100},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json = (
        "GraphPipeline",
        {
            "edges": [
                ("NumericalEncoder", "Blender_LogisticRegression"),
                ("NumericalEncoder", "OutSampler", "Blender_LogisticRegression"),
            ],
            "models": {
                "Blender_LogisticRegression": ("LogisticRegression", {"C": 100}),
                "NumericalEncoder": ("NumericalEncoder", {}),
                "OutSampler": (
                    "OutSampler",
                    [("RandomForestClassifier", {"n_estimators": 100}), ("LogisticRegression", {"C": 10})],
                    {"cv": 10},
                ),
            },
        },
    )

    assert expected_json == model_json_code

    # Same thing but with 2 OutSampler (one per model)
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Stacking", ("Stacking1", "OutSampler"))
    )
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Stacking", ("Stacking2", "OutSampler"))
    )
    Graph.add_edge(("Stacking", ("Stacking1", "OutSampler")), ("Model", ("Model", "RandomForestClassifier")))
    Graph.add_edge(("Stacking", ("Stacking2", "OutSampler")), ("Model", ("Model", "LogisticRegression")))

    Graph.add_edge(("Model", ("Model", "LogisticRegression")), ("Blender", ("Blender", "LogisticRegression")))

    Graph.add_edge(("Model", ("Model", "RandomForestClassifier")), ("Blender", ("Blender", "LogisticRegression")))

    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Blender", ("Blender", "LogisticRegression"))
    )

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("Stacking", ("Stacking1", "OutSampler")): {"cv": 10},
        ("Stacking", ("Stacking2", "OutSampler")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
        ("Blender", ("Blender", "LogisticRegression")): {"C": 100},
    }

    model_json_code = convert_graph_to_code(Graph, all_models_params)

    expected_json = (
        "GraphPipeline",
        {
            "edges": [
                ("NumericalEncoder", "Blender_LogisticRegression"),
                ("NumericalEncoder", "Stacking1_OutSampler", "Blender_LogisticRegression"),
                ("NumericalEncoder", "Stacking2_OutSampler", "Blender_LogisticRegression"),
            ],
            "models": {
                "Blender_LogisticRegression": ("LogisticRegression", {"C": 100}),
                "NumericalEncoder": ("NumericalEncoder", {}),
                "Stacking1_OutSampler": ("OutSampler", ("RandomForestClassifier", {"n_estimators": 100}), {"cv": 10}),
                "Stacking2_OutSampler": ("OutSampler", ("LogisticRegression", {"C": 10}), {"cv": 10}),
            },
        },
    )

    assert expected_json == model_json_code

    ### Multi output ###
    Graph = nx.DiGraph()
    Graph.add_node(("Model", ("Model", "LogisticRegression")))
    Graph.add_node(("Model", ("Model", "RandomForestClassifier")))

    all_models_params = {
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
    }

    assert _find_first_composition_node(Graph) is None

    model_json_code = convert_graph_to_code(Graph, all_models_params, _check_structure=False)
    expected_json = (
        "GraphPipeline",
        {
            "edges": [("LogisticRegression",), ("RandomForestClassifier",)],
            "models": {
                "LogisticRegression": ("LogisticRegression", {"C": 10}),
                "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 100}),
            },
        },
    )

    assert expected_json == model_json_code

    ### Impossible graph ###
    Graph = nx.DiGraph()
    Graph.add_edge(
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")), ("Stacking", ("Stacking", "OutSampler"))
    )

    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "RandomForestClassifier")))

    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "LogisticRegression")))

    Graph.add_edge(("Stacking", ("Stacking", "OutSampler")), ("Model", ("Model", "ExtraTreesClassifier")))
    # This edge make it impossible : it comes from the composition node ...
    # but doesn't have the same child as the other

    Graph.add_edge(("Model", ("Model", "LogisticRegression")), ("Blender", ("Blender", "LogisticRegression")))

    Graph.add_edge(("Model", ("Model", "RandomForestClassifier")), ("Blender", ("Blender", "LogisticRegression")))

    #    graphviz_modelgraph(Graph)

    all_models_params = {
        ("CategoryEncoder", ("CategoryEncoder", "NumericalEncoder")): {},
        ("Stacking", ("Stacking", "OutSampler")): {"cv": 10},
        ("Model", ("Model", "RandomForestClassifier")): {"n_estimators": 100},
        ("Model", ("Model", "ExtraTreesClassifier")): {"n_estimators": 200},
        ("Model", ("Model", "LogisticRegression")): {"C": 10},
        ("Blender", ("Blender", "LogisticRegression")): {"C": 100},
    }

    with pytest.raises(ValueError):
        model_json_code = convert_graph_to_code(Graph, all_models_params, _check_structure=False)
예제 #7
0
def test_RandomModelGenerator_random(num_only, specific_hyper,
                                     only_random_forest):
    """Check random graph draws for validity and for seed reproducibility.

    Ten graphs are drawn from a ``RandomModelGenerator`` seeded with 123.
    Each draw must be a ``(Graph, all_models_params, block_to_use)`` tuple
    that converts to json code and then to an object exposing ``fit``.
    Whenever a RandomForestClassifier node is present, a 2-tree version of
    the model is actually fitted on a small subsample.

    The same ten draws must then be reproduced by:
      * a new generator built with the same integer seed,
      * that generator after its ``random_state`` is reset to the seed,
      * a new generator built from ``check_random_state(seed)``.

    Parameters
    ----------
    num_only : forwarded to ``get_automl_config``.
    specific_hyper : if truthy, restrict the RandomForest ``n_estimators``
        choices to ``[10, 20]`` through ``auto_ml_config.specific_hyper``.
    only_random_forest : if truthy, filter the config so that the only
        model is RandomForestClassifier.
    """
    n_draws = 10
    seed = 123
    rf_key = ("Model", ("Model", "RandomForestClassifier"))

    def _draw_all(generator):
        """Draw ``n_draws`` random graphs from ``generator``."""
        return [generator.draw_random_graph() for _ in range(n_draws)]

    def _unpack(draws):
        """Split draws into (node/edge views per graph, params, blocks).

        networkx graphs can't be compared directly with ``==`` (per the
        original author's note), so their node and edge views are compared
        instead.
        """
        graphs, params, blocks = zip(*draws)
        return [(g.nodes, g.edges) for g in graphs], params, blocks

    def _assert_same_draws(reference, draws):
        """Assert ``draws`` reproduce ``reference`` (an ``_unpack`` result).

        Three separate asserts, to isolate exactly what differs on failure.
        """
        ref_node_edges, ref_params, ref_blocks = reference
        node_edges, params, blocks = _unpack(draws)
        assert ref_node_edges == node_edges
        assert ref_params == params
        assert ref_blocks == blocks

    dfX, y, auto_ml_config = get_automl_config(num_only)

    if specific_hyper:
        auto_ml_config.specific_hyper = {
            ("Model", "RandomForestClassifier"): {"n_estimators": [10, 20]},
        }
    if only_random_forest:
        auto_ml_config.filter_models(Model="RandomForestClassifier")

    generator = RandomModelGenerator(auto_ml_config=auto_ml_config,
                                     random_state=seed)

    all_gen = []
    for _ in range(n_draws):
        drawn = generator.draw_random_graph()
        all_gen.append(drawn)

        # Shape of what the generator returns
        assert isinstance(drawn, tuple)
        assert len(drawn) == 3
        Graph, all_models_params, block_to_use = drawn

        for attribute in ("edges", "nodes"):
            assert hasattr(Graph, attribute)

        # Every node of the graph must come with its hyper-parameters
        assert isinstance(all_models_params, dict)
        for graph_node in Graph.nodes:
            assert graph_node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for block in block_to_use:
            assert block in TypeOfVariables.alls

        # The graph must be convertible to json code, then to a model
        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        for expected_key in ("name_mapping", "json_code"):
            assert expected_key in result

        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

        if only_random_forest:
            assert rf_key in all_models_params

        if specific_hyper and rf_key in all_models_params:
            assert all_models_params[rf_key]["n_estimators"] in (10, 20)

        if rf_key not in Graph.nodes:
            continue

        # A RandomForest was drawn: actually fit the pipeline, on a copy of
        # the params with only 2 estimators so that it stays fast.
        fast_params = deepcopy(all_models_params)
        fast_params[rf_key]["n_estimators"] = 2
        fast_result = convert_graph_to_code(Graph,
                                            fast_params,
                                            also_returns_mapping=True)
        sk_model = sklearn_model_from_param(fast_result["json_code"])

        # Up to 100 rows of each class (at least 20 observations are needed
        # to make sure all transformers work).
        class0_index = np.where(y == 0)[0][0:100]
        class1_index = np.where(y == 1)[0][0:100]
        sub_index = np.concatenate((class0_index, class1_index), axis=0)

        if hasattr(sk_model, "verbose"):
            sk_model.verbose = True
        sk_model.fit(dfX.iloc[sub_index, :], y[sub_index])

        yhat = sk_model.predict(dfX.head(2))
        assert yhat.shape == (2, )

    if not only_random_forest:
        # Check that RandomForest wasn't drawn every time
        assert any(rf_key not in drawn[1] for drawn in all_gen)

    ### Re-draw with a new generator built from the same seed ###
    generator = RandomModelGenerator(auto_ml_config=auto_ml_config,
                                     random_state=seed)
    all_gen2 = _draw_all(generator)

    all_graphs1, all_params1, all_blocks1 = zip(*all_gen)

    # The draws must not all be identical to each other
    assert not _all_same(all_params1)
    assert not _all_same(all_graphs1)
    if not num_only:
        # skipped when num_only: only one block in that case
        assert not _all_same(all_blocks1)

    reference = ([(g.nodes, g.edges) for g in all_graphs1],
                 all_params1,
                 all_blocks1)
    _assert_same_draws(reference, all_gen2)

    ### Re-draw after resetting the random state of the same generator ###
    generator.random_state = seed
    _assert_same_draws(reference, _draw_all(generator))

    ### Re-draw by passing a random state object instead of an int ###
    random_state = check_random_state(seed)
    generator = RandomModelGenerator(auto_ml_config=auto_ml_config,
                                     random_state=random_state)
    _assert_same_draws(reference, _draw_all(generator))
예제 #8
0
def test_RandomModelGenerator_iterator(type_of_iterator, num_only):
    """Check every graph yielded by the iterators of ``RandomModelGenerator``.

    Each yielded item must be a ``(Graph, all_models_params, block_to_use)``
    tuple whose graph has a single terminal node of category
    ``StepCategories.Model`` and which converts to json code and then to an
    object exposing ``fit``. For the "default" iterator, graphs containing a
    RandomForestClassifier are additionally fitted (with 2 estimators) on a
    20-row subsample.

    Parameters
    ----------
    type_of_iterator : one of "default", "block_search" or
        "block_search_random"; selects which iterator is tested.
    num_only : forwarded to ``get_automl_config``.

    Raises
    ------
    ValueError
        If ``type_of_iterator`` is not one of the supported values.
    """
    rf_key = ("Model", ("Model", "RandomForestClassifier"))

    dfX, y, auto_ml_config = get_automl_config(num_only)

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    if type_of_iterator == "default":
        iterator = random_model_generator.iterator_default_models()

    elif type_of_iterator == "block_search":
        iterator = random_model_generator.iterate_block_search(
            random_order=False)

    elif type_of_iterator == "block_search_random":
        iterator = random_model_generator.iterate_block_search(
            random_order=True)

    else:
        # Fail with an explicit message rather than with an
        # UnboundLocalError on 'iterator' a few lines below.
        raise ValueError("Unknown type_of_iterator: %r" %
                         (type_of_iterator, ))

    assert hasattr(iterator, "__iter__")

    # Check every item produced by the iterator
    for model in iterator:

        assert isinstance(model, tuple)
        assert len(model) == 3
        Graph, all_models_params, block_to_use = model

        # The graph must end on exactly one node, and it must be a Model
        terminal_nodes = get_terminal_nodes(Graph)
        assert len(terminal_nodes) == 1
        assert terminal_nodes[0][0] == StepCategories.Model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        # Every node of the graph must come with its hyper-parameters
        assert isinstance(all_models_params, dict)
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

        if type_of_iterator == "default" and rf_key in Graph.nodes:
            # In that case the model is actually fitted here, simplified to
            # 2 estimators (faster). The override goes on a copy so that the
            # dict yielded by the generator is left untouched.
            fast_params = dict(all_models_params)
            fast_params[rf_key] = dict(all_models_params[rf_key],
                                       n_estimators=2)

            result = convert_graph_to_code(Graph,
                                           fast_params,
                                           also_returns_mapping=True)
            sk_model = sklearn_model_from_param(result["json_code"])

            sub_index = np.concatenate(
                (np.where(y == 0)[0][0:10], np.where(y == 1)[0][0:10]), axis=0)
            # Needs at least 20 observations to make sure all transformers work
            sk_model.fit(dfX.iloc[sub_index, :], y[sub_index])

            yhat = sk_model.predict(dfX.head(2))
            assert yhat.shape == (2, )