def test_boxcox_target_transformer(self):
        """Check the three param syntaxes accepted for ``BoxCoxTargetTransformer``.

        For each syntax the test verifies that ``sklearn_model_from_param``
        builds a ``BoxCoxTargetTransformer`` wrapping a
        ``RandomForestClassifier``, that the param given to the function is
        left unmodified, and what ``param_from_sklearn_model`` gives back for
        the built model.
        """

        ## syntax 1 : wrapped model passed positionally ##
        param = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {}))
        param_c = copy.deepcopy(param)

        model = sklearn_model_from_param(param_c)
        assert isinstance(model, BoxCoxTargetTransformer)
        assert isinstance(model.model, RandomForestClassifier)
        assert param == param_c  # verif that the param was not modified inside function

        # rmk : only the name is compared, the reversed param differs from the
        # input because the RandomForest isn't explicitly passed with a named attribute
        param_reverse = param_from_sklearn_model(model)
        assert param_reverse[0] == param[0]

        ## syntax 2 : wrapped model passed positionally + dict of other arguments ##
        params = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {}), {
            "ll": 10
        })
        params_c = copy.deepcopy(params)

        model = sklearn_model_from_param(params_c)
        assert isinstance(model, BoxCoxTargetTransformer)
        assert isinstance(model.model, RandomForestClassifier)
        assert model.ll == 10
        assert params == params_c

        # rmk : same as syntax 1, only the name can be compared.
        # Compare against this section's own `params`, not syntax 1's `param`.
        param_reverse = param_from_sklearn_model(model)
        assert param_reverse[0] == params[0]

        ## syntax 3 : everything passed as named attributes ##
        params = ("BoxCoxTargetTransformer", {
            "model": ("RandomForestClassifier", {}),
            "ll": 10
        })
        params_c = copy.deepcopy(params)

        model = sklearn_model_from_param(params_c)

        assert isinstance(model, BoxCoxTargetTransformer)
        assert isinstance(model.model, RandomForestClassifier)
        assert model.ll == 10
        assert params == params_c

        # here the model is a named attribute : the reversed param must be
        # exactly the original one
        param_reverse = param_from_sklearn_model(model)
        assert param_reverse == params
Exemple #2
0
def test_RandomModelGenerator_default():
    """Check every default model proposed by ``RandomModelGenerator``.

    Each item yielded by ``iterator_default_models`` must be a
    ``(Graph, all_models_params, block_to_use)`` triple that
    ``convert_graph_to_code`` can turn into a json code, from which
    ``sklearn_model_from_param`` builds an object exposing ``fit``.
    """
    dfX, y, auto_ml_config = get_automl_config()

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    # verif iterator
    for model in random_model_generator.iterator_default_models():

        assert isinstance(model, tuple)
        assert len(model) == 3
        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # use `Graph.nodes` : the `Graph.node` alias was removed in networkx 2.4
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # separate name so the loop variable `model` isn't silently rebound
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")
    def test_graph_pipeline(self):
        """Round-trip a two-node GraphPipeline (svd -> logit) through its param form."""
        svd_spec = ("TruncatedSVDWrapper", {"n_components": 3})
        logit_spec = ("LogisticRegression", {"C": 10})
        param = (
            "GraphPipeline",
            {
                "models": {"svd": svd_spec, "logit": logit_spec},
                "edges": [("svd", "logit")],
            },
        )
        # snapshot taken before the call, to detect any in-place modification
        snapshot = copy.deepcopy(param)

        pipeline = sklearn_model_from_param(param)

        # param -> model
        assert isinstance(pipeline, GraphPipeline)
        assert isinstance(pipeline.models["logit"], LogisticRegression)
        assert isinstance(pipeline.models["svd"], TruncatedSVDWrapper)
        assert pipeline.models["svd"].n_components == 3

        # the input must be left untouched
        assert param == snapshot

        # model -> param gives the original param back
        rebuilt_param = param_from_sklearn_model(pipeline)
        assert rebuilt_param == param
Exemple #4
0
    def fit_command(self, job_ids):
        """Launch the final fit of one (or more) model(s).

        It can be executed using the 'fit' command keyword followed by
        '--job_ids ***'.

        For each job id, it will:
            * reload the data
            * rebuild the model from the stored job parameters
            * fit the model on all the data
            * save the pickled object, along with its json description

        Parameters
        ----------
        job_ids : iterable
            ids of the jobs whose model should be fitted

        Returns
        -------
        list of the fitted models, in the order of ``job_ids``
        """
        fitted_models = []
        for job_id in job_ids:
            print("fitting of job_id '%s'" % job_id)
            self.reload()

            job_param = self.data_persister.read(job_id, path="job_param", write_type=SavingType.json)
            estimator = sklearn_model_from_param(job_param["model_json"])
            print("start fitting...")

            # 'groups' is only forwarded when the estimator accepts it and groups are available
            fit_kwargs = {}
            if function_has_named_argument(estimator.fit, "groups") and self.groups is not None:
                fit_kwargs["groups"] = self.groups
            estimator.fit(self.dfX, self.y, **fit_kwargs)

            print("...model fitted!")

            self.data_persister.write(estimator, job_id, path="saved_models", write_type=SavingType.pickle)
            self.data_persister.write(job_param["model_json"], job_id, path="saved_models", write_type=SavingType.json)

            print("model persisted")

            fitted_models.append(estimator)

        return fitted_models
Exemple #5
0
    def test_graph_pipeline(self):
        """Build a GraphPipeline (svd -> logit) from its param and check its nodes."""
        models_spec = {
            "svd": ("TruncatedSVDWrapper", {"n_components": 2}),
            "logit": ("LogisticRegression", {"C": 10}),
        }
        param3 = (
            "GraphPipeline",
            {"models": models_spec, "edges": [("svd", "logit")]},
        )
        # snapshot taken before the call, to detect any in-place modification
        param3_c = copy.deepcopy(param3)

        pipeline = sklearn_model_from_param(param3)

        assert isinstance(pipeline, GraphPipeline)
        assert isinstance(pipeline.models["logit"], LogisticRegression)
        assert isinstance(pipeline.models["svd"], TruncatedSVDWrapper)

        # verif that param was not modified inside function
        assert param3 == param3_c
    def test_stacking_classifier(self):
        """Round-trip a StackerClassifier (2 base models + blender) through its param form."""
        base_models = [("RandomForestClassifier", {}), ("ExtraTreesClassifier", {})]
        params = (
            "StackerClassifier",
            {
                "models": base_models,
                "cv": 5,
                "blender": ("LogisticRegression", {}),
            },
        )

        # the function is fed a deep copy, `params` itself is kept as the reference
        params_c = copy.deepcopy(params)
        stacker = sklearn_model_from_param(params_c)

        assert isinstance(stacker, StackerClassifier)
        assert len(stacker.models) == 2
        first, second = stacker.models[0], stacker.models[1]
        assert isinstance(first, RandomForestClassifier)
        assert isinstance(second, ExtraTreesClassifier)
        assert isinstance(stacker.blender, LogisticRegression)
        assert stacker.cv == 5

        # everything is passed with named attributes : the reversed param
        # must be exactly the reference one
        param_reverse = param_from_sklearn_model(stacker)
        assert param_reverse == params
    def test_graph_pipeline_list(self):
        """Round-trip a GraphPipeline whose param uses lists where tuples are usual.

        Edges, ``columns_to_use`` and ``ngram_range`` are given as lists
        (not tuples).
        """
        encoder_spec = (
            "NumericalEncoder",
            {
                "columns_to_use": ["^BLOCK_", "^NUMBERTOKEN_", "^DATETOKEN_", "^CURRENCYTOKEN_"],
                "regex_match": True,
            },
        )
        vect_spec = (
            "CountVectorizerWrapper",
            {
                "analyzer": "char",
                "columns_to_use": ["STRINGLEFTOF", "STRINGABOVEOF"],
                "ngram_range": [1, 4],
            },
        )
        param = (
            "GraphPipeline",
            {
                "edges": [["encoder", "imputer", "rf"], ["vect", "svd", "rf"]],
                "models": {
                    "encoder": encoder_spec,
                    "imputer": ("NumImputer", {}),
                    "rf": ("RandomForestClassifier", {"n_estimators": 500}),
                    "svd": ("TruncatedSVDWrapper", {"n_components": 200}),
                    "vect": vect_spec,
                },
            },
        )
        # snapshot taken before the call, to detect any in-place modification
        snapshot = copy.deepcopy(param)

        pipeline = sklearn_model_from_param(param)

        # param -> model : every node has the expected class
        assert isinstance(pipeline, GraphPipeline)
        assert isinstance(pipeline.models["encoder"], NumericalEncoder)
        assert isinstance(pipeline.models["imputer"], NumImputer)
        assert isinstance(pipeline.models["vect"], CountVectorizerWrapper)
        assert isinstance(pipeline.models["svd"], TruncatedSVDWrapper)
        assert isinstance(pipeline.models["rf"], RandomForestClassifier)

        # the input must be left untouched
        assert param == snapshot

        # model -> param gives the original param back
        rebuilt_param = param_from_sklearn_model(pipeline)
        assert rebuilt_param == param
Exemple #8
0
    def test_boxcox_target_transformer(self):
        """Build a BoxCoxTargetTransformer from each of the three param syntaxes.

        Every case must give a ``BoxCoxTargetTransformer`` wrapping a
        ``RandomForestClassifier`` and must leave the param untouched ; when
        the param sets ``ll``, its value is checked on the built model.
        """
        forest_spec = ("RandomForestClassifier", {})

        # (params, whether 'll' is set to 10 in the params)
        cases = [
            ## syntax 1 ##
            (("BoxCoxTargetTransformer", forest_spec), False),
            ## syntax 2 ##
            (("BoxCoxTargetTransformer", forest_spec, {"ll": 10}), True),
            ## syntax 3 ##
            (("BoxCoxTargetTransformer", {"model": forest_spec, "ll": 10}), True),
        ]

        for params, sets_ll in cases:
            params_c = copy.deepcopy(params)

            transformer = sklearn_model_from_param(params_c)

            assert isinstance(transformer, BoxCoxTargetTransformer)
            assert isinstance(transformer.model, RandomForestClassifier)
            if sets_ll:
                assert transformer.ll == 10
            assert params == params_c
Exemple #9
0
    def test_logistic_regression(self):
        """Build a LogisticRegression from its param and check ``C`` is applied."""
        from sklearn.linear_model import LogisticRegression

        hyper = {"C": 10}
        param2 = ("LogisticRegression", hyper)
        # snapshot taken before the call, to detect any in-place modification
        param2_c = copy.deepcopy(param2)

        logit = sklearn_model_from_param(param2)

        assert isinstance(logit, LogisticRegression)
        assert logit.C == 10

        # verif that param was not modified inside function
        assert param2 == param2_c
Exemple #10
0
def test_RandomModelGenerator_iterator(type_of_iterator, num_only):
    """Check the models yielded by each iterator of ``RandomModelGenerator``.

    Parameters
    ----------
    type_of_iterator : str
        one of "default", "block_search", "block_search_random" ;
        selects which iterator of the generator is tested
    num_only : bool-like
        forwarded to ``get_automl_config``

    Raises
    ------
    ValueError
        if ``type_of_iterator`` isn't one of the supported values
    """
    dfX, y, auto_ml_config = get_automl_config(num_only)

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    if type_of_iterator == "default":
        iterator = random_model_generator.iterator_default_models()

    elif type_of_iterator == "block_search":
        iterator = random_model_generator.iterate_block_search(
            random_order=False)

    elif type_of_iterator == "block_search_random":
        iterator = random_model_generator.iterate_block_search(
            random_order=True)

    else:
        # fail with an explicit message rather than with an unbound `iterator`
        raise ValueError("unknown type_of_iterator: %r" % (type_of_iterator, ))

    assert hasattr(iterator, "__iter__")

    # verif iterator
    for model in iterator:

        assert isinstance(model, tuple)
        assert len(model) == 3
        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # use `Graph.nodes` : the `Graph.node` alias was removed in networkx 2.4
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # separate name so the loop variable `model` isn't silently rebound
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")
Exemple #11
0
    def test_random_forest(self):
        """Build a RandomForestClassifier from its param and check ``n_estimators``."""
        hyper = {"n_estimators": 100, "criterion": "entropy"}
        param1 = ("RandomForestClassifier", hyper)
        # snapshot taken before the call, to detect any in-place modification
        param1_c = copy.deepcopy(param1)

        forest = sklearn_model_from_param(param1)

        assert isinstance(forest, RandomForestClassifier)
        assert forest.n_estimators == 100

        # verif that param was not modified inside function
        assert param1 == param1_c
    def test_random_forest(self):
        """Round-trip a RandomForestClassifier through its param form."""
        hyper = {"n_estimators": 150, "criterion": "entropy"}
        param = ("RandomForestClassifier", hyper)
        # snapshot taken before the call, to detect any in-place modification
        snapshot = copy.deepcopy(param)

        forest = sklearn_model_from_param(param)

        # param -> model
        assert isinstance(forest, RandomForestClassifier)
        assert forest.n_estimators == 150

        # verif that param was not modified inside function
        assert param == snapshot

        # model -> param gives the original param back
        rebuilt_param = param_from_sklearn_model(forest)
        assert rebuilt_param == param
Exemple #13
0
    def test_stacking_classifier(self):
        """Build a StackerClassifier (2 base models + blender) from its param."""
        base_models = [("RandomForestClassifier", {}), ("ExtraTreesClassifier", {})]
        params = (
            "StackerClassifier",
            {
                "models": base_models,
                "cv": 5,
                "blender": ("LogisticRegression", {}),
            },
        )

        # the function is fed a deep copy of the params
        params_c = copy.deepcopy(params)
        stacker = sklearn_model_from_param(params_c)

        assert isinstance(stacker, StackerClassifier)
        assert len(stacker.models) == 2
        first, second = stacker.models[0], stacker.models[1]
        assert isinstance(first, RandomForestClassifier)
        assert isinstance(second, ExtraTreesClassifier)
        assert isinstance(stacker.blender, LogisticRegression)
        assert stacker.cv == 5
Exemple #14
0
    def boxcox_and_graphpipeline(self):
        """Build a GraphPipeline whose final node wraps another GraphPipeline.

        The outer pipeline is NumericalEncoder -> BoxCoxTargetTransformer,
        and the BoxCoxTargetTransformer itself wraps an inner pipeline
        KMeansTransformer -> RandomForestClassifier.

        NOTE(review): the name has no ``test_`` prefix, so standard test
        discovery presumably doesn't run it -- confirm whether that is intended.
        """
        inner_pipeline = (
            "GraphPipeline",
            {
                "edges": [("KMeansTransformer", "RandomForestClassifier")],
                "models": {
                    "KMeansTransformer": ("KMeansTransformer", {"n_clusters": 10}),
                    "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 10}),
                },
            },
        )
        params = (
            "GraphPipeline",
            {
                "edges": [("NumericalEncoder", "BoxCoxTargetTransformer")],
                "models": {
                    "BoxCoxTargetTransformer": ("BoxCoxTargetTransformer", inner_pipeline, {"ll": 10}),
                    "NumericalEncoder": ("NumericalEncoder", {}),
                },
            },
        )

        # the function is fed a deep copy, `params` itself is kept as the reference
        params_c = copy.deepcopy(params)
        outer = sklearn_model_from_param(params_c)

        # outer pipeline
        assert isinstance(outer, GraphPipeline)
        assert len(outer.models) == 2

        assert "NumericalEncoder" in outer.models
        assert isinstance(outer.models["NumericalEncoder"], NumericalEncoder)

        assert "BoxCoxTargetTransformer" in outer.models
        boxcox = outer.models["BoxCoxTargetTransformer"]
        assert isinstance(boxcox, BoxCoxTargetTransformer)

        # inner pipeline, wrapped by the target transformer
        inner = boxcox.model
        assert isinstance(inner, GraphPipeline)

        expected_inner_nodes = {"KMeansTransformer", "RandomForestClassifier"}
        assert set(inner.models.keys()) == expected_inner_nodes

        assert isinstance(inner.models["KMeansTransformer"], KMeansTransformer)
        assert isinstance(inner.models["RandomForestClassifier"], RandomForestClassifier)

        # the param given to the function must be left untouched
        assert params == params_c
Exemple #15
0
def test_RandomModelGenerator_random():
    """Check ``draw_random_graph`` : validity of the draws and reproducibility.

    First, 10 random draws are validated one by one (shape of the triple,
    conversion to json code, creation of the model).
    Then the same 10 draws must be reproduced when the seed 123 is re-used :
        * with a new generator created with the same seed
        * after resetting ``random_state`` on the existing generator
        * with a new generator given a random state object built from the same seed
    """
    dfX, y, auto_ml_config = get_automl_config()

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    all_gen = []
    for _ in range(10):
        model = random_model_generator.draw_random_graph()
        all_gen.append(model)

        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # use `Graph.nodes` : the `Graph.node` alias was removed in networkx 2.4
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # separate name so the drawn triple `model` isn't silently rebound
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

    ### re-draw the same thing with a new generator using the SAME seed ###
    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)
    all_gen2 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs1, all_params1, all_blocks1 = zip(*all_gen)
    all_graphs2, all_params2, all_blocks2 = zip(*all_gen2)

    # the draws must actually vary from one to the other
    assert not _all_same(all_params1)
    assert not _all_same(all_graphs1)
    assert not _all_same(all_blocks1)

    all_graphs1_node_edges = [(g.nodes, g.edges) for g in all_graphs1]
    all_graphs2_node_edges = [(g.nodes, g.edges) for g in all_graphs2]
    # I need to test equality of nodes and edges ... directly == on networkx graph doesn't work

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs2_node_edges
    assert all_params1 == all_params2
    assert all_blocks1 == all_blocks2

    ### re-draw by resetting generator ###
    random_model_generator.random_state = 123
    all_gen3 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs3, all_params3, all_blocks3 = zip(*all_gen3)
    all_graphs3_node_edges = [(g.nodes, g.edges) for g in all_graphs3]

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs3_node_edges
    assert all_params1 == all_params3
    assert all_blocks1 == all_blocks3

    ### re-draw by passing a random state object
    random_state = check_random_state(123)
    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=random_state)
    all_gen4 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs4, all_params4, all_blocks4 = zip(*all_gen4)
    all_graphs4_node_edges = [(g.nodes, g.edges) for g in all_graphs4]

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs4_node_edges
    assert all_params1 == all_params4
    assert all_blocks1 == all_blocks4
Exemple #16
0
 def _load_model(self, job_id):
     """Rebuild the model of ``job_id`` from the 'model_code' entry of its stored job json."""
     model_code = self.storage.load_special_json(job_id, 'jobs')['model_code']
     return sklearn_model_from_param(model_code)
Exemple #17
0
def test_RandomModelGenerator_iterator(type_of_iterator, num_only):
    """Check the models yielded by each iterator of ``RandomModelGenerator``.

    Every yielded graph must have a single terminal node, of category
    ``StepCategories.Model``, and must be convertible into a model exposing
    ``fit``. For the "default" iterator, graphs containing a
    RandomForestClassifier are also fitted on a small sample.

    Parameters
    ----------
    type_of_iterator : str
        one of "default", "block_search", "block_search_random" ;
        selects which iterator of the generator is tested
    num_only : bool-like
        forwarded to ``get_automl_config``

    Raises
    ------
    ValueError
        if ``type_of_iterator`` isn't one of the supported values
    """
    # local import : keeps this block self-contained
    from copy import deepcopy

    dfX, y, auto_ml_config = get_automl_config(num_only)

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    if type_of_iterator == "default":
        iterator = random_model_generator.iterator_default_models()

    elif type_of_iterator == "block_search":
        iterator = random_model_generator.iterate_block_search(
            random_order=False)

    elif type_of_iterator == "block_search_random":
        iterator = random_model_generator.iterate_block_search(
            random_order=True)

    else:
        # fail with an explicit message rather than with an unbound `iterator`
        raise ValueError("unknown type_of_iterator: %r" % (type_of_iterator, ))

    assert hasattr(iterator, "__iter__")

    # key of the RandomForest node, in the graph and in the params dict
    rf_key = ('Model', ('Model', 'RandomForestClassifier'))

    # verif iterator
    for model in iterator:

        assert isinstance(model, tuple)
        assert len(model) == 3
        Graph, all_models_params, block_to_use = model

        terminal_nodes = get_terminal_nodes(Graph)
        assert len(terminal_nodes) == 1
        assert terminal_nodes[0][0] == StepCategories.Model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

        if type_of_iterator == "default" and rf_key in Graph.nodes:
            # in that case I'll actually do the fitting here
            # I'll simplify the model to have 2 estimators (faster)
            # ... on a copy, so the params yielded by the generator aren't modified
            all_models_params_copy = deepcopy(all_models_params)
            all_models_params_copy[rf_key]["n_estimators"] = 2
            result = convert_graph_to_code(Graph,
                                           all_models_params_copy,
                                           also_returns_mapping=True)
            sk_model = sklearn_model_from_param(result["json_code"])

            # 10 first observations of each class
            sub_index = np.concatenate(
                (np.where(y == 0)[0][0:10], np.where(y == 1)[0][0:10]), axis=0)
            # Needs at least 20 observations to make sure all transformers works
            sk_model.fit(dfX.iloc[sub_index, :], y[sub_index])

            yhat = sk_model.predict(dfX.head(2))
            assert yhat.shape == (2, )
Exemple #18
0
def test_RandomModelGenerator_random(num_only, specific_hyper,
                                     only_random_forest):
    """Check ``draw_random_graph`` : validity of the draws and reproducibility.

    First, 10 random draws are validated one by one (shape of the triple,
    conversion to json code, creation of the model) ; draws containing a
    RandomForestClassifier are also fitted on a small sample.
    Then the same 10 draws must be reproduced when the seed 123 is re-used :
        * with a new generator created with the same seed
        * after resetting ``random_state`` on the existing generator
        * with a new generator given a random state object built from the same seed

    Parameters
    ----------
    num_only : bool-like
        forwarded to ``get_automl_config``
    specific_hyper : bool
        if True, restricts 'n_estimators' of the RandomForestClassifier to
        [10, 20] and checks that the drawn values respect it
    only_random_forest : bool
        if True, filters the models to keep only RandomForestClassifier and
        checks that it is part of every draw
    """
    dfX, y, auto_ml_config = get_automl_config(num_only)

    if specific_hyper:
        auto_ml_config.specific_hyper = {
            ("Model", "RandomForestClassifier"): {
                "n_estimators": [10, 20]
            }
        }

    if only_random_forest:
        auto_ml_config.filter_models(Model="RandomForestClassifier")

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    # key of the RandomForest node, in the graph and in the params dict
    # (defined once here because it is also needed after the loop)
    rf_key = ("Model", ("Model", "RandomForestClassifier"))

    all_gen = []
    for _ in range(10):
        model = random_model_generator.draw_random_graph()
        all_gen.append(model)

        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

        if only_random_forest:
            assert rf_key in all_models_params

        if specific_hyper:
            if rf_key in all_models_params:
                assert all_models_params[rf_key]["n_estimators"] in (10, 20)

        if rf_key in Graph.nodes:
            # in that case I'll actually do the fitting here
            # I'll simplify the model to have 2 estimators (faster)
            # ... on a copy, so the drawn params kept in `all_gen` aren't modified
            all_models_params_copy = deepcopy(all_models_params)
            all_models_params_copy[rf_key]["n_estimators"] = 2
            result = convert_graph_to_code(Graph,
                                           all_models_params_copy,
                                           also_returns_mapping=True)
            sk_model = sklearn_model_from_param(result["json_code"])

            # 100 first observations of each class
            sub_index = np.concatenate(
                (np.where(y == 0)[0][0:100], np.where(y == 1)[0][0:100]),
                axis=0)
            # Needs at least 20 observations to make sure all transformers works
            if hasattr(sk_model, "verbose"):
                sk_model.verbose = True
            sk_model.fit(dfX.iloc[sub_index, :], y[sub_index])

            yhat = sk_model.predict(dfX.head(2))
            assert yhat.shape == (2, )

    if not only_random_forest:
        # Check that RandomForest wasn't drawn every time
        assert any(rf_key not in m[1] for m in all_gen)

    ### re-draw the same thing with a new generator using the SAME seed ###
    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)
    all_gen2 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs1, all_params1, all_blocks1 = zip(*all_gen)
    all_graphs2, all_params2, all_blocks2 = zip(*all_gen2)

    # the draws must actually vary from one to the other
    assert not _all_same(all_params1)
    assert not _all_same(all_graphs1)
    if not num_only:
        # not checked when num_only : only one block
        assert not _all_same(all_blocks1)

    all_graphs1_node_edges = [(g.nodes, g.edges) for g in all_graphs1]
    all_graphs2_node_edges = [(g.nodes, g.edges) for g in all_graphs2]
    # I need to test equality of nodes and edges ... directly == on networkx graph doesn't work

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs2_node_edges
    assert all_params1 == all_params2
    assert all_blocks1 == all_blocks2

    ### re-draw by resetting generator ###
    random_model_generator.random_state = 123
    all_gen3 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs3, all_params3, all_blocks3 = zip(*all_gen3)
    all_graphs3_node_edges = [(g.nodes, g.edges) for g in all_graphs3]

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs3_node_edges
    assert all_params1 == all_params3
    assert all_blocks1 == all_blocks3

    ### re-draw by passing a random state object
    random_state = check_random_state(123)
    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=random_state)
    all_gen4 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs4, all_params4, all_blocks4 = zip(*all_gen4)
    all_graphs4_node_edges = [(g.nodes, g.edges) for g in all_graphs4]

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs4_node_edges
    assert all_params1 == all_params4
    assert all_blocks1 == all_blocks4
def _assert_param_roundtrip(model):
    """Rebuild ``model`` from its param form, check its class is kept, return the rebuilt model."""
    rebuilt = sklearn_model_from_param(param_from_sklearn_model(model))
    assert isinstance(rebuilt, model.__class__)
    return rebuilt


def _assert_json_serializable(param):
    """Check that ``param`` can be dumped to a json string."""
    assert isinstance(json.dumps(param), str)


def test_param_from_sklearn_model():
    """Check ``param_from_sklearn_model`` on simple, composed and pipeline models.

    For each model the test compares the simplified param
    (``simplify_default=True``) with the expected one, then checks that the
    model can be rebuilt from its param and (for most cases) that the param
    is json serializable.
    """
    # simple RandomForest
    model = RandomForestClassifier(n_estimators=250)
    # makes sure 250 isn't the default, otherwise it would be simplified away
    assert RandomForestClassifier().get_params()["n_estimators"] != 250
    assert param_from_sklearn_model(
        model, simplify_default=True) == ('RandomForestClassifier', {
            'n_estimators': 250
        })
    param = param_from_sklearn_model(model, simplify_default=False)
    assert isinstance(param, tuple)
    assert len(param) == 2
    assert param[0] == "RandomForestClassifier"

    _assert_param_roundtrip(model)
    _assert_json_serializable(param)

    # Composition model : BoxCoxTargetTransformer of RandomForestClassifier
    # with ll=0 : 'll' is expected to be absent from the simplified param
    model = BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250),
                                    ll=0)
    param = param_from_sklearn_model(model, simplify_default=True)
    assert param == ('BoxCoxTargetTransformer', {
        'model': ('RandomForestClassifier', {
            'n_estimators': 250
        })
    })

    _assert_param_roundtrip(model)
    _assert_json_serializable(param)

    # Composition model : BoxCoxTargetTransformer of RandomForestClassifier
    # with ll=1 : 'll' is expected to be kept in the simplified param
    model = BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250),
                                    ll=1)
    param = param_from_sklearn_model(model, simplify_default=True)
    assert param == ('BoxCoxTargetTransformer', {
        'll': 1,
        'model': ('RandomForestClassifier', {
            'n_estimators': 250
        })
    })
    _assert_json_serializable(param)
    _assert_param_roundtrip(model)

    # Pipeline
    model = Pipeline([("enc", NumericalEncoder()),
                      ("forest", RandomForestClassifier(n_estimators=250))])
    param = param_from_sklearn_model(model, simplify_default=True)
    assert param == ('Pipeline', {
        'steps': [('enc', ('NumericalEncoder', {})),
                  ('forest', ('RandomForestClassifier', {
                      'n_estimators': 250
                  }))]
    })

    _assert_param_roundtrip(model)
    _assert_json_serializable(param)

    # GraphPipeline
    model = GraphPipeline(models={
        "enc": NumericalEncoder(),
        "forest": RandomForestClassifier(n_estimators=250)
    },
                          edges=[("enc", "forest")])

    param = param_from_sklearn_model(model, simplify_default=True)
    assert param == ('GraphPipeline', {
        'models': {
            'enc': ('NumericalEncoder', {}),
            'forest': ('RandomForestClassifier', {
                'n_estimators': 250
            })
        },
        'edges': [('enc', 'forest')]
    })

    _assert_param_roundtrip(model)

    # GraphPipeline with verbose = True
    model = GraphPipeline(models={
        "enc": NumericalEncoder(),
        "forest": RandomForestClassifier(n_estimators=250)
    },
                          edges=[("enc", "forest")],
                          verbose=True)

    param = param_from_sklearn_model(model, simplify_default=True)
    assert param == ('GraphPipeline', {
        'models': {
            'enc': ('NumericalEncoder', {}),
            'forest': ('RandomForestClassifier', {
                'n_estimators': 250
            })
        },
        'edges': [('enc', 'forest')],
        'verbose': True
    })

    _assert_json_serializable(param)

    # 'verbose' must survive the round trip
    model2 = _assert_param_roundtrip(model)
    assert model2.verbose is True

    # GraphPipeline + composition
    model = GraphPipeline(models={
        "enc":
        NumericalEncoder(),
        "forest":
        BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=1)
    },
                          edges=[("enc", "forest")])

    param = param_from_sklearn_model(model, simplify_default=True)
    assert param == ('GraphPipeline', {
        'edges': [('enc', 'forest')],
        'models': {
            'enc': ('NumericalEncoder', {}),
            'forest': ('BoxCoxTargetTransformer', {
                'll':
                1,
                'model': ('RandomForestClassifier', {
                    'n_estimators': 250
                })
            })
        }
    })

    _assert_param_roundtrip(model)
    _assert_json_serializable(param)
Exemple #20
0
def test_RandomModelGenerator_random(num_only, specific_hyper,
                                     only_random_forest):
    """Check random graph drawing and the reproducibility of the draws.

    Ten graphs are drawn from a ``RandomModelGenerator`` seeded with 123.
    Each draw must be a 3-item tuple ``(Graph, all_models_params,
    block_to_use)`` that can be converted to json code and then to a model
    exposing ``fit``. The same ten draws must then be obtained again when:

    * a new generator is built with the same seed,
    * the ``random_state`` attribute of an existing generator is reset,
    * the object returned by ``check_random_state(123)`` is passed as seed.

    Parameters
    ----------
    num_only
        Forwarded to ``get_automl_config``. When truthy, the check that the
        drawn blocks are not all identical is skipped.
    specific_hyper
        When truthy, ``n_estimators`` of the RandomForestClassifier is
        restricted to ``[10, 20]`` and drawn values are checked against it.
    only_random_forest
        When truthy, models are filtered down to RandomForestClassifier and
        every draw must contain it.
    """

    dfX, y, auto_ml_config = get_automl_config(num_only)

    if specific_hyper:
        auto_ml_config.specific_hyper = {
            ('Model', 'RandomForestClassifier'): {
                "n_estimators": [10, 20]
            }
        }

    if only_random_forest:
        auto_ml_config.filter_models(Model='RandomForestClassifier')

    # Key looked up in `all_models_params` to detect a RandomForest draw.
    # Defined before the loop because it is also used once the loop is over.
    rf_key = ('Model', ('Model', 'RandomForestClassifier'))

    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)

    all_gen = []
    for _ in range(10):
        model = random_model_generator.draw_random_graph()
        all_gen.append(model)

        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # `Graph.node` was removed in networkx 2.4; `Graph.nodes` is the
        # supported accessor and the one used by the rest of this test.
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph,
                                       all_models_params,
                                       also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        model = sklearn_model_from_param(result["json_code"])
        assert hasattr(model, "fit")

        if only_random_forest:
            assert rf_key in all_models_params

        if specific_hyper and rf_key in all_models_params:
            assert all_models_params[rf_key]["n_estimators"] in (10, 20)

    if not only_random_forest:
        # Check that RandomForest wasn't drawn every time
        assert any(rf_key not in m[1] for m in all_gen)

    ### re-draw the same thing with a new generator using the same seed ###
    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=123)
    all_gen2 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs1, all_params1, all_blocks1 = zip(*all_gen)
    all_graphs2, all_params2, all_blocks2 = zip(*all_gen2)

    # The draws must actually vary, otherwise the comparisons below are trivial
    assert not _all_same(all_params1)
    assert not _all_same(all_graphs1)
    if not num_only:
        # skipped when num_only: there is only one block in that case
        assert not _all_same(all_blocks1)

    all_graphs1_node_edges = [(g.nodes, g.edges) for g in all_graphs1]
    all_graphs2_node_edges = [(g.nodes, g.edges) for g in all_graphs2]
    # I need to test equality of nodes and edges ... directly == on networkx graph doesn't work

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs2_node_edges
    assert all_params1 == all_params2
    assert all_blocks1 == all_blocks2

    ### re-draw by resetting generator ###
    random_model_generator.random_state = 123
    all_gen3 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs3, all_params3, all_blocks3 = zip(*all_gen3)
    all_graphs3_node_edges = [(g.nodes, g.edges) for g in all_graphs3]
    # I need to test equality of nodes and edges ... directly == on networkx graph doesn't work

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs3_node_edges
    assert all_params1 == all_params3
    assert all_blocks1 == all_blocks3

    ### Re-draw by passing a random state object instead of an integer seed ###
    random_state = check_random_state(123)
    random_model_generator = RandomModelGenerator(
        auto_ml_config=auto_ml_config, random_state=random_state)
    all_gen4 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs4, all_params4, all_blocks4 = zip(*all_gen4)
    all_graphs4_node_edges = [(g.nodes, g.edges) for g in all_graphs4]
    # I need to test equality of nodes and edges ... directly == on networkx graph doesn't work

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs4_node_edges
    assert all_params1 == all_params4
    assert all_blocks1 == all_blocks4
    def boxcox_and_graphpipeline(self):
        """Build a GraphPipeline whose BoxCoxTargetTransformer wraps a GraphPipeline.

        The outer pipeline is described as a nested param tuple and converted
        with ``sklearn_model_from_param``. The test checks the type of every
        level of the resulting model, checks that the description given to the
        converter is left unmodified, and checks that converting the model back
        with ``param_from_sklearn_model`` yields the same top-level name.
        """

        # Innermost pipeline: KMeansTransformer -> RandomForestClassifier
        inner_pipeline_param = (
            "GraphPipeline",
            {
                "edges": [("KMeansTransformer", "RandomForestClassifier")],
                "models": {
                    "KMeansTransformer": ("KMeansTransformer", {
                        "n_clusters": 10
                    }),
                    "RandomForestClassifier": ("RandomForestClassifier", {
                        "n_estimators": 10
                    }),
                },
            },
        )

        # The wrapped model is given positionally, followed by the keyword dict
        boxcox_param = (
            "BoxCoxTargetTransformer",
            inner_pipeline_param,
            {
                "ll": 10
            },
        )

        expected_param = (
            "GraphPipeline",
            {
                "edges": [("NumericalEncoder", "BoxCoxTargetTransformer")],
                "models": {
                    "BoxCoxTargetTransformer": boxcox_param,
                    "NumericalEncoder": ("NumericalEncoder", {}),
                },
            },
        )

        # Convert a deep copy so that any mutation of the input can be detected
        working_param = copy.deepcopy(expected_param)
        pipeline = sklearn_model_from_param(working_param)

        # --- outer pipeline --- #
        assert isinstance(pipeline, GraphPipeline)
        assert len(pipeline.models) == 2

        assert "NumericalEncoder" in pipeline.models
        assert isinstance(pipeline.models["NumericalEncoder"], NumericalEncoder)

        assert "BoxCoxTargetTransformer" in pipeline.models
        boxcox = pipeline.models["BoxCoxTargetTransformer"]
        assert isinstance(boxcox, BoxCoxTargetTransformer)

        # --- pipeline wrapped by the target transformer --- #
        nested_pipeline = boxcox.model
        assert isinstance(nested_pipeline, GraphPipeline)

        assert set(nested_pipeline.models.keys()) == {
            "KMeansTransformer",
            "RandomForestClassifier",
        }

        expected_step_classes = (
            ("KMeansTransformer", KMeansTransformer),
            ("RandomForestClassifier", RandomForestClassifier),
        )
        for step_name, step_class in expected_step_classes:
            assert isinstance(nested_pipeline.models[step_name], step_class)

        # The description handed to the converter must not have been modified
        assert expected_param == working_param

        # rmk : the reverse param differs from the original one because the
        # wrapped model isn't explicitly passed with a named attribute, so only
        # the top-level name is compared
        reverse_param = param_from_sklearn_model(pipeline)

        assert reverse_param[0] == expected_param[0]