def test_boxcox_target_transformer(self):
    """Build a BoxCoxTargetTransformer from each accepted param syntax.

    For every syntax the test checks the type of the created wrapper and of
    its inner model, that the param given to ``sklearn_model_from_param`` is
    left unmodified, and what ``param_from_sklearn_model`` gives back for
    the created model.
    """
    ## syntax 1 : (name, inner model param) ##
    params = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {}))
    params_c = copy.deepcopy(params)

    model = sklearn_model_from_param(params_c)
    assert isinstance(model, BoxCoxTargetTransformer)
    assert isinstance(model.model, RandomForestClassifier)
    assert params == params_c  # the input must not be modified by the call

    # rmk : only the name is compared, the reversed param differs from the
    # input because the RandomForest isn't explicitly passed with a named attribute
    param_reverse = param_from_sklearn_model(model)
    assert param_reverse[0] == params[0]

    ## syntax 2 : (name, inner model param, keyword arguments) ##
    params = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {}), {"ll": 10})
    params_c = copy.deepcopy(params)

    model = sklearn_model_from_param(params_c)
    assert isinstance(model, BoxCoxTargetTransformer)
    assert isinstance(model.model, RandomForestClassifier)
    assert model.ll == 10
    assert params == params_c

    # Compare against this section's own ``params`` (not the one of syntax 1);
    # same remark as above, only the name is expected to match.
    param_reverse = param_from_sklearn_model(model)
    assert param_reverse[0] == params[0]

    ## syntax 3 : (name, keyword arguments including the inner model) ##
    params = ("BoxCoxTargetTransformer", {"model": ("RandomForestClassifier", {}), "ll": 10})
    params_c = copy.deepcopy(params)

    model = sklearn_model_from_param(params_c)
    assert isinstance(model, BoxCoxTargetTransformer)
    assert isinstance(model.model, RandomForestClassifier)
    assert model.ll == 10
    assert params == params_c

    # Here the inner model is passed by name, so the full round trip is asserted.
    param_reverse = param_from_sklearn_model(model)
    assert param_reverse == params
def test_RandomModelGenerator_default():
    """Check every model yielded by ``iterator_default_models``.

    Each yielded item must be a ``(Graph, all_models_params, block_to_use)``
    tuple, and ``convert_graph_to_code`` must produce a ``json_code`` from
    which ``sklearn_model_from_param`` builds an object with a ``fit`` attribute.
    """
    _, _, auto_ml_config = get_automl_config()

    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

    # verif iterator
    for model in random_model_generator.iterator_default_models():
        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # ``Graph.nodes`` rather than ``Graph.node``: the singular alias was
        # removed from networkx, and ``nodes`` is what is asserted to exist above.
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # Separate name so the loop variable ``model`` isn't rebound.
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")
def test_graph_pipeline(self):
    """Round-trip a two-node GraphPipeline ("svd" -> "logit") through both converters."""
    #####################
    ### GraphPipeline ###
    #####################
    nodes_spec = {
        "svd": ("TruncatedSVDWrapper", {"n_components": 3}),
        "logit": ("LogisticRegression", {"C": 10}),
    }
    param = ("GraphPipeline", {"models": nodes_spec, "edges": [("svd", "logit")]})
    # Snapshot taken before the call, to detect any in-place modification.
    param_c = copy.deepcopy(param)

    pipeline = sklearn_model_from_param(param)
    assert isinstance(pipeline, GraphPipeline)

    built = pipeline.models
    assert isinstance(built["logit"], LogisticRegression)
    assert isinstance(built["svd"], TruncatedSVDWrapper)
    assert built["svd"].n_components == 3

    assert param == param_c

    # The reverse conversion must give back the original param.
    assert param_from_sklearn_model(pipeline) == param
def fit_command(self, job_ids):
    """Run the final fit of one (or more) model(s).

    Per the original description, this is executed using the 'fit' command
    keyword followed by '--job_ids ***'. For each job id it:
        * calls ``self.reload()`` (reload the data)
        * reads the job parameters and builds the model from their "model_json" entry
        * fits the model on ``self.dfX`` / ``self.y`` (passing ``groups`` when
          the model's ``fit`` accepts it and ``self.groups`` is not None)
        * persists the fitted model (pickle) and its json description

    Returns the list of fitted models, in the order of ``job_ids``.
    """
    fitted_models = []
    for job_id in job_ids:
        print("fitting of job_id '%s'" % job_id)

        self.reload()

        job_param = self.data_persister.read(job_id, path="job_param", write_type=SavingType.json)
        model_json = job_param["model_json"]
        model = sklearn_model_from_param(model_json)

        print("start fitting...")
        # Only forward the groups when the estimator supports them and some are available.
        use_groups = function_has_named_argument(model.fit, "groups") and self.groups is not None
        if not use_groups:
            model.fit(self.dfX, self.y)
        else:
            model.fit(self.dfX, self.y, groups=self.groups)
        print("...model fitted!")

        for payload, saving_type in ((model, SavingType.pickle), (model_json, SavingType.json)):
            self.data_persister.write(payload, job_id, path="saved_models", write_type=saving_type)
        print("model persisted")

        fitted_models.append(model)

    return fitted_models
def test_graph_pipeline(self):
    """Build a two-node GraphPipeline ("svd" -> "logit") and check its parts."""
    #####################
    ### GraphPipeline ###
    #####################
    nodes_spec = {
        "svd": ("TruncatedSVDWrapper", {"n_components": 2}),
        "logit": ("LogisticRegression", {"C": 10}),
    }
    param3 = ("GraphPipeline", {"models": nodes_spec, "edges": [("svd", "logit")]})
    # Snapshot taken before the call, to detect any in-place modification.
    param3_c = copy.deepcopy(param3)

    pipeline = sklearn_model_from_param(param3)
    assert isinstance(pipeline, GraphPipeline)

    built = pipeline.models
    assert isinstance(built["logit"], LogisticRegression)
    assert isinstance(built["svd"], TruncatedSVDWrapper)

    assert param3 == param3_c
def test_stacking_classifier(self):
    """Build a StackerClassifier from its param and check the reverse conversion."""
    base_models = [("RandomForestClassifier", {}), ("ExtraTreesClassifier", {})]
    params = (
        "StackerClassifier",
        {"models": base_models, "cv": 5, "blender": ("LogisticRegression", {})},
    )

    # The model is built from a deep copy; ``params`` itself is kept for the final comparison.
    stacker = sklearn_model_from_param(copy.deepcopy(params))

    assert isinstance(stacker, StackerClassifier)
    assert len(stacker.models) == 2
    for built, expected_class in zip(stacker.models, (RandomForestClassifier, ExtraTreesClassifier)):
        assert isinstance(built, expected_class)
    assert isinstance(stacker.blender, LogisticRegression)
    assert stacker.cv == 5

    # The full round trip is expected to give back the original param.
    assert param_from_sklearn_model(stacker) == params
def test_graph_pipeline_list(self):
    """Round-trip a GraphPipeline whose edges / ranges are given as lists, not tuples."""
    #####################
    ### GraphPipeline ###
    #####################
    # Test when inputs are list and not tuples
    encoder_spec = (
        "NumericalEncoder",
        {
            "columns_to_use": ["^BLOCK_", "^NUMBERTOKEN_", "^DATETOKEN_", "^CURRENCYTOKEN_"],
            "regex_match": True,
        },
    )
    vect_spec = (
        "CountVectorizerWrapper",
        {
            "analyzer": "char",
            "columns_to_use": ["STRINGLEFTOF", "STRINGABOVEOF"],
            "ngram_range": [1, 4],
        },
    )
    param = (
        "GraphPipeline",
        {
            "edges": [["encoder", "imputer", "rf"], ["vect", "svd", "rf"]],
            "models": {
                "encoder": encoder_spec,
                "imputer": ("NumImputer", {}),
                "rf": ("RandomForestClassifier", {"n_estimators": 500}),
                "svd": ("TruncatedSVDWrapper", {"n_components": 200}),
                "vect": vect_spec,
            },
        },
    )
    # Snapshot taken before the call, to detect any in-place modification.
    param_c = copy.deepcopy(param)

    pipeline = sklearn_model_from_param(param)
    assert isinstance(pipeline, GraphPipeline)

    expected_classes = (
        ("encoder", NumericalEncoder),
        ("imputer", NumImputer),
        ("vect", CountVectorizerWrapper),
        ("svd", TruncatedSVDWrapper),
        ("rf", RandomForestClassifier),
    )
    for node_name, expected_class in expected_classes:
        assert isinstance(pipeline.models[node_name], expected_class)

    assert param == param_c

    assert param_from_sklearn_model(pipeline) == param
def test_boxcox_target_transformer(self):
    """Build a BoxCoxTargetTransformer from each of the three param syntaxes.

    Every syntax must give a BoxCoxTargetTransformer wrapping a
    RandomForestClassifier, and must leave the param it was given unmodified.
    """
    forest = ("RandomForestClassifier", {})

    ## syntax 1 : (name, inner model param) ##
    params = ("BoxCoxTargetTransformer", forest)
    given = copy.deepcopy(params)
    built = sklearn_model_from_param(given)
    assert isinstance(built, BoxCoxTargetTransformer)
    assert isinstance(built.model, RandomForestClassifier)
    assert params == given

    ## syntax 2 : (name, inner model param, keyword arguments) ##
    ## syntax 3 : (name, keyword arguments including the inner model) ##
    # Both carry ll=10, so they share the same set of checks.
    with_ll = (
        ("BoxCoxTargetTransformer", ("RandomForestClassifier", {}), {"ll": 10}),
        ("BoxCoxTargetTransformer", {"model": ("RandomForestClassifier", {}), "ll": 10}),
    )
    for params in with_ll:
        given = copy.deepcopy(params)
        built = sklearn_model_from_param(given)
        assert isinstance(built, BoxCoxTargetTransformer)
        assert isinstance(built.model, RandomForestClassifier)
        assert built.ll == 10
        assert params == given
def test_logistic_regression(self):
    """Build a LogisticRegression from its param and check ``C`` is forwarded."""
    ###########################
    ### Logistic Regression ###
    ###########################
    from sklearn.linear_model import LogisticRegression

    param2 = ("LogisticRegression", {"C": 10})
    snapshot = copy.deepcopy(param2)

    logit = sklearn_model_from_param(param2)

    assert isinstance(logit, LogisticRegression)
    assert logit.C == 10
    # verif that param was not modified inside function
    assert param2 == snapshot
def test_RandomModelGenerator_iterator(type_of_iterator, num_only):
    """Check every model yielded by one of the generator's iterators.

    Parameters
    ----------
    type_of_iterator : str
        "default", "block_search" or "block_search_random"; selects which
        iterator of the RandomModelGenerator is exercised.
    num_only : forwarded as-is to ``get_automl_config``.

    Raises
    ------
    ValueError
        If ``type_of_iterator`` is not one of the three known values.
    """
    _, _, auto_ml_config = get_automl_config(num_only)

    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

    if type_of_iterator == "default":
        iterator = random_model_generator.iterator_default_models()
    elif type_of_iterator == "block_search":
        iterator = random_model_generator.iterate_block_search(random_order=False)
    elif type_of_iterator == "block_search_random":
        iterator = random_model_generator.iterate_block_search(random_order=True)
    else:
        # Fail with an explicit message instead of an unbound ``iterator`` below.
        raise ValueError("unknown type_of_iterator: %r" % (type_of_iterator,))

    assert hasattr(iterator, "__iter__")

    # verif iterator
    for model in iterator:
        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # ``Graph.nodes`` rather than ``Graph.node``: the singular alias was
        # removed from networkx, and ``nodes`` is what is asserted to exist above.
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # Separate name so the loop variable ``model`` isn't rebound.
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")
def test_random_forest(self):
    """Build a RandomForestClassifier from its param and check ``n_estimators``."""
    #####################
    ### Random Forest ###
    #####################
    param1 = ("RandomForestClassifier", {"n_estimators": 100, "criterion": "entropy"})
    snapshot = copy.deepcopy(param1)

    forest = sklearn_model_from_param(param1)

    assert isinstance(forest, RandomForestClassifier)
    assert forest.n_estimators == 100
    # verif that param was not modified inside function
    assert param1 == snapshot
def test_random_forest(self):
    """Round-trip a RandomForestClassifier param through both converters."""
    #####################
    ### Random Forest ###
    #####################
    param = ("RandomForestClassifier", {"n_estimators": 150, "criterion": "entropy"})
    snapshot = copy.deepcopy(param)

    forest = sklearn_model_from_param(param)

    assert isinstance(forest, RandomForestClassifier)
    assert forest.n_estimators == 150
    # verif that param was not modified inside function
    assert param == snapshot

    # The reverse conversion must give back the original param.
    assert param_from_sklearn_model(forest) == param
def test_stacking_classifier(self):
    """Build a StackerClassifier from its param and check its components."""
    base_models = [("RandomForestClassifier", {}), ("ExtraTreesClassifier", {})]
    params = (
        "StackerClassifier",
        {"models": base_models, "cv": 5, "blender": ("LogisticRegression", {})},
    )

    # The model is built from a deep copy of the param.
    stacker = sklearn_model_from_param(copy.deepcopy(params))

    assert isinstance(stacker, StackerClassifier)
    assert len(stacker.models) == 2
    for built, expected_class in zip(stacker.models, (RandomForestClassifier, ExtraTreesClassifier)):
        assert isinstance(built, expected_class)
    assert isinstance(stacker.blender, LogisticRegression)
    assert stacker.cv == 5
def boxcox_and_graphpipeline(self):
    """Build a GraphPipeline whose final node is a BoxCoxTargetTransformer wrapping another GraphPipeline.

    Checks the type of every nested model and that the param given to
    ``sklearn_model_from_param`` is left unmodified.
    """
    # NOTE(review): the name has no ``test_`` prefix, so it is presumably not
    # collected by pytest's default discovery -- confirm whether that is intended.
    inner_pipeline = (
        "GraphPipeline",
        {
            "edges": [("KMeansTransformer", "RandomForestClassifier")],
            "models": {
                "KMeansTransformer": ("KMeansTransformer", {"n_clusters": 10}),
                "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 10}),
            },
        },
    )
    params = (
        "GraphPipeline",
        {
            "edges": [("NumericalEncoder", "BoxCoxTargetTransformer")],
            "models": {
                "BoxCoxTargetTransformer": ("BoxCoxTargetTransformer", inner_pipeline, {"ll": 10}),
                "NumericalEncoder": ("NumericalEncoder", {}),
            },
        },
    )
    params_c = copy.deepcopy(params)

    outer = sklearn_model_from_param(params_c)

    # Outer pipeline
    assert isinstance(outer, GraphPipeline)
    assert len(outer.models) == 2
    assert "NumericalEncoder" in outer.models
    assert isinstance(outer.models["NumericalEncoder"], NumericalEncoder)
    assert "BoxCoxTargetTransformer" in outer.models

    # Wrapper and the pipeline it wraps
    boxcox = outer.models["BoxCoxTargetTransformer"]
    assert isinstance(boxcox, BoxCoxTargetTransformer)
    assert isinstance(boxcox.model, GraphPipeline)

    inner_models = boxcox.model.models
    assert set(inner_models.keys()) == {"KMeansTransformer", "RandomForestClassifier"}
    assert isinstance(inner_models["KMeansTransformer"], KMeansTransformer)
    assert isinstance(inner_models["RandomForestClassifier"], RandomForestClassifier)

    assert params == params_c
def test_RandomModelGenerator_random():
    """Check random graph drawing and its reproducibility.

    Ten graphs are drawn and validated one by one, then the draw is repeated
    three times (new generator with the same seed, same generator with its
    ``random_state`` reset, new generator given a random-state object) and
    each repetition must give the same graphs, params and blocks.
    """
    _, _, auto_ml_config = get_automl_config()

    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

    all_gen = []
    for _ in range(10):
        model = random_model_generator.draw_random_graph()
        all_gen.append(model)

        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # ``Graph.nodes`` rather than ``Graph.node``: the singular alias was
        # removed from networkx, and ``nodes`` is what is asserted to exist above.
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # Separate name so the loop variable ``model`` isn't rebound.
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

    ### re-draw the same thing with a new generator and the SAME seed ###
    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)
    all_gen2 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs1, all_params1, all_blocks1 = zip(*all_gen)
    all_graphs2, all_params2, all_blocks2 = zip(*all_gen2)

    # The ten draws must not all be identical
    assert not _all_same(all_params1)
    assert not _all_same(all_graphs1)
    assert not _all_same(all_blocks1)

    all_graphs1_node_edges = [(g.nodes, g.edges) for g in all_graphs1]
    all_graphs2_node_edges = [(g.nodes, g.edges) for g in all_graphs2]
    # I need to test equality of nodes and edges ... directly == on networkx graph doesn't work

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs2_node_edges
    assert all_params1 == all_params2
    assert all_blocks1 == all_blocks2

    ### re-draw by resetting generator ###
    random_model_generator.random_state = 123
    all_gen3 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs3, all_params3, all_blocks3 = zip(*all_gen3)
    all_graphs3_node_edges = [(g.nodes, g.edges) for g in all_graphs3]

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs3_node_edges
    assert all_params1 == all_params3
    assert all_blocks1 == all_blocks3

    ### Re-draw by passing a random state
    random_state = check_random_state(123)
    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=random_state)
    all_gen4 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs4, all_params4, all_blocks4 = zip(*all_gen4)
    all_graphs4_node_edges = [(g.nodes, g.edges) for g in all_graphs4]

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs4_node_edges
    assert all_params1 == all_params4
    assert all_blocks1 == all_blocks4
def _load_model(self, job_id):
    """Load the 'jobs' json stored for ``job_id`` and build the model described by its 'model_code' entry."""
    model_code = self.storage.load_special_json(job_id, 'jobs')['model_code']
    return sklearn_model_from_param(model_code)
def test_RandomModelGenerator_iterator(type_of_iterator, num_only):
    """Check every model yielded by one of the generator's iterators.

    ``type_of_iterator`` is "default", "block_search" or "block_search_random"
    and selects the iterator; ``num_only`` is forwarded to ``get_automl_config``.
    For the "default" iterator, graphs containing the RandomForestClassifier
    node are additionally fitted on a small sample and used to predict.
    """
    dfX, y, auto_ml_config = get_automl_config(num_only)

    generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

    if type_of_iterator == "default":
        iterator = generator.iterator_default_models()
    elif type_of_iterator in ("block_search", "block_search_random"):
        iterator = generator.iterate_block_search(random_order=(type_of_iterator == "block_search_random"))

    assert hasattr(iterator, "__iter__")

    rf_node = ('Model', ('Model', 'RandomForestClassifier'))

    # verif iterator
    for drawn in iterator:
        assert isinstance(drawn, tuple)
        assert len(drawn) == 3

        Graph, all_models_params, block_to_use = drawn

        # The graph must end on exactly one node, of the Model category
        terminal_nodes = get_terminal_nodes(Graph)
        assert len(terminal_nodes) == 1
        assert terminal_nodes[0][0] == StepCategories.Model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        assert all(node in all_models_params for node in Graph.nodes)

        assert isinstance(block_to_use, (tuple, list))
        assert all(b in TypeOfVariables.alls for b in block_to_use)

        result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
        assert isinstance(result, dict)
        for expected_key in ("name_mapping", "json_code"):
            assert expected_key in result

        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

        if type_of_iterator == "default" and rf_node in Graph.nodes:
            # in that case I'll actually do the fitting here
            # I'll simplify the model to have 2 estimators (faster)
            all_models_params[rf_node]["n_estimators"] = 2
            result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
            sk_model = sklearn_model_from_param(result["json_code"])

            # First 10 rows of each class: needs at least 20 observations
            # to make sure all transformers work
            first_zeros = np.where(y == 0)[0][0:10]
            first_ones = np.where(y == 1)[0][0:10]
            sub_index = np.concatenate((first_zeros, first_ones), axis=0)

            sk_model.fit(dfX.iloc[sub_index, :], y[sub_index])

            yhat = sk_model.predict(dfX.head(2))
            assert yhat.shape == (2, )
def test_RandomModelGenerator_random(num_only, specific_hyper, only_random_forest):
    """Check random graph drawing, optional config restrictions and reproducibility.

    ``num_only`` is forwarded to ``get_automl_config``. When ``specific_hyper``
    is true the RandomForestClassifier ``n_estimators`` choices are restricted
    to [10, 20]; when ``only_random_forest`` is true the models are filtered
    down to RandomForestClassifier. Ten graphs are drawn and validated (and
    fitted when they contain the RandomForestClassifier node), then the draw
    is repeated three times and must give identical results each time.
    """
    dfX, y, auto_ml_config = get_automl_config(num_only)

    if specific_hyper:
        auto_ml_config.specific_hyper = {("Model", "RandomForestClassifier"): {"n_estimators": [10, 20]}}

    if only_random_forest:
        auto_ml_config.filter_models(Model="RandomForestClassifier")

    def _draw_ten(generator):
        # Ten successive draws from the given generator
        return [generator.draw_random_graph() for _ in range(10)]

    def _nodes_and_edges(graphs):
        # directly == on networkx graph doesn't work: compare nodes and edges instead
        return [(g.nodes, g.edges) for g in graphs]

    rf_key = ("Model", ("Model", "RandomForestClassifier"))

    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

    all_gen = []
    for _ in range(10):
        drawn = random_model_generator.draw_random_graph()
        all_gen.append(drawn)

        assert isinstance(drawn, tuple)
        assert len(drawn) == 3

        Graph, all_models_params, block_to_use = drawn

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        assert all(node in all_models_params for node in Graph.nodes)

        assert isinstance(block_to_use, (tuple, list))
        assert all(b in TypeOfVariables.alls for b in block_to_use)

        result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
        assert isinstance(result, dict)
        for expected_key in ("name_mapping", "json_code"):
            assert expected_key in result

        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

        if only_random_forest:
            assert rf_key in all_models_params

        if specific_hyper and rf_key in all_models_params:
            assert all_models_params[rf_key]["n_estimators"] in (10, 20)

        if rf_key in Graph.nodes:
            # in that case I'll actually do the fitting here
            # I'll simplify the model to have 2 estimators (faster);
            # done on a deep copy so the drawn params are left untouched
            all_models_params_copy = deepcopy(all_models_params)
            all_models_params_copy[rf_key]["n_estimators"] = 2
            result = convert_graph_to_code(Graph, all_models_params_copy, also_returns_mapping=True)
            sk_model = sklearn_model_from_param(result["json_code"])

            # First 100 rows of each class (the original note says at least 20
            # observations are needed to make sure all transformers work)
            first_zeros = np.where(y == 0)[0][0:100]
            first_ones = np.where(y == 1)[0][0:100]
            sub_index = np.concatenate((first_zeros, first_ones), axis=0)

            if hasattr(sk_model, "verbose"):
                sk_model.verbose = True

            sk_model.fit(dfX.iloc[sub_index, :], y[sub_index])

            yhat = sk_model.predict(dfX.head(2))
            assert yhat.shape == (2, )

    if not only_random_forest:
        # Check that RandomForest wasn't drawn every time
        assert any(rf_key not in m[1] for m in all_gen)

    ### re-draw the same thing with a new generator built with seed 123 again ###
    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)
    all_gen2 = _draw_ten(random_model_generator)

    all_graphs1, all_params1, all_blocks1 = zip(*all_gen)
    all_graphs2, all_params2, all_blocks2 = zip(*all_gen2)

    assert not _all_same(all_params1)
    assert not _all_same(all_graphs1)
    if not num_only:
        assert not _all_same(all_blocks1)  # only one block

    all_graphs1_node_edges = _nodes_and_edges(all_graphs1)

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == _nodes_and_edges(all_graphs2)
    assert all_params1 == all_params2
    assert all_blocks1 == all_blocks2

    ### re-draw by resetting generator ###
    random_model_generator.random_state = 123
    all_graphs3, all_params3, all_blocks3 = zip(*_draw_ten(random_model_generator))

    assert all_graphs1_node_edges == _nodes_and_edges(all_graphs3)
    assert all_params1 == all_params3
    assert all_blocks1 == all_blocks3

    ### Re-draw by passing a random state
    random_state = check_random_state(123)
    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=random_state)
    all_graphs4, all_params4, all_blocks4 = zip(*_draw_ten(random_model_generator))

    assert all_graphs1_node_edges == _nodes_and_edges(all_graphs4)
    assert all_params1 == all_params4
    assert all_blocks1 == all_blocks4
def test_param_from_sklearn_model():
    """Check ``param_from_sklearn_model`` on simple, wrapped and pipeline models.

    For each model the test compares the simplified param with its expected
    value, checks that the param is json-serializable, and checks that
    converting the model to a param and back gives an object of the same class.
    """

    def _rebuilt(estimator):
        # model -> param -> model round trip
        return sklearn_model_from_param(param_from_sklearn_model(estimator))

    # simple RandomForest
    model = RandomForestClassifier(n_estimators=250)
    # 250 must differ from the default, otherwise the simplified param would not show it
    assert RandomForestClassifier().get_params()["n_estimators"] != 250

    simplified = param_from_sklearn_model(model, simplify_default=True)
    assert simplified == ('RandomForestClassifier', {'n_estimators': 250})

    param = param_from_sklearn_model(model, simplify_default=False)
    assert isinstance(param, tuple)
    assert len(param) == 2
    assert param[0] == "RandomForestClassifier"

    assert isinstance(_rebuilt(model), model.__class__)
    assert isinstance(json.dumps(param), str)  # check that it can be json serialized
    assert isinstance(_rebuilt(model), model.__class__)

    # Composition model : BoxCoxTargetTransformer of RandomForestClassifier
    # with ll=0 the simplified param is expected to carry no 'll' entry
    model = BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=0)
    param = param_from_sklearn_model(model, simplify_default=True)
    expected = (
        'BoxCoxTargetTransformer',
        {'model': ('RandomForestClassifier', {'n_estimators': 250})},
    )
    assert param == expected

    assert isinstance(_rebuilt(model), model.__class__)
    assert isinstance(json.dumps(param), str)  # check that it can be json serialized

    # Composition model : BoxCoxTargetTransformer of RandomForestClassifier
    # with ll=1 the 'll' entry is expected in the simplified param
    model = BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=1)
    param = param_from_sklearn_model(model, simplify_default=True)
    expected = (
        'BoxCoxTargetTransformer',
        {'ll': 1, 'model': ('RandomForestClassifier', {'n_estimators': 250})},
    )
    assert param == expected

    assert isinstance(json.dumps(param), str)  # check that it can be json serialized
    assert isinstance(_rebuilt(model), model.__class__)

    # Pipeline
    model = Pipeline([
        ("enc", NumericalEncoder()),
        ("forest", RandomForestClassifier(n_estimators=250)),
    ])
    param = param_from_sklearn_model(model, simplify_default=True)
    expected = (
        'Pipeline',
        {
            'steps': [
                ('enc', ('NumericalEncoder', {})),
                ('forest', ('RandomForestClassifier', {'n_estimators': 250})),
            ]
        },
    )
    assert param == expected

    assert isinstance(_rebuilt(model), model.__class__)
    assert isinstance(json.dumps(param), str)  # check that it can be json serialized

    # GraphPipeline
    model = GraphPipeline(
        models={"enc": NumericalEncoder(), "forest": RandomForestClassifier(n_estimators=250)},
        edges=[("enc", "forest")],
    )
    param = param_from_sklearn_model(model, simplify_default=True)
    expected = (
        'GraphPipeline',
        {
            'models': {
                'enc': ('NumericalEncoder', {}),
                'forest': ('RandomForestClassifier', {'n_estimators': 250}),
            },
            'edges': [('enc', 'forest')],
        },
    )
    assert param == expected

    assert isinstance(_rebuilt(model), model.__class__)

    # GraphPipeline with verbose = True
    model = GraphPipeline(
        models={"enc": NumericalEncoder(), "forest": RandomForestClassifier(n_estimators=250)},
        edges=[("enc", "forest")],
        verbose=True,
    )
    param = param_from_sklearn_model(model, simplify_default=True)
    expected = (
        'GraphPipeline',
        {
            'models': {
                'enc': ('NumericalEncoder', {}),
                'forest': ('RandomForestClassifier', {'n_estimators': 250}),
            },
            'edges': [('enc', 'forest')],
            'verbose': True,
        },
    )
    assert param == expected

    assert isinstance(json.dumps(param), str)  # check that it can be json serialized

    model2 = _rebuilt(model)
    assert model2.verbose is True
    assert isinstance(model2, model.__class__)

    # GraphPipeline + composition
    model = GraphPipeline(
        models={
            "enc": NumericalEncoder(),
            "forest": BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=1),
        },
        edges=[("enc", "forest")],
    )
    param = param_from_sklearn_model(model, simplify_default=True)
    expected = (
        'GraphPipeline',
        {
            'edges': [('enc', 'forest')],
            'models': {
                'enc': ('NumericalEncoder', {}),
                'forest': (
                    'BoxCoxTargetTransformer',
                    {'ll': 1, 'model': ('RandomForestClassifier', {'n_estimators': 250})},
                ),
            },
        },
    )
    assert param == expected

    assert isinstance(_rebuilt(model), model.__class__)
    assert isinstance(json.dumps(param), str)  # check that it can be json serialized
def test_RandomModelGenerator_random(num_only, specific_hyper, only_random_forest):
    """Check random graph drawing, optional config restrictions and reproducibility.

    Parameters
    ----------
    num_only : forwarded as-is to ``get_automl_config``.
    specific_hyper : bool
        When true, the RandomForestClassifier ``n_estimators`` choices are
        restricted to [10, 20] and every drawn value is checked against them.
    only_random_forest : bool
        When true, the models are filtered down to RandomForestClassifier and
        every draw must contain it; otherwise at least one draw must not.

    Ten graphs are drawn and validated, then the draw is repeated three times
    (new generator with the same seed, same generator with its ``random_state``
    reset, new generator given a random-state object) and must give the same
    graphs, params and blocks each time.
    """
    _, _, auto_ml_config = get_automl_config(num_only)

    if specific_hyper:
        auto_ml_config.specific_hyper = {('Model', 'RandomForestClassifier'): {"n_estimators": [10, 20]}}

    if only_random_forest:
        auto_ml_config.filter_models(Model='RandomForestClassifier')

    # Defined once, before the loop, since it is also needed after it
    rf_key = ('Model', ('Model', 'RandomForestClassifier'))

    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

    all_gen = []
    for _ in range(10):
        model = random_model_generator.draw_random_graph()
        all_gen.append(model)

        assert isinstance(model, tuple)
        assert len(model) == 3

        Graph, all_models_params, block_to_use = model

        assert hasattr(Graph, "edges")
        assert hasattr(Graph, "nodes")

        assert isinstance(all_models_params, dict)
        # ``Graph.nodes`` rather than ``Graph.node``: the singular alias was
        # removed from networkx, and ``nodes`` is what is asserted to exist above.
        for node in Graph.nodes:
            assert node in all_models_params

        assert isinstance(block_to_use, (tuple, list))
        for b in block_to_use:
            assert b in TypeOfVariables.alls

        result = convert_graph_to_code(Graph, all_models_params, also_returns_mapping=True)
        assert isinstance(result, dict)
        assert "name_mapping" in result
        assert "json_code" in result

        # Separate name so the loop variable ``model`` isn't rebound.
        sk_model = sklearn_model_from_param(result["json_code"])
        assert hasattr(sk_model, "fit")

        if only_random_forest:
            assert rf_key in all_models_params

        if specific_hyper and rf_key in all_models_params:
            assert all_models_params[rf_key]["n_estimators"] in (10, 20)

    if not only_random_forest:
        # Check that RandomForest wasn't drawn every time
        assert any(rf_key not in m[1] for m in all_gen)

    ### re-draw the same thing with a new generator and the SAME seed ###
    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)
    all_gen2 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs1, all_params1, all_blocks1 = zip(*all_gen)
    all_graphs2, all_params2, all_blocks2 = zip(*all_gen2)

    # The ten draws must not all be identical
    assert not _all_same(all_params1)
    assert not _all_same(all_graphs1)
    if not num_only:
        assert not _all_same(all_blocks1)  # only one block

    all_graphs1_node_edges = [(g.nodes, g.edges) for g in all_graphs1]
    all_graphs2_node_edges = [(g.nodes, g.edges) for g in all_graphs2]
    # I need to test equality of nodes and edges ... directly == on networkx graph doesn't work

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs2_node_edges
    assert all_params1 == all_params2
    assert all_blocks1 == all_blocks2

    ### re-draw by resetting generator ###
    random_model_generator.random_state = 123
    all_gen3 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs3, all_params3, all_blocks3 = zip(*all_gen3)
    all_graphs3_node_edges = [(g.nodes, g.edges) for g in all_graphs3]

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs3_node_edges
    assert all_params1 == all_params3
    assert all_blocks1 == all_blocks3

    ### Re-draw by passing a random state
    random_state = check_random_state(123)
    random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=random_state)
    all_gen4 = [random_model_generator.draw_random_graph() for _ in range(10)]

    all_graphs4, all_params4, all_blocks4 = zip(*all_gen4)
    all_graphs4_node_edges = [(g.nodes, g.edges) for g in all_graphs4]

    # separate test to isolate exactly what changes
    assert all_graphs1_node_edges == all_graphs4_node_edges
    assert all_params1 == all_params4
    assert all_blocks1 == all_blocks4
def boxcox_and_graphpipeline(self):
    """Build a GraphPipeline whose final node is a BoxCoxTargetTransformer wrapping another GraphPipeline.

    Checks the type of every nested model, that the param given to
    ``sklearn_model_from_param`` is left unmodified, and that the reverse
    conversion gives back the same model name.
    """
    # NOTE(review): the name has no ``test_`` prefix, so it is presumably not
    # collected by pytest's default discovery -- confirm whether that is intended.
    inner_pipeline = (
        "GraphPipeline",
        {
            "edges": [("KMeansTransformer", "RandomForestClassifier")],
            "models": {
                "KMeansTransformer": ("KMeansTransformer", {"n_clusters": 10}),
                "RandomForestClassifier": ("RandomForestClassifier", {"n_estimators": 10}),
            },
        },
    )
    params = (
        "GraphPipeline",
        {
            "edges": [("NumericalEncoder", "BoxCoxTargetTransformer")],
            "models": {
                "BoxCoxTargetTransformer": ("BoxCoxTargetTransformer", inner_pipeline, {"ll": 10}),
                "NumericalEncoder": ("NumericalEncoder", {}),
            },
        },
    )
    params_c = copy.deepcopy(params)

    outer = sklearn_model_from_param(params_c)

    # Outer pipeline
    assert isinstance(outer, GraphPipeline)
    assert len(outer.models) == 2
    assert "NumericalEncoder" in outer.models
    assert isinstance(outer.models["NumericalEncoder"], NumericalEncoder)
    assert "BoxCoxTargetTransformer" in outer.models

    # Wrapper and the pipeline it wraps
    boxcox = outer.models["BoxCoxTargetTransformer"]
    assert isinstance(boxcox, BoxCoxTargetTransformer)
    assert isinstance(boxcox.model, GraphPipeline)

    inner_models = boxcox.model.models
    assert set(inner_models.keys()) == {"KMeansTransformer", "RandomForestClassifier"}
    assert isinstance(inner_models["KMeansTransformer"], KMeansTransformer)
    assert isinstance(inner_models["RandomForestClassifier"], RandomForestClassifier)

    assert params == params_c

    # rmk : only the name is compared, the reversed param differs from the
    # input because the inner model isn't explicitly passed with a named attribute
    param_reverse = param_from_sklearn_model(outer)
    assert param_reverse[0] == params[0]