def create_chain_with_several_nested_atomized_model() -> Chain:
    """Build a chain that mixes nested atomized models with two 'knn' nodes.

    Wiring (parents -> child):

    * primary node wrapping ``create_atomized_model_with_several_atomized_models()``
    * secondary node wrapping ``create_atomized_model()``, fed by the primary node
    * 'knn' node (n_neighbors=9), fed by the primary node
    * 'knn' node (n_neighbors=5), fed by the three nodes above
    * root: secondary node wrapping another nested atomized model,
      fed by the second 'knn' node

    Only the root node is passed to ``chain.add_node``.

    :return: the assembled chain
    """
    chain = Chain()

    entry_node = PrimaryNode(
        model_type=create_atomized_model_with_several_atomized_models())

    atomized_branch = SecondaryNode(model_type=create_atomized_model())
    atomized_branch.nodes_from = [entry_node]

    knn_branch = SecondaryNode('knn')
    knn_branch.custom_params = {'n_neighbors': 9}
    knn_branch.nodes_from = [entry_node]

    knn_merge = SecondaryNode('knn')
    knn_merge.custom_params = {'n_neighbors': 5}
    knn_merge.nodes_from = [entry_node, atomized_branch, knn_branch]

    root_node = SecondaryNode(
        model_type=create_atomized_model_with_several_atomized_models())
    root_node.nodes_from = [knn_merge]

    chain.add_node(root_node)
    return chain
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run chain with Auto ML models in nodes

    The chain has two branches, 'scaling' -> 'lda' and 'tpot', both feeding
    the final 'rf' node. The ROC AUC on the test data is printed and returned.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for chain
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    # ``timedelta.seconds`` is only the seconds component (0..86399) and
    # wraps for limits of a day or more; ``total_seconds()`` gives the
    # full duration. ``int`` keeps the value an integer as before.
    node_tpot.operation.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target,
                            y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """Fit a chain with a 'tpot' node and return its ROC AUC on the test data.

    The chain consists of two primary nodes, 'tpot' and 'lda', feeding a
    final 'rf' node. The metric is also printed.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: time limit written to the 'tpot' node parameters
        as ``max_run_time_sec``

    :return: ROC AUC value computed from the chain prediction
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    # ``timedelta.seconds`` is only the seconds component (0..86399) and
    # wraps for limits of a day or more; ``total_seconds()`` gives the
    # full duration. ``int`` keeps the value an integer as before.
    node_tpot.model.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')
    node_rf.nodes_from = [node_tpot, node_lda]
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target,
                            y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def fine_tune_certain_node(self, model_id, input_data: InputData, iterations: int = 30,
                           max_lead_time: timedelta = timedelta(minutes=5)):
    """
    Tune the hyperparameters of one node of the chain, selected by model id

    The subtree rooted at the selected node is extracted from the chain
    template, fitted with ``use_cache=False`` and passed to
    ``Tune(...).fine_tune_root_node``. The root of the tuned subtree is then
    written back into the template, and a new chain is built from it.

    :param int model_id: number of the certain model in the chain.
        Look for it in exported json file of your model.
    :param input_data: data used for tuning
    :param iterations: max number of iterations
    :param max_lead_time: max time available for tuning process

    :return: updated chain object
    """
    subtree = Chain()
    subtree_root = extract_subtree_root(root_model_id=model_id,
                                        chain_template=self.chain_template)
    subtree.add_node(subtree_root)
    subtree.fit(input_data=input_data, use_cache=False)

    tuner = Tune(subtree)
    tuned_subtree = tuner.fine_tune_root_node(input_data=input_data,
                                              iterations=iterations,
                                              max_lead_time=max_lead_time)

    self._update_template(model_id=model_id,
                          updated_node=tuned_subtree.root_node)

    rebuilt_chain = Chain()
    self.chain_template.convert_to_chain(chain=rebuilt_chain)
    return rebuilt_chain
def test_chain_hierarchy_fit_correct(data_setup):
    """Fit a diamond-shaped chain of 'logit' nodes and check id, size and output."""
    train, _ = train_test_data_setup(data_setup)

    source = PrimaryNode(operation_type='logit')
    left = SecondaryNode(operation_type='logit', nodes_from=[source])
    right = SecondaryNode(operation_type='logit', nodes_from=[source])
    final = SecondaryNode(operation_type='logit', nodes_from=[left, right])

    chain = Chain()
    for node in (source, left, right, final):
        chain.add_node(node)

    chain.unfit()
    train_predicted = chain.fit(input_data=train)

    expected_id = ('((/n_logit_default_params;)/'
                   'n_logit_default_params;;(/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params')
    assert chain.root_node.descriptive_id == expected_id

    assert chain.length == 4
    assert chain.depth == 3
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert final.fitted_operation is not None
def chain_with_secondary_nodes_only():
    """Build a chain of two SecondaryNode objects; the first has an empty ``nodes_from``."""
    parentless = SecondaryNode(operation_type='logit', nodes_from=[])
    child = SecondaryNode(operation_type='logit', nodes_from=[parentless])

    chain = Chain()
    for node in (parentless, child):
        chain.add_node(node)
    return chain
def chain_with_self_cycle():
    """Build a two-node chain whose secondary node lists itself among its parents."""
    primary = PrimaryNode(operation_type='logit')
    looped = SecondaryNode(operation_type='logit', nodes_from=[primary])
    # Self-reference: the node becomes its own parent
    looped.nodes_from.append(looped)

    chain = Chain()
    for node in (primary, looped):
        chain.add_node(node)
    return chain
def chain_with_multiple_roots():
    """Build a chain in which two secondary nodes share one parent and neither feeds the other."""
    shared_parent = PrimaryNode(operation_type='logit')
    root_a = SecondaryNode(operation_type='logit', nodes_from=[shared_parent])
    root_b = SecondaryNode(operation_type='logit', nodes_from=[shared_parent])

    chain = Chain()
    chain.add_node(shared_parent)
    chain.add_node(root_a)
    chain.add_node(root_b)
    return chain
def chain_with_cycle():
    """Build a three-node chain in which the second and third nodes are parents of each other."""
    primary = PrimaryNode(operation_type='logit')
    middle = SecondaryNode(operation_type='logit', nodes_from=[primary])
    last = SecondaryNode(operation_type='logit', nodes_from=[middle, primary])
    # Close the loop: ``last`` already depends on ``middle``
    middle.nodes_from.append(last)

    chain = Chain()
    chain.add_node(primary)
    chain.add_node(middle)
    chain.add_node(last)
    return chain
def baseline_chain():
    """Build a chain with 'knn' and 'logit' primary nodes feeding an 'xgboost' node."""
    chain = Chain()
    root = SecondaryNode(model_type='xgboost', nodes_from=[])

    for primary_model in ('knn', 'logit'):
        primary = PrimaryNode(primary_model)
        chain.add_node(primary)
        root.nodes_from.append(primary)

    # The 'xgboost' node is added after both of its parents
    chain.add_node(root)
    return chain
def chain_with_isolated_components():
    """Build a chain of two node pairs that share no nodes with each other."""
    # Pair one: primary -> secondary
    pair_one_parent = PrimaryNode(operation_type='logit')
    pair_one_child = SecondaryNode(operation_type='logit',
                                   nodes_from=[pair_one_parent])
    # Pair two: secondary with no parents -> secondary
    pair_two_parent = SecondaryNode(operation_type='logit', nodes_from=[])
    pair_two_child = SecondaryNode(operation_type='logit',
                                   nodes_from=[pair_two_parent])

    chain = Chain()
    ordered_nodes = (pair_one_parent, pair_one_child,
                     pair_two_parent, pair_two_child)
    for node in ordered_nodes:
        chain.add_node(node)
    return chain
def valid_chain():
    """Build a linear chain of four 'logit' nodes, each fed by the previous one."""
    head = PrimaryNode(operation_type='logit')
    step_two = SecondaryNode(operation_type='logit', nodes_from=[head])
    step_three = SecondaryNode(operation_type='logit', nodes_from=[step_two])
    tail = SecondaryNode(operation_type='logit', nodes_from=[step_three])

    chain = Chain()
    for node in (head, step_two, step_three, tail):
        chain.add_node(node)
    return chain
def chain_with_isolated_nodes():
    """Build a three-node linear chain plus one secondary node connected to nothing."""
    head = PrimaryNode(model_type='logit')
    middle = SecondaryNode(model_type='logit', nodes_from=[head])
    tail = SecondaryNode(model_type='logit', nodes_from=[middle])
    # No parents, and no other node lists it as a parent
    loner = SecondaryNode(model_type='logit', nodes_from=[])

    chain = Chain()
    for node in (head, middle, tail, loner):
        chain.add_node(node)
    return chain
def two_level_chain():
    """Build a chain with 'logit' and 'knn' primary nodes feeding an 'xgboost' node."""
    logit_node = PrimaryNode(model_type='logit')
    knn_node = PrimaryNode(model_type='knn')
    xgboost_node = SecondaryNode(model_type='xgboost',
                                 nodes_from=[logit_node, knn_node])

    chain = Chain()
    chain.add_node(logit_node)
    chain.add_node(knn_node)
    chain.add_node(xgboost_node)
    return chain
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Check that fit and predict results do not depend on the order of a node's parents."""
    train, test = train_test_data_setup(data_setup)

    logit_node = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    knn_node = PrimaryNode(operation_type='knn')
    final = SecondaryNode(operation_type='xgboost',
                          nodes_from=[logit_node, lda_node, knn_node])

    chain = Chain()
    for node in (logit_node, lda_node, knn_node, final):
        chain.add_node(node)

    # Independent copies of the primary nodes for the second chain
    logit_copy = deepcopy(logit_node)
    lda_copy = deepcopy(lda_node)
    knn_copy = deepcopy(knn_node)

    final_shuffled = SecondaryNode(operation_type='xgboost',
                                   nodes_from=[knn_copy, logit_copy, lda_copy])

    chain_shuffled = Chain()
    # change order of nodes in list
    for node in (final_shuffled, knn_copy, logit_copy, lda_copy):
        chain_shuffled.add_node(node)

    train_predicted = chain.fit(input_data=train)
    train_predicted_shuffled = chain_shuffled.fit(input_data=train)

    # train results should be invariant
    assert chain.root_node.descriptive_id == chain_shuffled.root_node.descriptive_id
    assert np.equal(train_predicted.predict,
                    train_predicted_shuffled.predict).all()

    test_predicted = chain.predict(input_data=test)
    test_predicted_shuffled = chain_shuffled.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_predicted.predict,
                    test_predicted_shuffled.predict).all()

    # change parents order for the nodes fitted chain
    reordered_node = chain.nodes[3]
    old_parents = reordered_node.nodes_from
    reordered_node.nodes_from = [old_parents[2], old_parents[0], old_parents[1]]
    reordered_node.unfit()
    chain.fit(train)
    test_predicted_re_shuffled = chain.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_predicted.predict,
                    test_predicted_re_shuffled.predict).all()
def chain_third():
    """Build a chain with a 'qda' node fed by two 'rf' primary nodes.

    The 'qda' node is added to the chain first, then each of its parents.
    """
    #    QDA
    #  |     \
    # RF     RF
    chain = Chain()
    new_node = SecondaryNode('qda')
    for model_type in ('rf', 'rf'):
        new_node.nodes_from.append(PrimaryNode(model_type))
    chain.add_node(new_node)
    # Plain loop instead of a list comprehension: add_node is called only
    # for its side effect, so no throwaway list of results is built.
    for node_from in new_node.nodes_from:
        chain.add_node(node_from)
    return chain
def get_ensemble_chain():
    """Build a chain of seven primary regression-model nodes feeding one 'linear' node.

    :return: chain with every primary node and the final node added
    """
    base_models = ('linear', 'ridge', 'lasso', 'rfr', 'dtreg', 'knnreg', 'svr')

    chain = Chain()
    base_nodes = []
    for model_name in base_models:
        base_node = PrimaryNode(model_name)
        chain.add_node(base_node)
        base_nodes.append(base_node)

    ensemble_node = SecondaryNode('linear', nodes_from=base_nodes)
    chain.add_node(ensemble_node)
    return chain
def get_composite_multiscale_chain():
    """Build a chain with a trend branch and a residual branch joined by a 'linear' node.

    Each branch is a data-model primary node ('trend_data_model' or
    'residual_data_model') followed by a 'ridge' node. Only the final node
    is passed to ``chain.add_node``.
    """
    chain = Chain()

    trend_source = PrimaryNode('trend_data_model')
    trend_ridge = SecondaryNode('ridge', nodes_from=[trend_source])

    residual_source = PrimaryNode('residual_data_model')
    residual_ridge = SecondaryNode('ridge', nodes_from=[residual_source])

    merged = SecondaryNode('linear',
                           nodes_from=[residual_ridge, trend_ridge])
    chain.add_node(merged)
    return chain
def test_delete_node_with_redirection():
    """Deleting a middle node must leave three nodes and link its parent to the root."""
    logit_node = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    middle = SecondaryNode(operation_type='knn',
                           nodes_from=[logit_node, lda_node])
    root = SecondaryNode(operation_type='xgboost', nodes_from=[middle])

    chain = Chain()
    chain.add_node(root)

    chain.delete_node(middle)

    assert len(chain.nodes) == 3
    assert logit_node in chain.root_node.nodes_from
def test_update_node_in_chain_raise_exception():
    """Replacing a PrimaryNode with a SecondaryNode must raise ValueError with a fixed message."""
    primary = PrimaryNode(operation_type='logit')
    root = SecondaryNode(operation_type='xgboost', nodes_from=[primary])

    chain = Chain()
    chain.add_node(root)
    replacement = SecondaryNode('logit')

    with pytest.raises(ValueError) as exc:
        chain.update_node(old_node=primary, new_node=replacement)

    # Checked after the ``with`` block: a statement placed after the raising
    # call inside the block would never run.
    assert str(exc.value) == "Can't update PrimaryNode with SecondaryNode"
def test_chain_repr():
    """``repr`` of a chain must report its depth, length and node list."""
    parents = [
        PrimaryNode(operation_type='logit'),
        PrimaryNode(operation_type='lda'),
        PrimaryNode(operation_type='knn'),
    ]
    root = SecondaryNode(operation_type='xgboost', nodes_from=parents)

    chain = Chain()
    chain.add_node(root)

    expected_chain_description = "{'depth': 2, 'length': 4, 'nodes': [xgboost, logit, lda, knn]}"

    assert repr(chain) == expected_chain_description
def chain_third():
    """Build a chain with an 'xgboost' node fed by 'knn', 'lda' and 'knn' primary nodes.

    The parents are added to the chain first, then the 'xgboost' node.
    """
    #       XG
    #   |   |   \
    #  KNN LDA  KNN
    tree_root = SecondaryNode('xgboost')
    for parent_model in ('knn', 'lda', 'knn'):
        tree_root.nodes_from.append(PrimaryNode(parent_model))

    chain = Chain()
    for parent in tree_root.nodes_from:
        chain.add_node(parent)
    chain.add_node(tree_root)

    return chain
def create_chain() -> Chain:
    """Build a chain with 'logit' and 'lda' primary nodes feeding an 'xgboost' node.

    The 'lda' and 'xgboost' nodes both get ``custom_params`` of
    ``{'n_components': 1}``. Only the 'xgboost' node is passed to
    ``chain.add_node``.

    :return: the assembled chain
    """
    chain = Chain()

    logit_node = PrimaryNode('logit')

    lda_node = PrimaryNode('lda')
    lda_node.custom_params = {'n_components': 1}

    xgboost_node = SecondaryNode('xgboost')
    xgboost_node.custom_params = {'n_components': 1}
    xgboost_node.nodes_from = [logit_node, lda_node]

    chain.add_node(xgboost_node)
    return chain
def create_atomized_model_with_several_atomized_models() -> AtomizedModel:
    """Wrap a diamond-shaped chain of four atomized models into one AtomizedModel.

    Every node wraps the result of a separate ``create_atomized_model()``
    call: one primary node feeds two secondary nodes, which both feed the
    final secondary node. Only the final node is passed to ``chain.add_node``.

    :return: ``AtomizedModel`` built from the chain
    """
    chain = Chain()

    source = PrimaryNode(operation_type=create_atomized_model())
    left = SecondaryNode(operation_type=create_atomized_model())
    right = SecondaryNode(operation_type=create_atomized_model())
    sink = SecondaryNode(operation_type=create_atomized_model())

    left.nodes_from = [source]
    right.nodes_from = [source]
    sink.nodes_from = [left, right]

    chain.add_node(sink)

    return AtomizedModel(chain)
def get_composite_chain():
    r"""Return a composite chain with the following structure

    lagged -> ridge \
                     ridge
    lagged -> treg  /

    The first 'lagged' node gets ``window_size`` 110 and the second 20.
    Only the final 'ridge' node is passed to ``chain.add_node``.
    """
    # The docstring is raw so the backslash in the diagram is kept literally
    # instead of being read as an escape or a line continuation.
    chain = Chain()

    node_lagged_1 = PrimaryNode('lagged')
    node_lagged_1.custom_params = {'window_size': 110}
    node_ridge = SecondaryNode('ridge', nodes_from=[node_lagged_1])

    node_lagged_2 = PrimaryNode('lagged')
    node_lagged_2.custom_params = {'window_size': 20}
    node_treg = SecondaryNode('treg', nodes_from=[node_lagged_2])

    node_final = SecondaryNode('ridge', nodes_from=[node_treg, node_ridge])
    chain.add_node(node_final)
    return chain
def test_delete_primary_node():
    """Deleting a primary node must turn its only child into a PrimaryNode."""
    # given
    removed = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    knn_node = SecondaryNode(operation_type='knn', nodes_from=[removed])
    root = SecondaryNode(operation_type='xgboost',
                         nodes_from=[lda_node, knn_node])
    chain = Chain()
    chain.add_node(root)

    # when
    chain.delete_node(removed)

    knn_nodes = [node for node in chain.nodes
                 if node.operation.operation_type == 'knn']
    new_primary_node = knn_nodes[0]

    # then
    assert len(chain.nodes) == 3
    assert isinstance(new_primary_node, PrimaryNode)
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """Score a stacked sklearn-style pipeline and a chain on the same data.

    Both models are fitted on the train csv and scored with ROC AUC on the
    test csv; each score is printed.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation

    :return: ROC AUC of the chain (the pipeline score is only printed)
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier()
    )
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(train_data.features, train_data.target)
    # Column 1 of predict_proba is used as the score
    pipeline_scores = exported_pipeline.predict_proba(test_data.features)[:, 1]

    pipeline_roc_auc = roc_auc(y_true=testing_target,
                               y_score=pipeline_scores)
    print(pipeline_roc_auc)

    # Chain: 'direct_data_model' and 'bernb' primary nodes feeding 'rf'
    chain = Chain()
    data_node = PrimaryNode('direct_data_model')
    bernb_node = PrimaryNode('bernb')
    rf_node = SecondaryNode('rf')
    for parent in (data_node, bernb_node):
        rf_node.nodes_from.append(parent)
    chain.add_node(rf_node)

    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target,
                            y_score=chain_output.predict)
    print(roc_auc_value)

    return roc_auc_value
def test_chain_with_custom_params_for_model(data_setup):
    """Custom params on the root node must change predictions versus the defaults."""
    data = data_setup

    logit_node = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    knn_root = SecondaryNode(operation_type='knn',
                             nodes_from=[logit_node, lda_node])

    chain = Chain()
    chain.add_node(knn_root)

    # Copy is taken before the custom params are set, so it keeps the defaults
    chain_default_params = deepcopy(chain)
    chain.root_node.custom_params = {'n_neighbors': 1, 'weights': 'uniform', 'p': 1}

    chain_default_params.fit(data)
    chain.fit(data)

    custom_params_prediction = chain.predict(data).predict
    default_params_prediction = chain_default_params.predict(data).predict

    predictions_match = np.array_equal(custom_params_prediction,
                                       default_params_prediction)
    assert not predictions_match
def test_chain_sequential_fit_correct(data_setup):
    """Fit a linear chain of four 'logit' nodes and check id, size, output and cache."""
    train, _ = train_test_data_setup(data_setup)

    head = PrimaryNode(model_type='logit')
    step_two = SecondaryNode(model_type='logit', nodes_from=[head])
    step_three = SecondaryNode(model_type='logit', nodes_from=[step_two])
    final = SecondaryNode(model_type='logit', nodes_from=[step_three])

    chain = Chain()
    for node in (head, step_two, step_three, final):
        chain.add_node(node)

    train_predicted = chain.fit(input_data=train, use_cache=False)

    expected_id = ('(((/n_logit_default_params;)/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params')
    assert chain.root_node.descriptive_id == expected_id

    assert chain.length == 4
    assert chain.depth == 4
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert final.cache.actual_cached_state is not None
def real_chain(chain_template, with_cache=True):
    """Build a chain of real nodes from a level-indexed collection of node templates.

    Templates with no parents become ``PrimaryNode``; the others become
    ``SecondaryNode`` with parents resolved by ``real_parents`` from the nodes
    built so far. Each node takes its model from ``template.model_instance``.

    :param chain_template: collection indexed by level number, each level
        holding node templates (presumably a list of lists -- confirm
        against the caller)
    :param with_cache: if True, attach to every node a cache holding the
        template's preprocessor and fitted model

    :return: chain with all created nodes added in creation order
    """
    built_pairs = []
    for level_index in range(len(chain_template)):
        for template in chain_template[level_index]:
            if len(template.parents) > 0:
                parent_nodes = real_parents(built_pairs, template)
                node = SecondaryNode(nodes_from=parent_nodes,
                                     model_type=template.model_type)
            else:
                node = PrimaryNode(model_type=template.model_type)

            node.model = template.model_instance

            if with_cache:
                node_cache = FittedModelCache(related_node=node)
                cached_state = CachedState(preprocessor=template.preprocessor,
                                           model=template.fitted_model)
                node_cache.append(cached_state)
                node.cache = node_cache

            built_pairs.append((node, template))

    chain = Chain()
    for built_node, _ in built_pairs:
        chain.add_node(built_node)
    return chain