def make_secondary_node_as_primary(node_child):
    """Swap ``node_child`` for a new primary node in each of its children.

    A ``PrimaryNode`` is created with the operation type of ``node_child``.
    Every node returned by ``self.node_children(node_child)`` then gets this
    new node at the position in ``nodes_from`` where ``node_child`` was.

    NOTE(review): ``self`` is not a parameter of this function -- presumably
    it is a helper nested inside a method and ``self`` comes from the
    enclosing scope; confirm against the surrounding code.
    """
    replacement = PrimaryNode(node_child.operation.operation_type)
    for child in self.node_children(node_child):
        position = child.nodes_from.index(node_child)
        # Overwrite in place so the order of the other parents is kept.
        child.nodes_from[position] = replacement
def chain_with_only_data_operations():
    """Build the linear chain one_hot_encoding -> scaling -> ransac_lin_reg."""
    encoding_node = PrimaryNode(operation_type='one_hot_encoding')
    scaling_node = SecondaryNode(operation_type='scaling',
                                 nodes_from=[encoding_node])
    root_node = SecondaryNode(operation_type='ransac_lin_reg',
                              nodes_from=[scaling_node])
    return Chain(root_node)
def test_only_ts_specific_operations_are_primary():
    r"""The check must raise with the expected message for the chain below.

    Incorrect chain:

        lagged \
                 linear -> final forecast
        ridge  /
    """
    primary_nodes = [PrimaryNode('lagged'), PrimaryNode('ridge')]
    root = SecondaryNode('linear', nodes_from=primary_nodes)
    incorrect_chain = Chain(root)
    expected_message = f'{ERROR_PREFIX} Chain for forecasting has not ts_specific preprocessing in primary nodes'

    with pytest.raises(Exception) as exc:
        assert only_ts_specific_operations_are_primary(incorrect_chain)

    # Checked after the ``with`` block: inside it, this line would never run
    # once the call above raises.
    assert str(exc.value) == expected_message
def create_four_depth_chain():
    """Build a chain whose longest root-to-primary path contains four nodes.

    The 'knn' root joins two branches: a 'qda' branch
    (qda <- xgboost <- logit) and a 'knn' branch fed by a 'logit' node
    (over knn and lda) together with a primary 'xgboost' node.
    """
    # Primary (input) nodes.
    knn_primary = PrimaryNode('knn')
    lda_primary = PrimaryNode('lda')
    xgb_primary = PrimaryNode('xgboost')
    logit_primary = PrimaryNode('logit')

    # Nodes directly above the primary ones.
    logit_over_knn_lda = SecondaryNode('logit',
                                       nodes_from=[knn_primary, lda_primary])
    xgb_over_logit = SecondaryNode('xgboost', nodes_from=[logit_primary])

    # The two branches merged by the root.
    qda_branch = SecondaryNode('qda', nodes_from=[xgb_over_logit])
    knn_branch = SecondaryNode('knn',
                               nodes_from=[logit_over_knn_lda, xgb_primary])

    root = SecondaryNode('knn', nodes_from=[qda_branch, knn_branch])
    return Chain(root)
def test_chain_str():
    """``str(chain)`` must report depth, length and the node list."""
    # given
    primary_nodes = [
        PrimaryNode(operation_type=name) for name in ('logit', 'lda', 'knn')
    ]
    root = SecondaryNode(operation_type='xgboost', nodes_from=primary_nodes)
    chain = Chain()
    chain.add_node(root)
    expected = "{'depth': 2, 'length': 4, 'nodes': [xgboost, logit, lda, knn]}"

    # when
    actual = str(chain)

    # then
    assert actual == expected
def get_complex_regr_chain():
    """Build a diamond chain: scaling -> (ridge, linear) -> xgbreg."""
    scaling = PrimaryNode(operation_type='scaling')
    ridge_branch = SecondaryNode('ridge', nodes_from=[scaling])
    linear_branch = SecondaryNode('linear', nodes_from=[scaling])
    root = SecondaryNode('xgbreg', nodes_from=[ridge_branch, linear_branch])
    return Chain(root)
def get_multiscale_chain(model_trend='lstm', model_residual='ridge'):
    """Build a two-branch chain merged by a 'linear' root.

    One branch is 'trend_data_model' followed by ``model_trend``; the other
    is 'residual_data_model' followed by ``model_residual``.

    :param model_trend: operation name placed over the trend data node
    :param model_residual: operation name placed over the residual data node
    :return: chain rooted at the 'linear' node
    """
    trend_source = PrimaryNode('trend_data_model')
    trend_model = SecondaryNode(model_trend, nodes_from=[trend_source])
    if model_trend == 'lstm':
        # decrease the number of epochs to fit
        trend_model.model.params = {'epochs': 1}

    residual_source = PrimaryNode('residual_data_model')
    residual_model = SecondaryNode(model_residual,
                                   nodes_from=[residual_source])

    root = SecondaryNode('linear', nodes_from=[residual_model, trend_model])
    return Chain(root)
def default_valid_chain():
    """Build a diamond of four 'logit' nodes: one primary, two middle, one root."""
    source = PrimaryNode(model_type='logit')
    left = SecondaryNode(model_type='logit', nodes_from=[source])
    right = SecondaryNode(model_type='logit', nodes_from=[source])
    root = SecondaryNode(model_type='logit', nodes_from=[left, right])
    return Chain(root)
def get_ar_chain():
    """Return a chain consisting of a single primary 'ar' node."""
    return Chain(PrimaryNode('ar'))
def chain_with_pca() -> Chain:
    """Build a diamond chain: scaling -> (pca, lda) -> rf."""
    scaling = PrimaryNode('scaling')
    pca_branch = SecondaryNode('pca', nodes_from=[scaling])
    lda_branch = SecondaryNode('lda', nodes_from=[scaling])
    root = SecondaryNode('rf', nodes_from=[pca_branch, lda_branch])
    return Chain(root)
def test_node_factory_log_reg_correct(data_setup):
    """A 'logit' primary node must be a ``PrimaryNode`` holding a ``Model``-class model."""
    model_type = 'logit'
    node = PrimaryNode(model_type=model_type)

    expected_model_class = Model(model_type=model_type).__class__
    actual_model_class = node.model.__class__

    assert node.__class__ == PrimaryNode
    assert expected_model_class == actual_model_class
def chain_fourth():
    """Extend ``chain_first()`` by replacing two subtrees under its first branch.

    The node at ``root.nodes_from[0].nodes_from[1]`` is replaced by a 'qda'
    node over two 'rf' primaries, then the node at
    ``root.nodes_from[0].nodes_from[0]`` by a 'knn' node over two 'knn'
    primaries.
    """
    # Intended structure (reproduced from the original comment):
    #       XG
    #    |      \
    #    XG      KNN
    #  |   \     |   \
    # QDA   KNN  LR   LDA
    # |  \   |  \
    # RF  RF KNN KNN
    chain = chain_first()

    qda_subtree = SecondaryNode('qda')
    qda_subtree.nodes_from.extend(PrimaryNode(leaf) for leaf in ('rf', 'rf'))
    chain.update_subtree(chain.root_node.nodes_from[0].nodes_from[1],
                         qda_subtree)

    # Looked up only after the first replacement, as in the original order.
    knn_subtree = SecondaryNode('knn')
    knn_subtree.nodes_from.extend(PrimaryNode(leaf) for leaf in ('knn', 'knn'))
    chain.update_subtree(chain.root_node.nodes_from[0].nodes_from[0],
                         knn_subtree)

    return chain
def chain_simple() -> Chain:
    """Build a diamond chain: scaling -> (svc, lda) -> rf."""
    scaling = PrimaryNode('scaling')
    svc_branch = SecondaryNode('svc', nodes_from=[scaling])
    lda_branch = SecondaryNode('lda', nodes_from=[scaling])
    root = SecondaryNode('rf', nodes_from=[svc_branch, lda_branch])
    return Chain(root)
def get_simple_chain():
    """Return a two-node chain with the structure ``lagged -> linear``."""
    lagged = PrimaryNode('lagged')
    root = SecondaryNode('linear', nodes_from=[lagged])
    return Chain(root)
def test_chain_fine_tune_all_nodes_correct(classification_dataset):
    """Smoke test: tune every node of a small chain, then predict with the result."""
    scaling = PrimaryNode(operation_type='scaling')
    knn = PrimaryNode(operation_type='knn')
    root = SecondaryNode(operation_type='dt', nodes_from=[scaling, knn])
    chain = Chain(root)

    tuned_chain = chain.fine_tune_all_nodes(
        loss_function=roc,
        input_data=classification_dataset,
        iterations=5,
        max_lead_time=1,
    )
    tuned_chain.predict(input_data=classification_dataset)

    # Reaching this point without an exception is the success criterion.
    is_tuning_finished = True
    assert is_tuning_finished
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """Fit a TPOT-exported sklearn pipeline and a similar chain; print both ROC AUCs.

    :param train_file_path: CSV file loaded via ``InputData.from_csv`` for fitting
    :param test_file_path: CSV file loaded via ``InputData.from_csv`` for scoring
    :return: ROC AUC of the chain on the test data (the sklearn pipeline's
        score is only printed)
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    train_x, train_y = train_data.features, train_data.target
    test_x, test_y = test_data.features, test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier(),
    )
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(train_x, train_y)
    # Column 1 of ``predict_proba`` is used as the score.
    pipeline_scores = exported_pipeline.predict_proba(test_x)[:, 1]
    print(roc_auc(y_true=test_y, y_score=pipeline_scores))

    # Chain counterpart: 'rf' over 'direct_data_model' and 'bernb'.
    chain = Chain()
    data_node = PrimaryNode('direct_data_model')
    bernb_node = PrimaryNode('bernb')
    rf_node = SecondaryNode('rf')
    for parent in (data_node, bernb_node):
        rf_node.nodes_from.append(parent)
    chain.add_node(rf_node)

    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    chain_roc_auc = roc_auc(y_true=test_y, y_score=chain_output.predict)
    print(chain_roc_auc)

    return chain_roc_auc
def chain_with_cycle():
    """Build a chain in which the second and third nodes reference each other."""
    source = PrimaryNode(operation_type='logit')
    middle = SecondaryNode(operation_type='logit', nodes_from=[source])
    top = SecondaryNode(operation_type='logit', nodes_from=[middle, source])
    # Close the loop: ``middle`` now also depends on ``top``.
    middle.nodes_from.append(top)

    chain = Chain()
    for member in (source, middle, top):
        chain.add_node(member)
    return chain
def test_node_deletion_sample_method():
    """``NodeDeletionAnalyze.sample`` must return ``None`` for this chain.

    The chain has a central 'xgboost' node that feeds two 'lda' nodes.
    """
    # given
    _, train_data, test_data, node_index, result_dir = given_data()

    central = SecondaryNode('xgboost',
                            nodes_from=[PrimaryNode('knn'), PrimaryNode('knn')])
    lda_left = SecondaryNode('lda', nodes_from=[central])
    lda_right = SecondaryNode('lda', nodes_from=[central])
    root = SecondaryNode('logit', nodes_from=[lda_left, lda_right])
    chain_with_multiple_children = Chain(nodes=root)

    # when
    analyzer = NodeDeletionAnalyze(chain=chain_with_multiple_children,
                                   train_data=train_data,
                                   test_data=test_data,
                                   path_to_save=result_dir)
    result = analyzer.sample(node_index)

    # then
    assert result is None
def chain_with_multiple_roots():
    """Build a chain with two secondary nodes over one primary and no common root."""
    source = PrimaryNode(operation_type='logit')
    root_a = SecondaryNode(operation_type='logit', nodes_from=[source])
    root_b = SecondaryNode(operation_type='logit', nodes_from=[source])

    chain = Chain()
    for member in (source, root_a, root_b):
        chain.add_node(member)
    return chain
def baseline_chain():
    """Build a chain with an 'xgboost' node over primary 'knn' and 'logit' nodes."""
    chain = Chain()
    final_node = SecondaryNode(model_type='xgboost', nodes_from=[])

    for model_name in ('knn', 'logit'):
        primary = PrimaryNode(model_name)
        chain.add_node(primary)
        final_node.nodes_from.append(primary)

    # The final node is registered after both of its parents.
    chain.add_node(final_node)
    return chain
def chain_with_self_cycle():
    """Build a chain whose secondary node lists itself among its parents."""
    source = PrimaryNode(operation_type='logit')
    looped = SecondaryNode(operation_type='logit', nodes_from=[source])
    # Self-reference: the node becomes its own parent.
    looped.nodes_from.append(looped)

    chain = Chain()
    for member in (source, looped):
        chain.add_node(member)
    return chain
def test_fixed_structure_composer(data_fixture, request):
    """The composed chain must keep the reference depth/length and reach ROC AUC > 0.6.

    The same dataset (resolved from the ``data_fixture`` fixture name) is
    used both for composing and for validation.
    """
    random.seed(1)
    np.random.seed(1)

    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = dataset_to_validate = data

    available_operation_types = ['logit', 'lda', 'knn']
    metric_function = ClassificationMetricsEnum.ROCAUC

    requirements = GPComposerRequirements(
        primary=available_operation_types,
        secondary=available_operation_types,
        pop_size=2,
        num_of_generations=1,
        crossover_prob=0.4,
        mutation_prob=0.5,
        allow_single_operations=False,
    )

    # Prepare init chain
    xgboost_node = PrimaryNode(operation_type='xgboost')
    scaling_node = PrimaryNode(operation_type='scaling')
    root = SecondaryNode(operation_type='logit',
                         nodes_from=[xgboost_node, scaling_node])
    reference_chain = Chain(root)

    builder = (
        FixedStructureComposerBuilder(task=Task(TaskTypesEnum.classification))
        .with_initial_chain(reference_chain)
        .with_metrics(metric_function)
        .with_requirements(requirements)
    )
    composer = builder.build()

    chain_composed = composer.compose_chain(data=dataset_to_compose)
    chain_composed.fit_from_scratch(input_data=dataset_to_compose)

    prediction = chain_composed.predict(dataset_to_validate)
    roc_on_valid = roc_auc(y_true=dataset_to_validate.target,
                           y_score=prediction.predict)

    assert roc_on_valid > 0.6
    assert chain_composed.depth == reference_chain.depth
    assert chain_composed.length == reference_chain.length
def test_delete_primary_node():
    """After deleting the primary parent of 'knn', 'knn' must be a ``PrimaryNode``."""
    # given
    logit_primary = PrimaryNode(operation_type='logit')
    lda_primary = PrimaryNode(operation_type='lda')
    knn_secondary = SecondaryNode(operation_type='knn',
                                  nodes_from=[logit_primary])
    root = SecondaryNode(operation_type='xgboost',
                         nodes_from=[lda_primary, knn_secondary])
    chain = Chain()
    chain.add_node(root)

    # when
    chain.delete_node(logit_primary)
    knn_nodes = [
        candidate for candidate in chain.nodes
        if candidate.operation.operation_type == 'knn'
    ]
    new_primary_node = knn_nodes[0]

    # then
    assert len(chain.nodes) == 3
    assert isinstance(new_primary_node, PrimaryNode)
def get_ensemble_chain():
    """Build a chain where a 'linear' node combines seven primary regression nodes."""
    primary_models = ('linear', 'ridge', 'lasso', 'rfr', 'dtreg', 'knnreg',
                      'svr')

    chain = Chain()
    primary_nodes = []
    for model_name in primary_models:
        primary = PrimaryNode(model_name)
        chain.add_node(primary)
        primary_nodes.append(primary)

    # The combining node is added last, after all of its parents.
    chain.add_node(SecondaryNode('linear', nodes_from=primary_nodes))
    return chain
def chain_with_isolated_nodes():
    """Build a three-node linear chain plus one secondary node with no parents."""
    source = PrimaryNode(model_type='logit')
    middle = SecondaryNode(model_type='logit', nodes_from=[source])
    top = SecondaryNode(model_type='logit', nodes_from=[middle])
    # Not connected to any other node.
    isolated = SecondaryNode(model_type='logit', nodes_from=[])

    chain = Chain()
    for member in (source, middle, top, isolated):
        chain.add_node(member)
    return chain
def test_chain_with_datamodel_fit_correct(data_setup):
    """Predicted labels must have the same shape as the test target."""
    train_data, test_data = train_test_data_setup(data_setup)

    logit_node = PrimaryNode('logit')
    bernb_node = PrimaryNode('bernb')
    rf_node = SecondaryNode('rf')
    rf_node.nodes_from = [bernb_node, logit_node]

    chain = Chain()
    for member in (logit_node, bernb_node, rf_node):
        chain.add_node(member)

    chain.fit(train_data)
    predicted = chain.predict(test_data).predict
    results = np.asarray(probs_to_labels(predicted))

    assert results.shape == test_data.target.shape
def valid_chain():
    """Build a linear chain of four 'logit' nodes, each fed by the previous one."""
    source = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[source])
    third = SecondaryNode(operation_type='logit', nodes_from=[second])
    root = SecondaryNode(operation_type='logit', nodes_from=[third])

    chain = Chain()
    for member in (source, second, third, root):
        chain.add_node(member)
    return chain
def chain_with_isolated_components():
    """Build a chain made of two node pairs that are not connected to each other."""
    # First component: primary -> secondary.
    source = PrimaryNode(operation_type='logit')
    over_source = SecondaryNode(operation_type='logit', nodes_from=[source])

    # Second component: a parentless secondary node -> secondary.
    detached = SecondaryNode(operation_type='logit', nodes_from=[])
    over_detached = SecondaryNode(operation_type='logit',
                                  nodes_from=[detached])

    chain = Chain()
    for member in (source, over_source, detached, over_detached):
        chain.add_node(member)
    return chain
def test_node_repr():
    """``repr`` of a node must equal its operation type."""
    # given
    operation_type = 'logit'
    node = PrimaryNode(operation_type=operation_type)
    expected = operation_type

    # when
    actual = repr(node)

    # then
    assert actual == expected
def chain_third():
    """Build a three-node chain: a 'qda' node over two primary 'rf' nodes.

    The 'qda' node is added to the chain first, followed by each of its
    parents in ``nodes_from`` order.
    """
    #    QDA
    #  |     \
    # RF     RF
    chain = Chain()
    qda_node = SecondaryNode('qda')
    for model_type in ('rf', 'rf'):
        qda_node.nodes_from.append(PrimaryNode(model_type))

    chain.add_node(qda_node)
    # Plain loop rather than a list comprehension: ``add_node`` is called
    # only for its side effect, so no throwaway list is built.
    for node_from in qda_node.nodes_from:
        chain.add_node(node_from)
    return chain