def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """Fit a ('tpot', 'lda') -> 'rf' chain and report its ROC AUC on test data.

    :param train_file_path: CSV file loaded with ``InputData.from_csv`` for fitting.
    :param test_file_path: CSV file loaded with ``InputData.from_csv`` for scoring.
    :param max_run_time: time budget handed to the 'tpot' node, converted to
        whole seconds.
    :return: the value returned by ``roc_auc`` for the test predictions
        (it is also printed).
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')
    # ``timedelta.seconds`` is only the seconds *component* (0..86399) and
    # silently drops whole days; ``total_seconds()`` covers the full duration.
    node_tpot.model.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')
    node_rf.nodes_from = [node_tpot, node_lda]

    # Only the final node is passed to add_node; its parents are attached
    # through nodes_from.
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def test_chain_hierarchy_fit_correct(data_setup):
    """Fit a diamond-shaped chain of four 'logit' nodes and check its structure.

    Checks the root node's descriptive id, the chain length and depth, and
    that the fit output has the same shape as the training target.
    """
    train_part, _ = train_test_data_setup(data_setup)

    logit = ModelTypesIdsEnum.logit
    first = NodeGenerator.primary_node(model_type=logit)
    second = NodeGenerator.secondary_node(model_type=logit, nodes_from=[first])
    third = NodeGenerator.secondary_node(model_type=logit, nodes_from=[first])
    final = NodeGenerator.secondary_node(model_type=logit,
                                         nodes_from=[second, third])

    chain = Chain()
    chain.add_node(first)
    chain.add_node(second)
    chain.add_node(third)
    chain.add_node(final)

    train_predicted = chain.fit(input_data=train_part, use_cache=False)

    expected_root_id = (
        '((/n_ModelTypesIdsEnum.logit_defaultparams;)/'
        'n_ModelTypesIdsEnum.logit_defaultparams;;(/'
        'n_ModelTypesIdsEnum.logit_defaultparams;)/'
        'n_ModelTypesIdsEnum.logit_defaultparams;)/'
        'n_ModelTypesIdsEnum.logit_defaultparams')

    assert chain.root_node.descriptive_id == expected_root_id
    assert chain.length == 4
    assert chain.depth == 3
    assert train_predicted.predict.shape == train_part.target.shape
def chain_with_secondary_nodes_only():
    """Return a chain made of two 'logit' secondary nodes and no primary node.

    The first node has an empty ``nodes_from``; the second uses the first
    as its only parent.
    """
    parentless = SecondaryNode(model_type='logit', nodes_from=[])
    follower = SecondaryNode(model_type='logit', nodes_from=[parentless])

    chain = Chain()
    for node in (parentless, follower):
        chain.add_node(node)
    return chain
def chain_with_cycle():
    """Return a three-node 'logit' chain whose parent links contain a cycle.

    ``second`` and ``third`` list each other in ``nodes_from``.
    """
    head = PrimaryNode(model_type='logit')
    middle = SecondaryNode(model_type='logit', nodes_from=[head])
    tail = SecondaryNode(model_type='logit', nodes_from=[middle, head])

    # Close the loop: tail already depends on middle, now middle depends on tail.
    middle.nodes_from.append(tail)

    chain = Chain()
    chain.add_node(head)
    chain.add_node(middle)
    chain.add_node(tail)
    return chain
def chain_with_multiple_roots():
    """Return a chain where two secondary 'logit' nodes share one primary parent.

    Neither secondary node is a parent of the other, so nothing joins them.
    """
    shared_parent = PrimaryNode(model_type='logit')
    left_root = SecondaryNode(model_type='logit', nodes_from=[shared_parent])
    right_root = SecondaryNode(model_type='logit', nodes_from=[shared_parent])

    chain = Chain()
    chain.add_node(shared_parent)
    chain.add_node(left_root)
    chain.add_node(right_root)
    return chain
def baseline_chain():
    """Return a chain with 'knn' and 'logit' primary nodes feeding 'xgboost'.

    The primary nodes are added to the chain first, the 'xgboost' node last.
    """
    chain = Chain()
    top_node = SecondaryNode(model_type='xgboost', nodes_from=[])

    for model_name in ('knn', 'logit'):
        parent = PrimaryNode(model_name)
        chain.add_node(parent)
        top_node.nodes_from.append(parent)

    chain.add_node(top_node)
    return chain
def get_regr_chain():
    """Return a chain with a 'linear' node over 'xgbreg' and 'knnreg' parents.

    Only the final node is passed to ``add_node``; the primary nodes are
    reachable through its ``nodes_from``.
    """
    # Chain composition
    xgb_parent = PrimaryNode(model_type='xgbreg')
    knn_parent = PrimaryNode(model_type='knnreg')
    top_node = SecondaryNode(model_type='linear',
                             nodes_from=[xgb_parent, knn_parent])

    regr_chain = Chain()
    regr_chain.add_node(top_node)
    return regr_chain
def chain_with_self_cycle():
    """Return a two-node 'logit' chain whose secondary node is its own parent."""
    source = PrimaryNode(model_type='logit')
    looped = SecondaryNode(model_type='logit', nodes_from=[source])

    # The node lists itself among its parents.
    looped.nodes_from.append(looped)

    chain = Chain()
    for node in (source, looped):
        chain.add_node(node)
    return chain
def get_class_chain():
    """Return a chain with a 'logit' node over 'xgboost' and 'knn' parents.

    Only the final node is passed to ``add_node``; the primary nodes are
    reachable through its ``nodes_from``.
    """
    # Chain composition
    xgb_parent = PrimaryNode(model_type='xgboost')
    knn_parent = PrimaryNode(model_type='knn')
    top_node = SecondaryNode(model_type='logit',
                             nodes_from=[xgb_parent, knn_parent])

    class_chain = Chain()
    class_chain.add_node(top_node)
    return class_chain
def chain_with_secondary_nodes_only():
    """Return a chain made of two logit secondary nodes and no primary node.

    The first node has an empty ``nodes_from``; the second uses the first
    as its only parent.
    """
    logit = ModelTypesIdsEnum.logit
    parentless = NodeGenerator.secondary_node(model_type=logit, nodes_from=[])
    follower = NodeGenerator.secondary_node(model_type=logit,
                                            nodes_from=[parentless])

    chain = Chain()
    for node in (parentless, follower):
        chain.add_node(node)
    return chain
def two_level_chain():
    """Return a chain with 'logit' and 'knn' primary nodes under 'xgboost'."""
    logit_parent = PrimaryNode(model_type='logit')
    knn_parent = PrimaryNode(model_type='knn')
    top_node = SecondaryNode(model_type='xgboost',
                             nodes_from=[logit_parent, knn_parent])

    chain = Chain()
    chain.add_node(logit_parent)
    chain.add_node(knn_parent)
    chain.add_node(top_node)
    return chain
def valid_chain():
    """Return a linear chain of four 'logit' nodes.

    One primary node is followed by three secondary nodes, each taking the
    previously created node as its only parent.
    """
    ordered_nodes = [PrimaryNode(model_type='logit')]
    for _ in range(3):
        previous = ordered_nodes[-1]
        ordered_nodes.append(
            SecondaryNode(model_type='logit', nodes_from=[previous]))

    chain = Chain()
    for node in ordered_nodes:
        chain.add_node(node)
    return chain
def chain_third():
    """Return a chain with a qda secondary node over two rf primary nodes.

    The secondary node is added to the chain first, then its parents in the
    order they appear in ``nodes_from``.
    """
    #    QDA
    #  |     \
    # RF     RF
    chain = Chain()
    new_node = NodeGenerator.secondary_node(ModelTypesIdsEnum.qda)
    for model_type in (ModelTypesIdsEnum.rf, ModelTypesIdsEnum.rf):
        new_node.nodes_from.append(NodeGenerator.primary_node(model_type))
    chain.add_node(new_node)
    # A plain loop: add_node is called for its side effect, so there is no
    # reason to build a throwaway list with a comprehension.
    for node_from in new_node.nodes_from:
        chain.add_node(node_from)
    return chain
def chain_with_isolated_components():
    """Return a chain made of two disconnected pairs of 'logit' nodes.

    The first pair is primary -> secondary; the second pair is a secondary
    node with empty ``nodes_from`` followed by a secondary node that uses
    it as parent.
    """
    pair_a_parent = PrimaryNode(model_type='logit')
    pair_a_child = SecondaryNode(model_type='logit', nodes_from=[pair_a_parent])
    pair_b_parent = SecondaryNode(model_type='logit', nodes_from=[])
    pair_b_child = SecondaryNode(model_type='logit', nodes_from=[pair_b_parent])

    chain = Chain()
    chain.add_node(pair_a_parent)
    chain.add_node(pair_a_child)
    chain.add_node(pair_b_parent)
    chain.add_node(pair_b_child)
    return chain
def baseline_chain():
    """Return a chain with knn and logit primary nodes feeding xgboost.

    The primary nodes are added to the chain first, the xgboost node last.
    """
    chain = Chain()
    top_node = NodeGenerator.secondary_node(
        model_type=ModelTypesIdsEnum.xgboost, nodes_from=[])

    for primary_type in (ModelTypesIdsEnum.knn, ModelTypesIdsEnum.logit):
        parent = NodeGenerator.primary_node(primary_type)
        chain.add_node(parent)
        top_node.nodes_from.append(parent)

    chain.add_node(top_node)
    return chain
def chain_with_self_cycle():
    """Return a two-node logit chain whose secondary node is its own parent."""
    logit = ModelTypesIdsEnum.logit
    source = NodeGenerator.primary_node(model_type=logit)
    looped = NodeGenerator.secondary_node(model_type=logit,
                                          nodes_from=[source])

    # The node lists itself among its parents.
    looped.nodes_from.append(looped)

    chain = Chain()
    for node in (source, looped):
        chain.add_node(node)
    return chain
def chain_with_isolated_nodes():
    """Return a three-node linear 'logit' chain plus one unconnected node.

    The extra secondary node has empty ``nodes_from`` and is not a parent of
    any other node.
    """
    head = PrimaryNode(model_type='logit')
    middle = SecondaryNode(model_type='logit', nodes_from=[head])
    tail = SecondaryNode(model_type='logit', nodes_from=[middle])
    loner = SecondaryNode(model_type='logit', nodes_from=[])

    chain = Chain()
    chain.add_node(head)
    chain.add_node(middle)
    chain.add_node(tail)
    chain.add_node(loner)
    return chain
def chain_third():
    """Return a chain with a 'qda' secondary node over two 'rf' primary nodes.

    The secondary node is added to the chain first, then its parents in the
    order they appear in ``nodes_from``.
    """
    #    QDA
    #  |     \
    # RF     RF
    chain = Chain()
    new_node = SecondaryNode('qda')
    for model_type in ('rf', 'rf'):
        new_node.nodes_from.append(PrimaryNode(model_type))
    chain.add_node(new_node)
    # A plain loop: add_node is called for its side effect, so there is no
    # reason to build a throwaway list with a comprehension.
    for node_from in new_node.nodes_from:
        chain.add_node(node_from)
    return chain
def compose_chain() -> Chain:
    """Return a chain with an 'rf' node over 'svc' and 'lda' primary nodes.

    Only the 'rf' node is passed to ``add_node``; the primary nodes are
    attached through its ``nodes_from``.
    """
    composed = Chain()
    svc_parent = PrimaryNode('svc')
    lda_parent = PrimaryNode('lda')

    rf_node = SecondaryNode('rf')
    for parent in (svc_parent, lda_parent):
        rf_node.nodes_from.append(parent)

    composed.add_node(rf_node)
    return composed
def chain_with_multiple_roots():
    """Return a chain where two secondary logit nodes share one primary parent.

    Neither secondary node is a parent of the other, so nothing joins them.
    """
    logit = ModelTypesIdsEnum.logit
    shared_parent = NodeGenerator.primary_node(model_type=logit)
    left_root = NodeGenerator.secondary_node(model_type=logit,
                                             nodes_from=[shared_parent])
    right_root = NodeGenerator.secondary_node(model_type=logit,
                                              nodes_from=[shared_parent])

    chain = Chain()
    chain.add_node(shared_parent)
    chain.add_node(left_root)
    chain.add_node(right_root)
    return chain
def chain_with_cycle():
    """Return a three-node logit chain whose parent links contain a cycle.

    ``second`` and ``third`` list each other in ``nodes_from``.
    """
    logit = ModelTypesIdsEnum.logit
    head = NodeGenerator.primary_node(model_type=logit)
    middle = NodeGenerator.secondary_node(model_type=logit, nodes_from=[head])
    tail = NodeGenerator.secondary_node(model_type=logit,
                                        nodes_from=[middle, head])

    # Close the loop: tail already depends on middle, now middle depends on tail.
    middle.nodes_from.append(tail)

    chain = Chain()
    chain.add_node(head)
    chain.add_node(middle)
    chain.add_node(tail)
    return chain
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Check that parent order does not change a secondary node's output.

    Two chains with the same nodes but a different parent order are fitted
    and compared on train and test predictions. Then the parents of the
    first chain's last node are reordered in place, its cache is cleared,
    the chain is refitted, and the test predictions are compared again.
    """
    train_part, test_part = train_test_data_setup(data_setup)

    logit_node = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.logit)
    lda_node = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.lda)
    knn_node = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.knn)
    final = NodeGenerator.secondary_node(
        model_type=ModelTypesIdsEnum.xgboost,
        nodes_from=[logit_node, lda_node, knn_node])

    chain = Chain()
    chain.add_node(logit_node)
    chain.add_node(lda_node)
    chain.add_node(knn_node)
    chain.add_node(final)

    # Independent copies, so the second chain shares no node objects
    # with the first one.
    logit_copy = deepcopy(logit_node)
    lda_copy = deepcopy(lda_node)
    knn_copy = deepcopy(knn_node)

    final_shuffled = NodeGenerator.secondary_node(
        model_type=ModelTypesIdsEnum.xgboost,
        nodes_from=[knn_copy, logit_copy, lda_copy])

    chain_shuffled = Chain()
    # change order of nodes in list
    for node in (final_shuffled, knn_copy, logit_copy, lda_copy):
        chain_shuffled.add_node(node)

    train_predicted = chain.fit(input_data=train_part)
    train_predicted_shuffled = chain_shuffled.fit(input_data=train_part)

    # train results should be invariant
    assert chain.root_node.descriptive_id == chain_shuffled.root_node.descriptive_id
    assert all(
        np.equal(train_predicted.predict, train_predicted_shuffled.predict))

    test_predicted = chain.predict(input_data=test_part)
    test_predicted_shuffled = chain_shuffled.predict(input_data=test_part)

    # predict results should be invariant
    assert all(
        np.equal(test_predicted.predict, test_predicted_shuffled.predict))

    # change parents order for the nodes fitted chain
    last_added = chain.nodes[3]
    old_parents = last_added.nodes_from
    last_added.nodes_from = [old_parents[idx] for idx in (2, 0, 1)]
    last_added.cache.clear()

    chain.fit(train_part)
    test_predicted_re_shuffled = chain.predict(input_data=test_part)

    # predict results should be invariant
    assert all(
        np.equal(test_predicted.predict, test_predicted_re_shuffled.predict))
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=64,
                                     is_visualise=False):
    """Fit three chains on a time series and print their validation metrics.

    The composite chain from ``get_composite_lstm_chain``, a single 'lstm'
    node and a single 'ridge' node are fitted (in that order) on the train
    file and scored on the test file with ``calculate_validation_metric``.
    Both paths are joined onto ``project_root()``.

    :param train_file_path: train CSV path relative to the project root.
    :param test_file_path: validation CSV path relative to the project root.
    :param forecast_length: forwarded to ``TsForecastingParams``.
    :param max_window_size: forwarded to ``TsForecastingParams``.
    :param is_visualise: forwarded to ``calculate_validation_metric``.
    :return: the metric obtained for the single 'ridge' chain.
    """
    # specify the task to solve
    forecasting_params = TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size)
    task_to_solve = Task(TaskTypesEnum.ts_forecasting, forecasting_params)

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    def _fit_and_validate(model_chain, run_name):
        # Fit on the train set, then score the prediction on the validation set.
        model_chain.fit(input_data=dataset_to_train, verbose=False)
        prediction = model_chain.predict(dataset_to_validate)
        return calculate_validation_metric(prediction, dataset_to_validate,
                                           run_name, is_visualise)

    chain = get_composite_lstm_chain()

    chain_simple = Chain()
    chain_simple.add_node(PrimaryNode('ridge'))

    chain_lstm = Chain()
    chain_lstm.add_node(PrimaryNode('lstm'))

    rmse_on_valid = _fit_and_validate(
        chain, f'full-composite_{forecast_length}')
    rmse_on_valid_lstm_only = _fit_and_validate(
        chain_lstm, f'full-lstm-only_{forecast_length}')
    rmse_on_valid_simple = _fit_and_validate(
        chain_simple, f'full-simple_{forecast_length}')

    print(f'RMSE composite: {rmse_on_valid}')
    print(f'RMSE simple: {rmse_on_valid_simple}')
    print(f'RMSE LSTM only: {rmse_on_valid_lstm_only}')

    return rmse_on_valid_simple
def get_composite_lstm_chain():
    """Return a chain that models trend and residual branches separately.

    'trend_data_model' feeds a 'lasso' node and 'residual_data_model' feeds
    a 'ridge' node; both branches join in an 'additive_data_model' node.
    Only that final node is passed to ``add_node``.
    """
    composite = Chain()

    trend_source = PrimaryNode('trend_data_model')
    trend_branch = SecondaryNode('lasso', nodes_from=[trend_source])

    residual_source = PrimaryNode('residual_data_model')
    residual_branch = SecondaryNode('ridge', nodes_from=[residual_source])

    joined = SecondaryNode('additive_data_model',
                           nodes_from=[residual_branch, trend_branch])
    composite.add_node(joined)
    return composite
def compose_chain(self, data: InputData, initial_chain: Optional[Chain],
                  composer_requirements: ComposerRequirements,
                  metrics: Optional[Callable], optimiser_parameters=None,
                  is_visualise: bool = False) -> Chain:
    """Build a fixed-shape chain from the models listed in the requirements.

    The shape is selected by ``self.dummy_chain_type``:

    * ``hierarchical`` - every model in ``composer_requirements.primary``
      becomes a primary node feeding one secondary node built from
      ``composer_requirements.secondary[0]``;
    * ``flat`` - ``composer_requirements.primary[0]`` starts a linear
      sequence continued by one secondary node per entry of
      ``composer_requirements.secondary``.

    ``data``, ``initial_chain``, ``metrics``, ``optimiser_parameters`` and
    ``is_visualise`` are not used in this body.

    :return: the newly built chain.
    :raises NotImplementedError: for any other ``dummy_chain_type``.
    """
    new_chain = Chain()
    chain_type = self.dummy_chain_type

    if chain_type == DummyChainTypeEnum.hierarchical:
        # (y1, y2) -> y
        top_node = NodeGenerator.secondary_node(
            composer_requirements.secondary[0])
        for primary_model in composer_requirements.primary:
            parent = NodeGenerator.primary_node(primary_model)
            new_chain.add_node(parent)
            top_node.nodes_from.append(parent)
        new_chain.add_node(top_node)
    elif chain_type == DummyChainTypeEnum.flat:
        # (y1) -> (y2) -> y
        tail = NodeGenerator.primary_node(composer_requirements.primary[0])
        new_chain.add_node(tail)
        for secondary_model in composer_requirements.secondary:
            successor = NodeGenerator.secondary_node(secondary_model)
            successor.nodes_from = [tail]
            tail = successor
            new_chain.add_node(successor)
    else:
        raise NotImplementedError()

    return new_chain
def chain_with_isolated_nodes():
    """Return a three-node linear logit chain plus one unconnected node.

    The extra secondary node has empty ``nodes_from`` and is not a parent of
    any other node.
    """
    logit = ModelTypesIdsEnum.logit
    head = NodeGenerator.primary_node(model_type=logit)
    middle = NodeGenerator.secondary_node(model_type=logit, nodes_from=[head])
    tail = NodeGenerator.secondary_node(model_type=logit, nodes_from=[middle])
    loner = NodeGenerator.secondary_node(model_type=logit, nodes_from=[])

    chain = Chain()
    chain.add_node(head)
    chain.add_node(middle)
    chain.add_node(tail)
    chain.add_node(loner)
    return chain
def default_valid_chain():
    """Return a diamond-shaped chain of four logit nodes.

    One primary node feeds two secondary nodes, which both feed the final
    secondary node.
    """
    logit = ModelTypesIdsEnum.logit
    source = NodeGenerator.primary_node(model_type=logit)
    left = NodeGenerator.secondary_node(model_type=logit, nodes_from=[source])
    right = NodeGenerator.secondary_node(model_type=logit, nodes_from=[source])
    sink = NodeGenerator.secondary_node(model_type=logit,
                                        nodes_from=[left, right])

    chain = Chain()
    chain.add_node(source)
    chain.add_node(left)
    chain.add_node(right)
    chain.add_node(sink)
    return chain
def get_composite_lstm_chain():
    """Return a trend/residual chain with some nodes labelled "fixed".

    'trend_data_model' and 'residual_data_model' each feed a 'linear' node;
    both branches join in an 'additive_data_model' node. The trend source
    and the final node get ``labels = ["fixed"]``. Only the final node is
    passed to ``add_node``.
    """
    composite = Chain()

    trend_source = PrimaryNode('trend_data_model')
    trend_source.labels = ["fixed"]
    trend_branch = SecondaryNode('linear', nodes_from=[trend_source])
    # NOTE(review): the same label is assigned to the trend source a second
    # time; it looks redundant and may have been meant for another node -
    # confirm with the author. Kept to preserve behaviour.
    trend_source.labels = ["fixed"]

    residual_source = PrimaryNode('residual_data_model')
    residual_branch = SecondaryNode('linear', nodes_from=[residual_source])

    joined = SecondaryNode('additive_data_model',
                           nodes_from=[residual_branch, trend_branch])
    joined.labels = ["fixed"]

    composite.add_node(joined)
    return composite
def test_regression_chain_fit_correct():
    """Fit a single-node 'rfr' chain and check its test RMSE.

    The RMSE must stay below 1.5 standard deviations of the full target.
    """
    dataset = get_synthetic_ts_data()

    rfr_chain = Chain()
    rfr_chain.add_node(PrimaryNode('rfr'))

    train_part, test_part = train_test_data_setup(dataset)
    rfr_chain.fit(input_data=train_part)

    rmse_pair = get_rmse_value(rfr_chain, train_part, test_part)
    rmse_on_test = rmse_pair[1]

    allowed_rmse = np.std(dataset.target) * 1.5
    assert rmse_on_test < allowed_rmse
def chain_third():
    """Return a chain with an xgboost node over knn, lda and knn parents.

    The primary nodes are added to the chain first, the xgboost node last.
    """
    #      XG
    #  |   |   \
    # KNN LDA KNN
    parent_types = (ModelTypesIdsEnum.knn, ModelTypesIdsEnum.lda,
                    ModelTypesIdsEnum.knn)

    tree_root = NodeGenerator.secondary_node(ModelTypesIdsEnum.xgboost)
    tree_root.nodes_from.extend(
        NodeGenerator.primary_node(parent_type) for parent_type in parent_types)

    chain = Chain()
    for parent in tree_root.nodes_from:
        chain.add_node(parent)
    chain.add_node(tree_root)

    return chain