def test_pipelinesearcher(makedirs_mock):
    # static methods
    assert hasattr(PipelineSearcher, '_find_datasets')
    assert hasattr(PipelineSearcher, '_new_pipeline')

    # default parameters
    instance = PipelineSearcher()

    makedirs_mock.assert_called_with(instance.ranked_dir, exist_ok=True)
    assert instance.input == 'input'
    assert instance.output == 'output'
    assert not instance.dump
    assert instance.ranked_dir == '{}/pipelines_ranked'.format(instance.output)
    assert isinstance(instance.data_pipeline, Pipeline)
    assert isinstance(instance.scoring_pipeline, Pipeline)

    # other parameters
    instance = PipelineSearcher(input_dir='new-input', output_dir='new-output', dump=True)

    makedirs_mock.assert_called_with(instance.ranked_dir, exist_ok=True)
    assert instance.input == 'new-input'
    assert instance.output == 'new-output'
    assert instance.dump
    assert instance.ranked_dir == '{}/pipelines_ranked'.format(instance.output)
    assert isinstance(instance.data_pipeline, Pipeline)
    assert isinstance(instance.scoring_pipeline, Pipeline)
    assert instance.datasets == {}

def test_pipelinesearcher_stop():
    instance = PipelineSearcher()

    assert not hasattr(instance, '_stop')

    # setting _stop
    instance.stop()

    assert instance._stop

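# A minimal sketch (not part of the original module) of the imports and patch fixtures
# these tests assume. The `ta2.search` patch target and the d3m import path are
# illustrative assumptions, not confirmed by the source.
import json
from datetime import timedelta
from unittest.mock import MagicMock, call, mock_open, patch

import pytest

from d3m.metadata.pipeline import Pipeline              # assumed origin of `Pipeline`
from ta2.search import PIPELINES_DIR, PipelineSearcher  # module under test


@patch('ta2.search.os.makedirs')  # assumed target behind the `makedirs_mock` fixture
def test_pipelinesearcher_makedirs_sketch(makedirs_mock):
    # with os.makedirs patched, instantiation must not create directories on disk
    PipelineSearcher()
    assert makedirs_mock.called
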
def search(dataset_root, problem, args):
    pps = PipelineSearcher(
        args.input,
        args.output,
        args.static,
        dump=True,
        hard_timeout=args.hard,
    )

    return pps.search(problem, args.timeout, args.budget, args.template)

def test_pipelinesearcher_setup_search():
    instance = PipelineSearcher()

    assert hasattr(instance, 'solutions')
    assert not hasattr(instance, '_stop')
    assert not hasattr(instance, 'done')
    assert not hasattr(instance, 'start_time')
    assert not hasattr(instance, 'timeout')
    assert not hasattr(instance, 'max_end_time')

    # without timeout
    instance.timeout = None
    instance.setup_search()

    assert instance.solutions == []
    assert instance._stop is False
    assert instance.done is False
    assert hasattr(instance, 'start_time')
    assert instance.timeout is None
    assert instance.max_end_time is None

    # with timeout
    instance.timeout = 0.5
    instance.setup_search()

    assert instance.timeout == 0.5
    assert instance.max_end_time == instance.start_time + timedelta(seconds=0.5)

def test_pipelinesearcher_load_pipeline(json_loader_mock, yaml_loader_mock):
    instance = PipelineSearcher()
    open_mock = mock_open(read_data='data')

    json_loader_mock.reset_mock()
    yaml_loader_mock.reset_mock()

    # yaml file
    with patch('ta2.search.open', open_mock) as _:
        instance._load_pipeline('test.yml')

    open_mock.assert_called_with('{}/test.yml'.format(PIPELINES_DIR), 'r')
    assert yaml_loader_mock.call_count == 1
    assert json_loader_mock.call_count == 0

    # json file
    with patch('ta2.search.open', open_mock) as _:
        instance._load_pipeline('test.json')

    open_mock.assert_called_with('{}/test.json'.format(PIPELINES_DIR), 'r')
    assert yaml_loader_mock.call_count == 1
    assert json_loader_mock.call_count == 1

    # without file extension
    with patch('ta2.search.open', open_mock) as _:
        instance._load_pipeline('test')

    open_mock.assert_called_with('{}/test.json'.format(PIPELINES_DIR), 'r')
    assert yaml_loader_mock.call_count == 1
    assert json_loader_mock.call_count == 2

def test_pipelinesearcher_find_datasets(tmp_path):
    input_dir = tmp_path / 'test-input'
    input_dir.mkdir()

    content = {
        'about': {
            'datasetID': None
        }
    }

    num_datasets = 3
    for i in range(num_datasets):
        dataset_dir = input_dir / 'dataset-{}'.format(i)
        dataset_dir.mkdir()

        content['about']['datasetID'] = 'dataset-{}'.format(i)

        file = dataset_dir / 'datasetDoc.json'
        file.write_text(json.dumps(content))

    result = PipelineSearcher._find_datasets(input_dir)

    assert len(result) == num_datasets
    for i in range(num_datasets):
        dataset_id = 'dataset-{}'.format(i)
        assert dataset_id in result
        assert result[dataset_id] == 'file://{}/{}/datasetDoc.json'.format(input_dir, dataset_id)

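# A sketch of what `_find_datasets` is expected to return, inferred only from the
# assertions above (the real implementation is not shown in the source): a dict mapping
# each datasetID found in a datasetDoc.json under the input directory to a file:// URI.
import json
import os


def _assumed_find_datasets(input_dir):
    datasets = {}
    for name in os.listdir(str(input_dir)):
        doc_path = os.path.join(str(input_dir), name, 'datasetDoc.json')
        if os.path.isfile(doc_path):
            with open(doc_path, 'r') as doc_file:
                dataset_id = json.load(doc_file)['about']['datasetID']

            datasets[dataset_id] = 'file://' + doc_path

    return datasets
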
def test_pipelinesearcher_get_template(logger_mock):
    instance = PipelineSearcher()
    data = {'problem': {'task_type': None}}

    # classification
    data['problem']['task_type'] = TaskType.CLASSIFICATION
    result = instance._get_template(None, data)  # dataset (None) is not used

    assert logger_mock.call_count == 1
    assert result == 'xgb_classification.all_hp.yml'

    # regression
    data['problem']['task_type'] = TaskType.REGRESSION
    result = instance._get_template(None, data)  # dataset (None) is not used

    assert logger_mock.call_count == 2
    assert result == 'xgb_regression.all_hp.yml'

    # not supported
    data['problem']['task_type'] = 'other-task-type'

    with pytest.raises(ValueError):
        instance._get_template(None, data)  # dataset (None) is not used

def test_pipelinesearcher_defaults(makedirs_mock):
    instance = PipelineSearcher()

    expected_calls = [
        call('output/pipelines_ranked', exist_ok=True),
        call('output/pipelines_scored', exist_ok=True),
        call('output/pipelines_searched', exist_ok=True),
    ]
    assert makedirs_mock.call_args_list == expected_calls

    assert instance.input == 'input'
    assert instance.output == 'output'
    assert not instance.dump
    assert instance.ranked_dir == 'output/pipelines_ranked'
    assert instance.scored_dir == 'output/pipelines_scored'
    assert instance.searched_dir == 'output/pipelines_searched'
    assert isinstance(instance.data_pipeline, Pipeline)
    assert isinstance(instance.scoring_pipeline, Pipeline)

def test_pipelinesearcher(makedirs_mock, from_yaml_mock):
    instance = PipelineSearcher(input_dir='new-input', output_dir='new-output', dump=True)

    expected_calls = [
        call('new-output/pipeline_runs', exist_ok=True),
        call('new-output/pipelines_ranked', exist_ok=True),
        call('new-output/pipelines_scored', exist_ok=True),
        call('new-output/pipelines_searched', exist_ok=True),
    ]
    assert makedirs_mock.call_args_list == expected_calls

    assert instance.input == 'new-input'
    assert instance.output == 'new-output'
    assert instance.dump
    assert instance.ranked_dir == 'new-output/pipelines_ranked'
    assert instance.scored_dir == 'new-output/pipelines_scored'
    assert instance.searched_dir == 'new-output/pipelines_searched'
    assert instance.data_pipeline == from_yaml_mock.return_value
    assert instance.scoring_pipeline == from_yaml_mock.return_value

def test_pipelinesearcher_save_pipeline(random_mock):
    id = 'test-id'
    score = 1.0

    random_mock.return_value = 2

    pipeline_mock = MagicMock(id=id, score=score)
    pipeline_mock.to_json_structure = MagicMock(return_value={})
    open_mock = mock_open()

    # avoid saving the pipeline to a file
    instance = PipelineSearcher(dump=False)
    instance.solutions = []  # normally set in `PipelineSearcher.setup_search`

    result = instance._save_pipeline(pipeline_mock, None)  # normalized_score (None) not used in this case

    assert result is None
    assert pipeline_mock.to_json_structure.call_count == 1
    assert instance.solutions == [{'score': score}]
    assert not random_mock.called
    assert not open_mock.called

    # saving the pipeline to a file (dump=True)
    instance = PipelineSearcher(dump=True)
    instance.solutions = []  # normally set in `PipelineSearcher.setup_search`

    with patch('ta2.search.open', open_mock) as _:
        result = instance._save_pipeline(pipeline_mock, 1)

    assert result is None
    assert pipeline_mock.to_json_structure.call_count == 2
    assert instance.solutions == [{'score': score, 'pipeline_rank': 2.e-12}]
    assert random_mock.call_count == 1
    assert open_mock.call_count == 1
    open_mock.assert_called_with('{}/{}.json'.format(instance.ranked_dir, id), 'w')

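# A hedged sketch of the ranking formula the assertions above imply (not taken from the
# source): with `random` mocked to return 2 and a normalized score of 1, an expected rank
# of 2.e-12 is consistent with `pipeline_rank = (1 - normalized_score) + random() * 1e-12`.
def _assumed_pipeline_rank(normalized_score, random_value):
    # the tiny jitter keeps equal-score pipelines from colliding on the same rank
    return (1 - normalized_score) + random_value * 1e-12


assert _assumed_pipeline_rank(1, 2) == 2.e-12
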
def test_pipelinesearcher(makedirs_mock):
    instance = PipelineSearcher(input_dir='new-input', output_dir='new-output', dump=True)

    expected_calls = [
        call('new-output/pipelines_ranked', exist_ok=True),
        call('new-output/pipelines_scored', exist_ok=True),
        call('new-output/pipelines_searched', exist_ok=True),
    ]
    assert makedirs_mock.call_args_list == expected_calls

    assert instance.input == 'new-input'
    assert instance.output == 'new-output'
    assert instance.dump
    assert instance.ranked_dir == 'new-output/pipelines_ranked'
    assert instance.scored_dir == 'new-output/pipelines_scored'
    assert instance.searched_dir == 'new-output/pipelines_searched'
    assert isinstance(instance.data_pipeline, Pipeline)
    assert isinstance(instance.scoring_pipeline, Pipeline)
    assert instance.datasets == {}

def test_pipelinesearcher_check_stop(datetime_mock):
    datetime_mock.now = MagicMock(return_value=10)

    # no stop
    instance = PipelineSearcher()
    instance._stop = False   # normally set in `PipelineSearcher.setup_search`
    instance.timeout = None  # normally set in `PipelineSearcher.setup_search`

    assert instance.check_stop() is None

    # stop by `_stop` attribute
    instance._stop = True

    with pytest.raises(KeyboardInterrupt):
        instance.check_stop()

    # stop by `max_end_time`
    instance._stop = False
    instance.timeout = 10
    instance.max_end_time = 5

    with pytest.raises(KeyboardInterrupt):
        instance.check_stop()

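# A sketch of the stop conditions exercised above, inferred from the assertions only
# (not the actual implementation): `check_stop` is expected to raise KeyboardInterrupt
# when `_stop` is set, or when a timeout is configured and the current time has passed
# `max_end_time`; otherwise it returns None.
def _assumed_check_stop(stop_flag, now, timeout, max_end_time):
    if stop_flag or (timeout is not None and now > max_end_time):
        raise KeyboardInterrupt()
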
def test_pipelinesearcher_score_pipeline(evaluate_mock):
    instance = PipelineSearcher()

    expected_scores = [MagicMock(value=[1])]
    evaluate_mock.return_value = (expected_scores, expected_scores)

    # parameters
    dataset = {}
    problem = {'problem': {'performance_metrics': None}}
    pipeline_mock = MagicMock()
    metrics = {'test': 'metric'}
    random_seed = 0
    folds = 5
    stratified = False
    shuffle = False
    data_params = {
        'number_of_folds': json.dumps(folds),
        'stratified': json.dumps(stratified),
        'shuffle': json.dumps(shuffle),
    }

    # with custom metrics
    instance.score_pipeline(
        dataset, problem, pipeline_mock,
        metrics=metrics,
        random_seed=random_seed,
        folds=folds,
        stratified=stratified,
        shuffle=shuffle,
    )

    evaluate_mock.assert_called_with(
        pipeline=pipeline_mock,
        inputs=[dataset],
        data_pipeline=instance.data_pipeline,
        scoring_pipeline=instance.scoring_pipeline,
        problem_description=problem,
        data_params=data_params,  # folds, stratified, shuffle
        metrics=metrics,          # custom metrics
        context=Context.TESTING,
        random_seed=random_seed,
        data_random_seed=random_seed,
        scoring_random_seed=random_seed,
        volumes_dir=instance.static,
    )

    assert pipeline_mock.cv_scores == [score.value[0] for score in expected_scores]

    # with problem metrics
    instance.score_pipeline(
        dataset, problem, pipeline_mock,
        metrics=None,
        random_seed=random_seed,
        folds=folds,
        stratified=stratified,
        shuffle=shuffle,
    )

    evaluate_mock.assert_called_with(
        pipeline=pipeline_mock,
        inputs=[dataset],
        data_pipeline=instance.data_pipeline,
        scoring_pipeline=instance.scoring_pipeline,
        problem_description=problem,
        data_params=data_params,                             # folds, stratified, shuffle
        metrics=problem['problem']['performance_metrics'],   # problem metrics
        context=Context.TESTING,
        random_seed=random_seed,
        data_random_seed=random_seed,
        scoring_random_seed=random_seed,
        volumes_dir=instance.static,
    )

    assert pipeline_mock.cv_scores == [score.value[0] for score in expected_scores]

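# Note on `data_params` above: the values are JSON-encoded strings rather than Python
# objects. A small sketch of the encoding this test mirrors, using the test's own values:
data_params_example = {
    'number_of_folds': json.dumps(5),  # -> '5'
    'stratified': json.dumps(False),   # -> 'false'
    'shuffle': json.dumps(False),      # -> 'false'
}
assert data_params_example['stratified'] == 'false'
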
def search(dataset_root, problem, args):
    pps = PipelineSearcher(args.input, args.output, dump=True)

    return pps.search(problem, timeout=args.timeout, budget=args.budget)

def process_dataset(dataset_name, dataset, problem, args):
    box_print("Processing dataset {}".format(dataset_name), True)

    output_path = os.path.join(args.output, dataset_name)
    os.makedirs(output_path, exist_ok=True)

    LOGGER.info("Searching Pipeline for dataset {}".format(dataset_name))
    try:
        start_ts = datetime.utcnow()
        pps = PipelineSearcher(
            args.input,
            output_path,
            args.static,
            dump=True,
            hard_timeout=args.hard,
            ignore_errors=args.ignore_errors,
            cv_folds=args.folds,
            subprocess_timeout=args.subprocess_timeout,
            max_errors=args.max_errors,
            store_summary=True,
        )
        result = pps.search(dataset, problem, args.timeout, args.budget, args.templates_csv)

        result['elapsed'] = datetime.utcnow() - start_ts
        result['dataset'] = dataset_name

    except Exception as ex:
        result = {
            'dataset': dataset_name,
            'error': '{}: {}'.format(type(ex).__name__, ex),
        }

    else:
        try:
            summary = result.pop('summary')
            candidates = _select_candidates(summary)
            if candidates.empty:
                box_print('No valid pipelines found for dataset {}'.format(dataset_name))
            else:
                ranked_path = os.path.join(output_path, 'pipelines_ranked')
                test_scores = list()
                for _, candidate in candidates.iterrows():
                    try:
                        pipeline = candidate.pipeline
                        pipeline_path = os.path.join(ranked_path, pipeline)
                        test_score = score_pipeline(dataset, problem, pipeline_path,
                                                    args.static, output_path)
                        test_scores.append(test_score)
                    except Exception:
                        test_scores.append(None)

                candidates['test_score'] = test_scores
                candidates = candidates.sort_values('test_score', ascending=False)

                best = candidates.iloc[0]
                result['test_score'] = best.test_score
                result['template'] = best.template
                result['cv_score'] = best.score

                box_print('Best pipelines for dataset {}:\n{}'.format(
                    dataset_name, candidates.to_string()))

        except Exception as ex:
            LOGGER.exception('Error while testing the winner pipeline')
            result['error'] = 'TEST Error: {}'.format(ex)

    return result
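
# A hedged usage sketch for `process_dataset`: it only needs an object exposing the
# attributes read above, so a plain argparse.Namespace works for ad-hoc runs. The values
# below are illustrative assumptions; only the attribute names come from the code above,
# and loading `dataset_name`, `dataset` and `problem` is left to the caller.
from argparse import Namespace

example_args = Namespace(
    input='input', output='output', static='static',
    hard=False, ignore_errors=False, folds=5,
    subprocess_timeout=None, max_errors=5,
    timeout=60, budget=None, templates_csv=None,
)
# result = process_dataset(dataset_name, dataset, problem, example_args)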