def add_all_tasks_to_dvc_pipeline(
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> None:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    create_directories_if_necessary(bohr_repo)
    all_tasks = sorted(bohr_repo.tasks.values(), key=lambda x: x.name)
    logger.info(
        f"The following tasks are added to the pipeline: {list(map(lambda x: x.name, all_tasks))}"
    )
    all_keys = set()
    for keys in map(lambda t: t.datasets.keys(), all_tasks):
        all_keys.update(keys)
    all_datasets_used_in_tasks = list(map(lambda key: bohr_repo.datasets[key], all_keys))
    logger.info("Datasets used in tasks:")
    for dataset in all_datasets_used_in_tasks:
        linked_datasets = dataset.get_linked_datasets()
        logger.info(
            f"{dataset.name} {'-> ' + str(list(map(lambda d: d.name, linked_datasets))) if linked_datasets else ''}"
        )
    transient_stages = []
    commands: List[DvcCommand] = []
    for dataset_name, dataset in bohr_repo.datasets.items():
        if dataset.preprocessor == "copy":
            copy_command = PreprocessCopyCommand(path_config, dataset)
            commands.append(copy_command)
            transient_stages.append(copy_command.get_name())
        elif dataset.preprocessor == "7z":
            extract_command = Preprocess7zCommand(path_config, dataset)
            commands.append(extract_command)
            transient_stages.append(extract_command.get_name())
        else:
            commands.append(PreprocessShellCommand(path_config, dataset))
    commands.append(ParseLabelsCommand(path_config))
    for task in all_tasks:
        for heuristic_group in task.heuristic_groups:
            for dataset_name, dataset in task.datasets.items():
                datasets = [dataset_name] + list(
                    map(lambda d: d.name, dataset.get_linked_datasets())
                )
                commands.append(
                    ApplyHeuristicsCommand(path_config, task, heuristic_group, datasets)
                )
        commands.append(CombineHeuristicsCommand(path_config, task))
        commands.append(TrainLabelModelCommand(path_config, task))
        for dataset_name in task.datasets:
            commands.append(LabelDatasetCommand(path_config, task, dataset_name))
    if path_config.manual_stages.exists():
        root, dirs, files = next(os.walk(path_config.manual_stages))
        for file in files:
            commands.append(ManualCommand(path_config, Path(root) / file))
    for command in commands:
        completed_process = command.run()
        if completed_process.returncode != 0:
            print(completed_process.stderr.decode())
            break
    save_transient_stages_to_config(transient_stages, path_config)

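
# Usage sketch (an illustrative call, not from the original source): with a
# bohr.json in the project root, both arguments can be omitted and the
# pipeline is assembled for every defined task:
#
#     add_all_tasks_to_dvc_pipeline()
#
# Stage order, as built above: per-dataset preprocessing (copy/7z/shell),
# label parsing, then per task: apply heuristics per group, combine them,
# train the label model, and label each of the task's datasets; manual
# stages found under path_config.manual_stages run last.
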
def status(
    bohr_repo: Optional[BohrRepo] = None, path_config: Optional[PathConfig] = None
) -> str:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    bohr_repo.dump(path_config.project_root)
    refresh_if_necessary(path_config)
    return dvc.status(path_config)

def label_dataset(task: str, dataset: str, debug: bool):
    from bohr.pipeline.label_dataset import label_dataset

    setup_loggers()
    bohr_repo = load_bohr_repo()
    task = bohr_repo.tasks[task]
    dataset = bohr_repo.datasets[dataset]
    label_dataset(task, dataset, debug=debug)

def add(
    path: Path,
    artifact: str,
    name: Optional[str] = None,
    author: Optional[str] = None,
    description: Optional[str] = "",
    format: Optional[str] = None,
    preprocessor: Optional[str] = None,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> Dataset:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    destination_path = path_config.downloaded_data / path.name
    logger.info(f"Copying {path.name} to {destination_path} ...")
    shutil.copy(path, destination_path)
    dvc_output = dvc.add(destination_path, path_config.project_root)
    logger.info(dvc_output)
    file_name = path.name
    if preprocessor is None:
        file_name, preprocessor = extract_preprocessor_from_file_name(file_name)
    if format is None:
        file_name, format = extract_format_from_file_name(file_name)
    dataset_name = name or file_name
    if dataset_name in bohr_repo.datasets:
        message = f"Dataset with name {dataset_name} already exists."
        if name is None:
            message += (
                "\nAre you trying to add the same dataset twice?\n"
                "If not, please specify the `name` parameter explicitly."
            )
        raise ValueError(message)
    try:
        mapper = default_mappers[artifact_map[artifact]]
    except KeyError:
        mapper = load_class_by_full_path(artifact)
    path_preprocessed: RelativePath = get_preprocessed_path(
        None,
        relative_to_safe(destination_path, path_config.downloaded_data),
        path_config.data_dir,
        preprocessor,
    )
    dataset = Dataset(
        dataset_name,
        author,
        description,
        path_preprocessed=path_preprocessed,
        path_dist=path_config.downloaded_data_dir / path.name,
        dataloader=CsvDatasetLoader(path_preprocessed, mapper()),
        preprocessor=preprocessor,
    )
    bohr_repo.datasets[dataset.name] = dataset
    bohr_repo.dump(path_config.project_root)
    repro(bohr_repo=bohr_repo, path_config=path_config)
    return dataset

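
# Usage sketch (hypothetical file name and artifact key; valid artifact keys
# come from artifact_map, which is defined elsewhere in the codebase):
#
#     from pathlib import Path
#     dataset = add(Path("commits.csv.7z"), artifact="commit")
#
# Here the "7z" preprocessor and "csv" format would presumably be inferred
# from the file name suffixes by extract_preprocessor_from_file_name and
# extract_format_from_file_name, and the stripped file name would become the
# dataset name unless `name` is given explicitly.
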
def create_directories_if_necessary(bohr_repo: Optional[BohrRepo] = None) -> None:
    bohr_repo = bohr_repo or load_bohr_repo()
    path_config = PathConfig.load()
    for task in bohr_repo.tasks.values():
        for heuristic_group in task.heuristic_groups:
            (path_config.generated / task.name / heuristic_group).mkdir(
                exist_ok=True, parents=True
            )
            (path_config.metrics / task.name / heuristic_group).mkdir(
                exist_ok=True, parents=True
            )
    path_config.labeled_data.mkdir(exist_ok=True, parents=True)

def add(
    name: str,
    artifact: str,
    labels: str,
    label_column: str,
    authors: str,
    description: str,
    use_all_datasets: bool,
    repro: bool,
    force: bool,
    verbose: bool,
) -> None:
    with verbosity(verbose):
        project_root = find_project_root()
        bohr_repo = load_bohr_repo(project_root)
        path_config = PathConfig.load(project_root)
        if name in bohr_repo.tasks and not force:
            logger.error(f"Task {name} is already defined")
            exit(400)
        try:
            artifact_type = artifact_map[artifact]
        except KeyError:
            logger.error(f"Artifact not found: {artifact}")
            exit(404)
        label_list = list(map(lambda s: s.strip(), labels.split(",")))
        if not use_all_datasets:
            train_datasets, test_datasets = {}, {}
        else:
            all_datasets = {
                n: d
                for n, d in bohr_repo.datasets.items()
                if d.artifact_type == artifact_type
            }
            train_datasets, test_datasets = train_and_test(all_datasets, label_column)
        heuristic_groups = get_heuristic_module_list(
            artifact_type, path_config.heuristics
        )
        task = Task(
            name,
            authors,
            description,
            artifact_type,
            label_list,
            train_datasets,
            test_datasets,
            label_column,
            heuristic_groups,
        )
        bohr_repo.tasks[name] = task
        bohr_repo.dump(project_root)
        if repro:
            logger.info("Re-running the pipeline ...")
            api.repro(name, bohr_repo=bohr_repo)

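
# Usage sketch (hypothetical argument values; "bugginess" mirrors the task
# name used in the __main__ example at the end of this section, and the
# label/artifact strings are illustrative assumptions):
#
#     add(
#         name="bugginess",
#         artifact="commit",
#         labels="BugFix, NonBugFix",
#         label_column="bug",
#         authors="jane.doe",
#         description="Classify commits as bug-fixing or not",
#         use_all_datasets=True,
#         repro=False,
#         force=False,
#         verbose=False,
#     )
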
def train_label_model(task: str, target_dataset: str):
    from bohr.pipeline.train_label_model import train_label_model

    setup_loggers()
    bohr_repo = load_bohr_repo()
    path_config = PathConfig.load()
    task = bohr_repo.tasks[task]
    target_dataset = bohr_repo.datasets[target_dataset]
    stats = train_label_model(task, target_dataset, path_config)
    with open(path_config.metrics / task.name / "label_model_metrics.json", "w") as f:
        json.dump(stats, f)
    pprint(stats)

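
# Usage sketch (task and dataset names as defined in bohr.json; "bugginess"
# and "berger" appear in the __main__ example at the end of this section):
#
#     train_label_model("bugginess", "berger")
#
# Metrics are dumped to metrics/<task>/label_model_metrics.json and
# pretty-printed to stdout.
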
def add_dataset(task: str, dataset: str, repro: bool) -> None:
    bohr_repo = load_bohr_repo()
    if task not in bohr_repo.tasks:
        logger.error(f"Task {task} is not defined")
        exit(404)
    if dataset not in bohr_repo.datasets:
        logger.error(f"Dataset {dataset} is not defined")
        exit(404)
    dataset = api.add_dataset(
        bohr_repo.tasks[task], bohr_repo.datasets[dataset], bohr_repo
    )
    print(f"Dataset {dataset.name} is added to the task {task}.")
    if repro:
        logger.info("Re-running the pipeline ...")
        api.repro(task, bohr_repo=bohr_repo)

def apply_heuristics(
    task: str, heuristic_group: Optional[str], dataset: Optional[str], profile: bool
):
    from bohr.pipeline.apply_heuristics import apply_heuristics
    from bohr.pipeline.combine_heuristics import combine_applied_heuristics

    setup_loggers()
    bohr_repo = load_bohr_repo()
    task = bohr_repo.tasks[task]
    if heuristic_group:
        with Profiler(enabled=profile):
            dataset = bohr_repo.datasets[dataset]
            apply_heuristics(task, heuristic_group, dataset)
    else:
        combine_applied_heuristics(task)

def add_dataset(
    task: Task,
    dataset: Dataset,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> Dataset:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    is_test_set = dataset.is_column_present(task.label_column_name)
    logger.info(
        f'Adding dataset {dataset.name} as a {"test" if is_test_set else "train"} set'
    )
    task.add_dataset(dataset, is_test_set)
    bohr_repo.dump(path_config.project_root)
    return dataset

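
# Usage sketch: whether the dataset becomes a test or a train set is decided
# solely by the presence of the task's ground-truth column. Task and dataset
# names here mirror the __main__ example at the end of this section:
#
#     bohr_repo = load_bohr_repo()
#     add_dataset(bohr_repo.tasks["bugginess"], bohr_repo.datasets["berger"], bohr_repo)
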
def pull(
    task: str,
    target: str,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> RelativePath:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    path = path_config.labeled_data_dir / task / f"{target}.labeled.csv"
    if path.exists():
        logger.info(dvc.pull([str(path)]))
        return path
    else:
        raise BohrDatasetNotFound(
            f"Dataset {target} in task {task} not found! "
            f"Available datasets in this task: {list(bohr_repo.tasks[task].datasets.keys())}"
        )

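
# Usage sketch (names as defined in bohr.json): pulls the labeled CSV for a
# dataset from DVC storage, or raises BohrDatasetNotFound if the pipeline has
# never produced it:
#
#     labeled_csv_path = pull("bugginess", "berger")
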
def ls(task: Optional[str], extended_list: bool) -> None:
    bohr_repo = load_bohr_repo()
    if task:
        if task not in bohr_repo.tasks:
            logger.error(
                f"Task not found in the config: {task}. \n"
                f"Defined tasks: {list(bohr_repo.tasks.keys())}"
            )
            exit(404)
        datasets = bohr_repo.tasks[task].datasets
    else:
        datasets = bohr_repo.datasets
    if extended_list:
        print(
            tabulate(
                [
                    [dataset_name, textwrap.fill(dataset.description)]
                    for dataset_name, dataset in datasets.items()
                ],
                tablefmt="fancy_grid",
            )
        )
    else:
        for dataset in datasets:
            print(dataset)

def repro(
    task: Optional[str] = None,
    only_transient: bool = False,
    force: bool = False,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> None:
    """
    # >>> import tempfile
    # >>> with tempfile.TemporaryDirectory() as tmpdirname:
    # ...     with open(Path(tmpdirname) / 'bohr.json', 'w') as f:
    # ...         print(f.write('{"bohr_framework_version": "0.3.9-rc", "tasks": {}, "datasets": {}}'))
    # ...     get_dvc_commands_to_repro(None, False, load_config(Path(tmpdirname)))
    """
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    refresh_if_necessary(path_config)
    paths_to_pull = [str(d.path_dist) for d in bohr_repo.datasets.values()]
    if len(paths_to_pull) > 0:
        logger.info(dvc.pull(paths_to_pull))
    # TODO: run only task-related transient stages if task is passed
    transient_stages = load_transient_stages(path_config)
    if len(transient_stages) > 0:
        logger.info(dvc.repro(transient_stages, force=force, path_config=path_config))
    if not only_transient:
        glob = None
        if task:
            if task not in bohr_repo.tasks:
                raise ValueError(f"Task {task} not found in bohr.json")
            glob = f"{task}_*"
        logger.info(
            dvc.repro(pull=True, glob=glob, force=force, path_config=path_config)
        )

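
# Usage sketch: re-run only the stages belonging to one task, forcing
# re-execution. Note that all transient stages (e.g. dataset preprocessing)
# run first regardless of the task argument (see the TODO above):
#
#     repro(task="bugginess", force=True)
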
    # (tail of train_label_model: save the label model weights, then evaluate
    # on each of the task's test sets)
    df = pd.DataFrame(
        label_model.mu.cpu().detach().numpy().reshape(-1, 4),
        columns=["00", "01", "10", "11"],
        index=lines_train.columns,
    )
    df.to_csv(label_model_weights_file, index_label="heuristic_name")
    stats = {}
    for test_set_name, test_set in task._test_datasets.items():
        df = test_set.load()
        if task.label_column_name not in df.columns:
            raise GroundTruthColumnNotFound(
                f"Dataset {test_set_name} is added as a test set to the {task.name} task.\n"
                f"However, the column with ground-truth labels '{task.label_column_name}' was not found."
            )
        stats.update(
            calculate_metrics(
                label_model,
                test_set_name,
                df[task.label_column_name].values,
                save_to=task_dir_generated,
            )
        )
    return stats


if __name__ == "__main__":
    bohr_repo = load_bohr_repo()
    task = bohr_repo.tasks["bugginess"]
    dataset = bohr_repo.datasets["berger"]
    train_label_model(task, dataset)