Example 1
def add_all_tasks_to_dvc_pipeline(
        bohr_repo: Optional[BohrRepo] = None,
        path_config: Optional[PathConfig] = None) -> None:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)

    create_directories_if_necessary(bohr_repo)
    all_tasks = sorted(bohr_repo.tasks.values(), key=lambda x: x.name)
    logger.info(
        f"The following tasks are added to the pipeline: {list(map(lambda x: x.name, all_tasks))}"
    )

    all_keys = set()
    for keys in map(lambda t: t.datasets.keys(), all_tasks):
        all_keys.update(keys)
    all_datasets_used_in_tasks = list(
        map(lambda key: bohr_repo.datasets[key], all_keys))
    logger.info(f"Datasets used in tasks:")
    for dataset in all_datasets_used_in_tasks:
        linked_datasets = dataset.get_linked_datasets()
        logger.info(
            f"{dataset.name} {'-> ' + str(list(map(lambda d: d.name, linked_datasets))) if linked_datasets else ''}"
        )
    transient_stages = []
    commands: List[DvcCommand] = []
    for dataset_name, dataset in bohr_repo.datasets.items():
        if dataset.preprocessor == "copy":
            copy_command = PreprocessCopyCommand(path_config, dataset)
            commands.append(copy_command)
            transient_stages.append(copy_command.get_name())
        elif dataset.preprocessor == "7z":
            extract_command = Preprocess7zCommand(path_config, dataset)
            commands.append(extract_command)
            transient_stages.append(extract_command.get_name())
        else:
            commands.append(PreprocessShellCommand(path_config, dataset))
    commands.append(ParseLabelsCommand(path_config))
    for task in all_tasks:
        for heuristic_group in task.heuristic_groups:
            for dataset_name, dataset in task.datasets.items():
                datasets = [dataset_name] + list(
                    map(lambda d: d.name, dataset.get_linked_datasets()))
                commands.append(
                    ApplyHeuristicsCommand(path_config, task, heuristic_group,
                                           datasets))
        commands.append(CombineHeuristicsCommand(path_config, task))
        commands.append(TrainLabelModelCommand(path_config, task))
        for dataset_name in task.datasets:
            commands.append(
                LabelDatasetCommand(path_config, task, dataset_name))
    if path_config.manual_stages.exists():
        root, dirs, files = next(os.walk(path_config.manual_stages))
        for file in files:
            commands.append(ManualCommand(path_config, Path(root) / file))
    for command in commands:
        completed_process = command.run()
        if completed_process.returncode != 0:
            print(completed_process.stderr.decode())
            break
    save_transient_stages_to_config(transient_stages, path_config)
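
A minimal usage sketch for the function above; the import path is an assumption, since only the function body is shown here.

# Hypothetical usage sketch: rebuild the DVC pipeline for the current
# BOHR project (the import path below is assumed, not shown in the source).
from bohr.api import add_all_tasks_to_dvc_pipeline

# With no arguments, PathConfig and BohrRepo are loaded from the
# current project root.
add_all_tasks_to_dvc_pipeline()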
Example 2
def status(bohr_repo: Optional[BohrRepo] = None,
           path_config: Optional[PathConfig] = None) -> str:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    bohr_repo.dump(path_config.project_root)
    refresh_if_necessary(path_config)
    return dvc.status(path_config)
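
A one-line sketch: with no arguments, status operates on the current project and returns DVC's status output as a string.

# Hypothetical usage sketch: print the DVC status of the current project.
print(status())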
Example 3
def label_dataset(task: str, dataset: str, debug: bool):
    from bohr.pipeline.label_dataset import label_dataset

    setup_loggers()
    bohr_repo = load_bohr_repo()
    task = bohr_repo.tasks[task]
    dataset = bohr_repo.datasets[dataset]
    label_dataset(task, dataset, debug=debug)
Example 4
def add(
    path: Path,
    artifact: str,
    name: Optional[str] = None,
    author: Optional[str] = None,
    description: Optional[str] = "",
    format: Optional[str] = None,
    preprocessor: Optional[str] = None,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> Dataset:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    destination_path = path_config.downloaded_data / path.name
    logger.info(f"Copying {path.name} to {destination_path} ...")
    shutil.copy(path, destination_path)
    dvc_output = dvc.add(destination_path, path_config.project_root)
    logger.info(dvc_output)
    file_name = path.name
    if preprocessor is None:
        file_name, preprocessor = extract_preprocessor_from_file_name(
            file_name)
    if format is None:
        file_name, format = extract_format_from_file_name(file_name)
    dataset_name = name or file_name
    if dataset_name in bohr_repo.datasets:
        message = f"Dataset with name {dataset_name} already exists."
        if name is None:
            message += (
                "\nAre you trying to add the same dataset twice?\n"
                "If not, please specifying the `name` parameter explicitly.")
        raise ValueError(message)
    try:
        mapper = default_mappers[artifact_map[artifact]]
    except KeyError:
        mapper = load_class_by_full_path(artifact)
    path_preprocessed: RelativePath = get_preprocessed_path(
        None,
        relative_to_safe(destination_path, path_config.downloaded_data),
        path_config.data_dir,
        preprocessor,
    )
    dataset = Dataset(
        dataset_name,
        author,
        description,
        path_preprocessed=path_preprocessed,
        path_dist=path_config.downloaded_data_dir / path.name,
        dataloader=CsvDatasetLoader(path_preprocessed, mapper()),
        preprocessor=preprocessor,
    )
    bohr_repo.datasets[dataset.name] = dataset
    bohr_repo.dump(path_config.project_root)
    repro(bohr_repo=bohr_repo, path_config=path_config)
    return dataset
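
A usage sketch for add above; the file name, artifact key, and author are made-up illustrations. When preprocessor and format are omitted, they are inferred from the file name.

# Hypothetical usage sketch: register a local archive as a dataset.
# File name, artifact key, and author are illustrative assumptions.
from pathlib import Path

dataset = add(
    Path("commits.csv.7z"),  # preprocessor ("7z") and format ("csv") inferred
    artifact="commit",       # key looked up in artifact_map
    author="jane.doe",
    description="example commit dataset",
)
print(dataset.name)  # defaults to the file name with extensions stripped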
Example 5
def create_directories_if_necessary(
        bohr_repo: Optional[BohrRepo] = None) -> None:
    bohr_repo = bohr_repo or load_bohr_repo()
    path_config = PathConfig.load()
    for task in bohr_repo.tasks.values():
        for heuristic_group in task.heuristic_groups:
            (path_config.generated / task.name / heuristic_group).mkdir(
                exist_ok=True, parents=True)
            (path_config.metrics / task.name / heuristic_group).mkdir(
                exist_ok=True, parents=True)
    path_config.labeled_data.mkdir(exist_ok=True, parents=True)
Example 6
def add(
    name: str,
    artifact: str,
    labels: str,
    label_column: str,
    authors: str,
    description: str,
    use_all_datasets: bool,
    repro: bool,
    force: bool,
    verbose: bool,
) -> None:
    with verbosity(verbose):
        project_root = find_project_root()
        bohr_repo = load_bohr_repo(project_root)
        path_config = PathConfig.load(project_root)
        if name in bohr_repo.tasks and not force:
            logger.error(f"Task {name} is already defined")
            exit(400)
        try:
            artifact_type = artifact_map[artifact]
        except KeyError:
            logger.error(f"Artifact not found: {artifact}")
            exit(404)
        label_list = list(map(lambda s: s.strip(), labels.split(",")))
        if not use_all_datasets:
            train_datasets, test_datasets = {}, {}
        else:
            all_datasets = {
                n: d
                for n, d in bohr_repo.datasets.items()
                if d.artifact_type == artifact_type
            }
            train_datasets, test_datasets = train_and_test(all_datasets, label_column)
        heuristic_groups = get_heuristic_module_list(
            artifact_type, path_config.heuristics
        )
        task = Task(
            name,
            authors,
            description,
            artifact_type,
            label_list,
            train_datasets,
            test_datasets,
            label_column,
            heuristic_groups,
        )
        bohr_repo.tasks[name] = task
        bohr_repo.dump(project_root)
        if repro:
            logger.info("Re-running the pipeline ...")
            api.repro(name, bohr_repo=bohr_repo)
Example 7
def train_label_model(task: str, target_dataset: str):
    from bohr.pipeline.train_label_model import train_label_model

    setup_loggers()
    bohr_repo = load_bohr_repo()
    path_config = PathConfig.load()
    task = bohr_repo.tasks[task]
    target_dataset = bohr_repo.datasets[target_dataset]
    stats = train_label_model(task, target_dataset, path_config)
    with open(path_config.metrics / task.name / "label_model_metrics.json",
              "w") as f:
        json.dump(stats, f)
    pprint(stats)
Example 8
def add_dataset(task: str, dataset: str, repro: bool) -> None:
    bohr_repo = load_bohr_repo()
    if task not in bohr_repo.tasks:
        logger.error(f"Task {task} is not defined")
        exit(404)
    if dataset not in bohr_repo.datasets:
        logger.error(f"Dataset {dataset} is not defined")
        exit(404)
    dataset = api.add_dataset(
        bohr_repo.tasks[task], bohr_repo.datasets[dataset], bohr_repo
    )
    print(f"Dataset {dataset} is added to the task {task}.")
    if repro:
        logger.info("Re-running the pipeline ...")
        api.repro(task, bohr_repo=bohr_repo)
Example 9
def apply_heuristics(task: str, heuristic_group: Optional[str],
                     dataset: Optional[str], profile: bool):
    from bohr.pipeline.apply_heuristics import apply_heuristics
    from bohr.pipeline.combine_heuristics import combine_applied_heuristics

    setup_loggers()
    bohr_repo = load_bohr_repo()

    task = bohr_repo.tasks[task]
    if heuristic_group:
        with Profiler(enabled=profile):
            dataset = bohr_repo.datasets[dataset]
            apply_heuristics(task, heuristic_group, dataset)
    else:
        combine_applied_heuristics(task)
Example 10
def add_dataset(
    task: Task,
    dataset: Dataset,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> Dataset:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)

    is_test_set = dataset.is_column_present(task.label_column_name)
    logger.info(
        f'Adding dataset {dataset.name} as a {"test" if is_test_set else "train"} set'
    )
    task.add_dataset(dataset, is_test_set)
    bohr_repo.dump(path_config.project_root)
    return dataset
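
A usage sketch for the API-level add_dataset; the task and dataset names reuse those from the __main__ block in Example 14.

# Hypothetical usage sketch: attach an existing dataset to a task.
bohr_repo = load_bohr_repo()
task = bohr_repo.tasks["bugginess"]
dataset = bohr_repo.datasets["berger"]
# Added as a test set if the dataset contains the task's label column,
# otherwise as a train set; the updated repo config is dumped to disk.
add_dataset(task, dataset, bohr_repo=bohr_repo)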
Example 11
def pull(
    task: str,
    target: str,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> RelativePath:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)

    path = path_config.labeled_data_dir / task / f"{target}.labeled.csv"
    if path.exists():
        logger.info(dvc.pull([str(path)]))
        return path
    else:
        raise BohrDatasetNotFound(
            f"Dataset {target} in task {task} not found! Available datasets in this task: {list(bohr_repo.tasks[task].datasets.keys())}"
        )
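
A sketch of calling pull and handling a missing target; the task and dataset names are assumptions, and BohrDatasetNotFound is the exception raised above (its import path is not shown in the source).

# Hypothetical usage sketch: pull labeled data via DVC, handling the
# case where the target dataset is unknown for the task.
try:
    path = pull("bugginess", "berger")
    print(f"Labeled data available at {path}")
except BohrDatasetNotFound as e:
    print(e)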
Example 12
def ls(task: Optional[str], extended_list: bool) -> None:
    bohr_repo = load_bohr_repo()
    if task:
        if task not in bohr_repo.tasks:
            logger.error(f"Task not found in the config: {task}. \n"
                         f"Defined tasks: {list(bohr_repo.tasks.keys())}")
            exit(404)
        datasets = bohr_repo.tasks[task].datasets
    else:
        datasets = bohr_repo.datasets
    if extended_list:
        print(
            tabulate(
                [[dataset_name,
                  textwrap.fill(dataset.description)]
                 for dataset_name, dataset in datasets.items()],
                tablefmt="fancy_grid",
            ))
    else:
        for dataset in datasets:
            print(dataset)
Example 13
def repro(
    task: Optional[str] = None,
    only_transient: bool = False,
    force: bool = False,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> None:
    """
    # >>> import tempfile
    # >>> with tempfile.TemporaryDirectory() as tmpdirname:
    # ...     with open(Path(tmpdirname) / 'bohr.json', 'w') as f:
    # ...         print(f.write('{"bohr_framework_version": "0.3.9-rc", "tasks": {}, "datasets": {}}'))
    # ...     get_dvc_commands_to_repro(None, False, load_config(Path(tmpdirname)))
    """
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)

    refresh_if_necessary(path_config)

    paths_to_pull = [str(d.path_dist) for d in bohr_repo.datasets.values()]
    if len(paths_to_pull) > 0:
        logger.info(dvc.pull(paths_to_pull))

    # TODO run only task-related transient stages if task is passed:
    transient_stages = load_transient_stages(path_config)
    if len(transient_stages) > 0:
        logger.info(
            dvc.repro(transient_stages, force=force, path_config=path_config))

    if not only_transient:
        glob = None
        if task:
            if task not in bohr_repo.tasks:
                raise ValueError(f"Task {task} not found in bohr.json")
            glob = f"{task}_*"
        logger.info(
            dvc.repro(pull=True,
                      glob=glob,
                      force=force,
                      path_config=path_config))
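
Two ways repro is typically invoked, sketched from its signature above; the task name is an assumption.

# Hypothetical usage sketches based on the signature above.
repro()                              # reproduce all stages of the pipeline
repro(task="bugginess", force=True)  # force re-run of one task's stages only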
Example 14
    # The DataFrame below stores the learned label-model weight matrix,
    # one row per heuristic; the opening line of the call is reconstructed
    # from context, since the snippet starts mid-function.
    df = pd.DataFrame(
        label_model.mu.cpu().detach().numpy().reshape(-1, 4),
        columns=["00", "01", "10", "11"],
        index=lines_train.columns,
    )
    df.to_csv(label_model_weights_file, index_label="heuristic_name")

    stats = {}
    for test_set_name, test_set in task._test_datasets.items():
        df = test_set.load()
        if task.label_column_name not in df.columns:
            raise GroundTruthColumnNotFound(
                f"Dataset {test_set_name} is added as a test set to the {task.name} task.\n"
                f"However, the column with ground-truth labels '{task.label_column_name}' was not found."
            )
        stats.update(
            calculate_metrics(
                label_model,
                test_set_name,
                df[task.label_column_name].values,
                save_to=task_dir_generated,
            ))

    return stats


if __name__ == "__main__":
    bohr_repo = load_bohr_repo()
    task = bohr_repo.tasks["bugginess"]
    dataset = bohr_repo.datasets["berger"]
    train_label_model(task, dataset)