def add_all_tasks_to_dvc_pipeline(
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> None:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    create_directories_if_necessary(bohr_repo)
    all_tasks = sorted(bohr_repo.tasks.values(), key=lambda x: x.name)
    logger.info(
        f"The following tasks are added to the pipeline: {list(map(lambda x: x.name, all_tasks))}"
    )
    all_keys = set()
    for keys in map(lambda t: t.datasets.keys(), all_tasks):
        all_keys.update(keys)
    all_datasets_used_in_tasks = list(map(lambda key: bohr_repo.datasets[key], all_keys))
    logger.info("Datasets used in tasks:")
    for dataset in all_datasets_used_in_tasks:
        linked_datasets = dataset.get_linked_datasets()
        logger.info(
            f"{dataset.name} {'-> ' + str(list(map(lambda d: d.name, linked_datasets))) if linked_datasets else ''}"
        )
    # "copy" and "7z" preprocessing stages are recorded as transient stages,
    # which repro() re-runs separately before the rest of the pipeline.
    transient_stages = []
    commands: List[DvcCommand] = []
    for dataset_name, dataset in bohr_repo.datasets.items():
        if dataset.preprocessor == "copy":
            copy_command = PreprocessCopyCommand(path_config, dataset)
            commands.append(copy_command)
            transient_stages.append(copy_command.get_name())
        elif dataset.preprocessor == "7z":
            extract_command = Preprocess7zCommand(path_config, dataset)
            commands.append(extract_command)
            transient_stages.append(extract_command.get_name())
        else:
            commands.append(PreprocessShellCommand(path_config, dataset))
    commands.append(ParseLabelsCommand(path_config))
    for task in all_tasks:
        for heuristic_group in task.heuristic_groups:
            for dataset_name, dataset in task.datasets.items():
                datasets = [dataset_name] + list(
                    map(lambda d: d.name, dataset.get_linked_datasets())
                )
                commands.append(
                    ApplyHeuristicsCommand(path_config, task, heuristic_group, datasets)
                )
        commands.append(CombineHeuristicsCommand(path_config, task))
        commands.append(TrainLabelModelCommand(path_config, task))
        for dataset_name in task.datasets:
            commands.append(LabelDatasetCommand(path_config, task, dataset_name))
    if path_config.manual_stages.exists():
        root, dirs, files = next(os.walk(path_config.manual_stages))
        for file in files:
            commands.append(ManualCommand(path_config, Path(root) / file))
    # Stop at the first command that fails.
    for command in commands:
        completed_process = command.run()
        if completed_process.returncode != 0:
            print(completed_process.stderr.decode())
            break
    save_transient_stages_to_config(transient_stages, path_config)
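# A minimal usage sketch (using the loaders already used in this module):
# rebuild the DVC pipeline after editing bohr.json. Calling with no arguments
# falls back to the same defaults that the function resolves internally.
#
#     path_config = PathConfig.load()
#     bohr_repo = load_bohr_repo(path_config.project_root)
#     add_all_tasks_to_dvc_pipeline(bohr_repo, path_config)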
def _load_output_matrix_and_weights(
    task_name: str,
    labeled_dataset: str,
    rev: Optional[str] = None,
    force_update: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    path_config = PathConfig.load()
    logging.disable(logging.WARNING)
    repo = (
        get_path_to_revision(path_config.project_root, rev, force_update=force_update)
        if rev is not None
        else None
    )
    with dvc.api.open(
        path_config.generated_dir / task_name / f"heuristic_matrix_{labeled_dataset}.pkl",
        repo,
        mode="rb",
    ) as f:
        matrix = pd.read_pickle(BytesIO(f.read()))
    with dvc.api.open(
        path_config.generated_dir / task_name / "label_model_weights.csv", repo
    ) as f:
        weights = pd.read_csv(f, index_col="heuristic_name")
    logging.disable(logging.NOTSET)
    return matrix, weights
def status(
    bohr_repo: Optional[BohrRepo] = None, path_config: Optional[PathConfig] = None
) -> str:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    bohr_repo.dump(path_config.project_root)
    refresh_if_necessary(path_config)
    return dvc.status(path_config)
def apply_heuristics(
    task: Task,
    heuristic_group: str,
    dataset: Dataset,
    path_config: Optional[PathConfig] = None,
) -> None:
    path_config = path_config or PathConfig.load()
    task_dir_generated, task_dir_metrics = create_dirs_if_necessary(
        task, path_config, heuristic_group=heuristic_group
    )
    heuristics = load_heuristics_from_module(task.top_artifact, heuristic_group)
    if not heuristics:
        raise ValueError(f"Heuristics not found for artifact: {task.top_artifact}")
    save_to_matrix = task_dir_generated / f"heuristic_matrix_{dataset.name}.pkl"
    save_to_metrics = task_dir_metrics / f"heuristic_metrics_{dataset.name}.json"
    labeling_functions = to_labeling_functions(heuristics, dataset.mapper, task.labels)
    artifact_df = dataset.load()
    apply_lf_matrix = apply_lfs_to_dataset(
        labeling_functions, artifact_df=artifact_df, save_to=save_to_matrix
    )
    label_series = (
        artifact_df[task.label_column_name]
        if task.label_column_name in artifact_df.columns
        else None
    )
    calculate_metrics(
        apply_lf_matrix, labeling_functions, label_series, save_to=save_to_metrics
    )
def load_transient_stages(path_config: Optional[PathConfig] = None) -> List[str]:
    path_config = path_config or PathConfig.load()
    transient_stages_file = path_config.project_root / ".bohr" / "transient_stages.json"
    if not transient_stages_file.exists():
        return []
    with transient_stages_file.open() as f:
        return json.load(f)
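# Round-trip sketch (the stage name is illustrative): save_transient_stages_to_config
# below writes the .bohr/transient_stages.json file that this function reads back,
# so in a project root with a writable .bohr directory:
#
#     save_transient_stages_to_config(["preprocess_copy_stage"], path_config)
#     assert load_transient_stages(path_config) == ["preprocess_copy_stage"]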
def dump(self, project_root: AbsolutePath) -> None:
    with open(project_root / "bohr.json", "w") as f:
        f.write(
            json.dumps(
                jsons.dump(self, data_dir=PathConfig.load(project_root).data_dir),
                indent=2,
            )
        )
def load(project_root: Optional[AbsolutePath] = None) -> "AppConfig":
    project_root = project_root or find_project_root()
    config_dict = load_config_dict_from_file(project_root)
    try:
        verbose_str = config_dict["core"]["verbose"]
        verbose = verbose_str == "true" or verbose_str == "True"
    except KeyError:
        verbose = False
    return AppConfig(verbose, PathConfig.load())
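# The "core"/"verbose" lookup above assumes load_config_dict_from_file returns
# a two-level mapping; the exact on-disk format is whatever that loader parses.
# A sketch of the expected shape:
#
#     {"core": {"verbose": "true"}}  ->  AppConfig(verbose=True, ...)
#     {} or {"core": {}}             ->  AppConfig(verbose=False, ...)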
def refresh_if_necessary(path_config: Optional[PathConfig] = None) -> None:
    path_config = path_config or PathConfig.load()
    if not bohr_up_to_date(path_config):
        logger.info("There are changes to the bohr config. Refreshing the workspace...")
        refresh(path_config)
    else:
        logger.info("Bohr config hasn't changed.")
def combine_applied_heuristics(
    task: Task, path_config: Optional[PathConfig] = None
) -> None:
    path_config = path_config or PathConfig.load()
    task_dir_generated = path_config.generated / task.name
    for dataset_name, dataset in task.datasets.items():
        all_heuristics_file = task_dir_generated / f"heuristic_matrix_{dataset_name}.pkl"
        matrix_list = []
        all_heuristics = []
        for heuristic_module_path in task.heuristic_groups:
            partial_heuristics_file = (
                task_dir_generated
                / heuristic_module_path
                / f"heuristic_matrix_{dataset_name}.pkl"
            )
            matrix = pd.read_pickle(str(partial_heuristics_file))
            matrix_list.append(matrix)
            heuristics = load_heuristics_from_module(
                task.top_artifact, heuristic_module_path
            )
            all_heuristics.extend(heuristics)
        labeling_functions = to_labeling_functions(
            all_heuristics, dataset.mapper, task.labels
        )
        all_heuristics_matrix = pd.concat(matrix_list, axis=1)
        if sum(all_heuristics_matrix.columns.duplicated()) != 0:
            # Find and report the first duplicated heuristic by name.
            s = set()
            for c in all_heuristics_matrix.columns:
                if c in s:
                    raise ValueError(f"Duplicate heuristics are present: {c}")
                s.add(c)
            # Unreachable: duplicated() found a duplicate, so the loop must raise.
            raise AssertionError()
        all_heuristics_matrix.to_pickle(str(all_heuristics_file))

        artifact_df = dataset.load()
        label_series = (
            artifact_df[task.label_column_name]
            if task.label_column_name in artifact_df.columns
            else None
        )
        save_csv_to = path_config.generated / task.name / f"analysis_{dataset_name}.csv"
        save_json_to = path_config.metrics / task.name / f"analysis_{dataset_name}.json"
        save_metrics_to = (
            path_config.metrics / task.name / f"heuristic_metrics_{dataset_name}.json"
        )
        run_analysis(
            all_heuristics_matrix.to_numpy(),
            labeling_functions,
            save_csv_to,
            save_json_to,
            label_series,
        )
        stats = calculate_metrics(
            all_heuristics_matrix.to_numpy(),
            labeling_functions,
            label_series,
            save_to=save_metrics_to,
        )
        pprint(stats)
def save_transient_stages_to_config(
    transient_stages: List[str], path_config: Optional[PathConfig] = None
) -> None:
    path_config = path_config or PathConfig.load()
    conf_dir = path_config.project_root / ".bohr"
    if not conf_dir.exists():
        conf_dir.mkdir()
    transient_stages_file = conf_dir / "transient_stages.json"
    with transient_stages_file.open("w") as f:
        json.dump(transient_stages, f)
def add(
    path: Path,
    artifact: str,
    name: Optional[str] = None,
    author: Optional[str] = None,
    description: Optional[str] = "",
    format: Optional[str] = None,
    preprocessor: Optional[str] = None,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> Dataset:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    destination_path = path_config.downloaded_data / path.name
    logger.info(f"Copying {path.name} to {destination_path} ...")
    shutil.copy(path, destination_path)
    dvc_output = dvc.add(destination_path, path_config.project_root)
    logger.info(dvc_output)
    file_name = path.name
    if preprocessor is None:
        file_name, preprocessor = extract_preprocessor_from_file_name(file_name)
    if format is None:
        file_name, format = extract_format_from_file_name(file_name)
    dataset_name = name or file_name
    if dataset_name in bohr_repo.datasets:
        message = f"Dataset with name {dataset_name} already exists."
        if name is None:
            message += (
                "\nAre you trying to add the same dataset twice?\n"
                "If not, please specify the `name` parameter explicitly."
            )
        raise ValueError(message)
    try:
        mapper = default_mappers[artifact_map[artifact]]
    except KeyError:
        mapper = load_class_by_full_path(artifact)
    path_preprocessed: RelativePath = get_preprocessed_path(
        None,
        relative_to_safe(destination_path, path_config.downloaded_data),
        path_config.data_dir,
        preprocessor,
    )
    dataset = Dataset(
        dataset_name,
        author,
        description,
        path_preprocessed=path_preprocessed,
        path_dist=path_config.downloaded_data_dir / path.name,
        dataloader=CsvDatasetLoader(path_preprocessed, mapper()),
        preprocessor=preprocessor,
    )
    bohr_repo.datasets[dataset.name] = dataset
    bohr_repo.dump(path_config.project_root)
    repro(bohr_repo=bohr_repo, path_config=path_config)
    return dataset
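# Usage sketch (path, author, and artifact key are illustrative): register a
# local CSV as a dataset. Name, format, and preprocessor are inferred from the
# file name unless given explicitly.
#
#     dataset = add(
#         path=Path("commits.csv"),
#         artifact="commit",
#         author="jane",
#         description="commits mined from project X",
#     )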
def create_directories_if_necessary(bohr_repo: Optional[BohrRepo] = None) -> None:
    bohr_repo = bohr_repo or load_bohr_repo()
    path_config = PathConfig.load()
    for task in bohr_repo.tasks.values():
        for heuristic_group in task.heuristic_groups:
            (path_config.generated / task.name / heuristic_group).mkdir(
                exist_ok=True, parents=True
            )
            (path_config.metrics / task.name / heuristic_group).mkdir(
                exist_ok=True, parents=True
            )
    path_config.labeled_data.mkdir(exist_ok=True, parents=True)
def pull(task: str, target: str, verbose: bool = False):
    try:
        with verbosity(verbose):
            path_config = PathConfig.load()
            refresh_if_necessary(path_config)
            path = api.pull(task, target, path_config=path_config)
            logger.info(f"The dataset is available at {path_config.project_root / path}")
    except BohrDatasetNotFound as ex:
        logger.error(ex, exc_info=logger.getEffectiveLevel() == logging.DEBUG)
        exit(404)
def parse_labels(path_config: Optional[PathConfig] = None) -> None:
    path_config = path_config or PathConfig.load()
    label_tree_list = load_label_tree(path_config.labels)

    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader(Path(__file__).parent.parent))
    template = env.get_template("resources/labels.template")
    s = template.render(
        hierarchies=[l for label_tree in label_tree_list for l in label_tree.flatten()]
    )
    with open("labels.py", "w") as f:
        f.write(s)
def add(
    name: str,
    artifact: str,
    labels: str,
    label_column: str,
    authors: str,
    description: str,
    use_all_datasets: bool,
    repro: bool,
    force: bool,
    verbose: bool,
) -> None:
    with verbosity(verbose):
        project_root = find_project_root()
        bohr_repo = load_bohr_repo(project_root)
        path_config = PathConfig.load(project_root)
        if name in bohr_repo.tasks and not force:
            logger.error(f"Task {name} is already defined")
            exit(400)
        try:
            artifact_type = artifact_map[artifact]
        except KeyError:
            logger.error(f"Artifact not found: {artifact}")
            exit(404)
        label_list = list(map(lambda s: s.strip(), labels.split(",")))
        if not use_all_datasets:
            train_datasets, test_datasets = {}, {}
        else:
            all_datasets = {
                n: d
                for n, d in bohr_repo.datasets.items()
                if d.artifact_type == artifact_type
            }
            train_datasets, test_datasets = train_and_test(all_datasets, label_column)
        heuristic_groups = get_heuristic_module_list(
            artifact_type, path_config.heuristics
        )
        task = Task(
            name,
            authors,
            description,
            artifact_type,
            label_list,
            train_datasets,
            test_datasets,
            label_column,
            heuristic_groups,
        )
        bohr_repo.tasks[name] = task
        bohr_repo.dump(project_root)
        if repro:
            logger.info("Re-running the pipeline ...")
            api.repro(name, bohr_repo=bohr_repo)
def train_label_model(task: str, target_dataset: str):
    from bohr.pipeline.train_label_model import train_label_model

    setup_loggers()
    bohr_repo = load_bohr_repo()
    path_config = PathConfig.load()
    task = bohr_repo.tasks[task]
    target_dataset = bohr_repo.datasets[target_dataset]
    stats = train_label_model(task, target_dataset, path_config)
    with open(path_config.metrics / task.name / "label_model_metrics.json", "w") as f:
        json.dump(stats, f)
    pprint(stats)
def __init__(
    self,
    task: str,
    labeled_dataset_name: str,
    rev: Optional[str] = "master",
    force_update: bool = False,
):
    path_config = PathConfig.load()
    path_to_old_revision = get_path_to_revision(
        path_config.project_root, rev, force_update
    )
    self.labeled_dataset_name = labeled_dataset_name
    logging.disable(logging.WARNING)
    labeled_dataset_path = (
        path_config.labeled_data_dir / task / f"{labeled_dataset_name}.labeled.csv"
    )
    with dvc.api.open(labeled_dataset_path, path_to_old_revision) as f:
        old_df = pd.read_csv(f)
    with dvc.api.open(labeled_dataset_path) as f:
        new_df = pd.read_csv(f)
    logging.disable(logging.NOTSET)
    self.is_test_set = "bug" in old_df.columns
    old_df_columns = ["prob_CommitLabel.BugFix"]
    if self.is_test_set:
        old_df_columns.append("bug")
    self.combined_df = pd.concat(
        [
            old_df[old_df_columns],
            new_df["prob_CommitLabel.BugFix"].rename("prob_CommitLabel.BugFix_new"),
        ],
        axis=1,
    )
    if self.is_test_set:
        # Positive when the new probability moved toward the ground-truth label.
        self.combined_df.loc[:, "improvement"] = (
            self.combined_df["prob_CommitLabel.BugFix_new"]
            - self.combined_df["prob_CommitLabel.BugFix"]
        ) * (self.combined_df["bug"] * 2 - 1)
    self.combined_df.loc[:, "certainty"] = (
        np.abs(self.combined_df["prob_CommitLabel.BugFix_new"] - 0.5) * 2
    )
    if self.is_test_set:
        self.combined_df.loc[:, "precision"] = 1 - np.abs(
            self.combined_df["prob_CommitLabel.BugFix_new"] - self.combined_df["bug"]
        )
    self.combined_df = pd.concat([self.combined_df, old_df["message"]], axis=1)
    if "url" in old_df.columns:
        self.combined_df["url"] = old_df["url"]
def deserialize_bohr_repo(
    dct, cls, path_config: Optional[PathConfig] = None, **kwargs
) -> BohrRepo:
    """
    >>> jsons.loads('{"bohr_framework_version": 0.1, "tasks": {}, "datasets": {}, "dataset-linkers": {}}',
    ...             BohrRepo, path_config={'project_root': '/'})
    BohrRepo(bohr_framework_version=0.1, tasks={}, datasets={}, linkers=[])
    """
    path_config = path_config or PathConfig.load()
    datasets: Dict[str, Dataset] = {}
    for dataset_name, dataset_object in dct["datasets"].items():
        datasets[dataset_name] = jsons.load(
            dataset_object,
            Dataset,
            dataset_name=dataset_name,
            downloaded_data_dir=path_config.downloaded_data_dir,
            data_dir=path_config.data_dir,
        )
    linkers = [
        jsons.load(
            dataset_linker_obj,
            DatasetLinker,
            datasets=datasets,
            data_dir=path_config.data_dir,
        )
        for dataset_linker_obj in dct["dataset-linkers"]
    ]
    # Reset the linkers of every dataset, then attach the linker list to the
    # mapper of each dataset that is the source of a linker.
    for dataset_name, dataset in datasets.items():
        dataset.mapper.linkers = []
    for linker in linkers:
        linker.from_.mapper.linkers = linkers
    tasks = dict()
    for task_name, task_json in dct["tasks"].items():
        tasks[task_name] = jsons.load(
            task_json,
            Task,
            task_name=task_name,
            heuristic_path=path_config.heuristics,
            datasets=datasets,
        )
    return BohrRepo(
        dct["bohr_framework_version"],
        tasks,
        datasets,
        linkers,
    )
def add_dataset(
    task: Task,
    dataset: Dataset,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> Dataset:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    is_test_set = dataset.is_column_present(task.label_column_name)
    logger.info(
        f'Adding dataset {dataset.name} as a {"test" if is_test_set else "train"} set'
    )
    task.add_dataset(dataset, is_test_set)
    bohr_repo.dump(path_config.project_root)
    return dataset
def pull(
    task: str,
    target: str,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> RelativePath:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    path = path_config.labeled_data_dir / task / f"{target}.labeled.csv"
    if path.exists():
        logger.info(dvc.pull([str(path)]))
        return path
    else:
        raise BohrDatasetNotFound(
            f"Dataset {target} in task {task} not found! "
            f"Available datasets in this task: {list(bohr_repo.tasks[task].datasets.keys())}"
        )
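# Usage sketch (task and dataset names taken from the examples elsewhere in
# this repo): fetch the labeled output of a task. The returned path is
# relative to the project root, as in the CLI wrapper above.
#
#     path = pull("bugginess", "1151-commits")
#     df = pd.read_csv(path_config.project_root / path)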
def train_label_model(
    task: Task, target_dataset: Dataset, path_config: Optional[PathConfig] = None
) -> Dict[str, Any]:
    path_config = path_config or PathConfig.load()
    task_dir_generated = path_config.generated / task.name
    if not task_dir_generated.exists():
        task_dir_generated.mkdir()
    lines_train = pd.read_pickle(
        str(task_dir_generated / f"heuristic_matrix_{target_dataset.name}.pkl")
    )
    label_model = fit_label_model(lines_train.to_numpy())
    label_model.save(str(task_dir_generated / "label_model.pkl"))
    label_model.eval()
    label_model_weights_file = (
        path_config.generated / task.name / "label_model_weights.csv"
    )
    # One row per heuristic; the four columns are the learned mu parameters.
    df = pd.DataFrame(
        label_model.mu.cpu().detach().numpy().reshape(-1, 4),
        columns=["00", "01", "10", "11"],
        index=lines_train.columns,
    )
    df.to_csv(label_model_weights_file, index_label="heuristic_name")
    stats = {}
    for test_set_name, test_set in task._test_datasets.items():
        df = test_set.load()
        if task.label_column_name not in df.columns:
            raise GroundTruthColumnNotFound(
                f"Dataset {test_set_name} is added as a test set to the {task.name} task.\n"
                f"However, column with ground-truth labels '{task.label_column_name}' not found."
            )
        stats.update(
            calculate_metrics(
                label_model,
                test_set_name,
                df[task.label_column_name].values,
                save_to=task_dir_generated,
            )
        )
    return stats
def label_dataset(
    task: Task,
    dataset: Dataset,
    path_config: Optional[PathConfig] = None,
    debug: bool = False,
):
    path_config = path_config or PathConfig.load()
    applied_heuristics_df = pd.read_pickle(
        str(path_config.generated / task.name / f"heuristic_matrix_{dataset.name}.pkl")
    )
    label_model = LabelModel()
    label_model.load(str(path_config.generated / task.name / "label_model.pkl"))
    df = dataset.load()
    df_labeled = do_labeling(
        label_model, applied_heuristics_df.to_numpy(), df, task.labels
    )
    if debug:
        # Replace each vote with the heuristic's name (or "" for abstain, -1),
        # so that a per-row list of fired heuristics can be assembled.
        for (
            heuristic_name,
            applied_heuristic_series,
        ) in applied_heuristics_df.items():
            applied_heuristics_df[heuristic_name] = applied_heuristic_series.map(
                {0: heuristic_name, 1: heuristic_name, -1: ""}
            )
        col_lfs = applied_heuristics_df.apply(
            lambda row: ";".join([elm for elm in row if elm]), axis=1
        )
        df_labeled["lfs"] = col_lfs
    labeled_data_path = path_config.labeled_data / task.name
    if not labeled_data_path.exists():
        labeled_data_path.mkdir(parents=True)
    target_file = labeled_data_path / f"{dataset.name}.labeled.csv"
    df_labeled.to_csv(target_file, index=False)
    print(f"Labeled dataset has been written to {target_file}.")
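# Usage sketch (task and dataset names are illustrative): with debug=True,
# the written CSV gains an "lfs" column listing, per data point, the names of
# the heuristics that did not abstain.
#
#     task = bohr_repo.tasks["bugginess"]
#     dataset = bohr_repo.datasets["1151-commits"]
#     label_dataset(task, dataset, debug=True)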
def repro(
    task: Optional[str] = None,
    only_transient: bool = False,
    force: bool = False,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> None:
    """
    # >>> import tempfile
    # >>> with tempfile.TemporaryDirectory() as tmpdirname:
    # ...     with open(Path(tmpdirname) / 'bohr.json', 'w') as f:
    # ...         print(f.write('{"bohr_framework_version": "0.3.9-rc", "tasks": {}, "datasets": {}}'))
    # ...     get_dvc_commands_to_repro(None, False, load_config(Path(tmpdirname)))
    """
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    refresh_if_necessary(path_config)
    paths_to_pull = [str(d.path_dist) for d in bohr_repo.datasets.values()]
    if len(paths_to_pull) > 0:
        logger.info(dvc.pull(paths_to_pull))
    # TODO: run only task-related transient stages if task is passed
    transient_stages = load_transient_stages(path_config)
    if len(transient_stages) > 0:
        logger.info(dvc.repro(transient_stages, force=force, path_config=path_config))
    if not only_transient:
        glob = None
        if task:
            if task not in bohr_repo.tasks:
                raise ValueError(f"Task {task} not found in bohr.json")
            glob = f"{task}_*"
        logger.info(
            dvc.repro(pull=True, glob=glob, force=force, path_config=path_config)
        )
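# Usage sketches: reproduce the whole pipeline, or only the stages of one
# task (matched by the "<task>_*" stage-name glob), forcing re-execution:
#
#     repro()
#     repro(task="bugginess", force=True)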
import sys
from pathlib import Path

import pandas as pd

from bohr.config.pathconfig import PathConfig


def combine_labels(
    path_to_labeled_dataset: Path, path_to_transformer_labels: Path, output_path: Path
) -> None:
    labeled_dataset = pd.read_csv(path_to_labeled_dataset)
    transformer_labels = pd.read_csv(path_to_transformer_labels)["prediction"].rename(
        "transformer_preds"
    )
    combined = pd.concat([labeled_dataset, transformer_labels], axis=1)
    combined.to_csv(output_path)


if __name__ == "__main__":
    project_root = PathConfig.load().project_root
    combine_labels(
        project_root / Path(sys.argv[1]),
        project_root / Path(sys.argv[2]),
        project_root / Path(sys.argv[3]),
    )
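# Invocation sketch, assuming this script is saved as combine_labels.py;
# all three arguments are paths relative to the project root:
#
#     python combine_labels.py \
#         labeled-datasets/bugginess/commits.labeled.csv \
#         transformer-preds.csv \
#         combined.csv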
def refresh(path_config: Optional[PathConfig] = None) -> None:
    path_config = path_config or PathConfig.load()
    (path_config.project_root / "dvc.yaml").unlink(missing_ok=True)
    add_all_tasks_to_dvc_pipeline()
    update_lock(path_config)
def __init__(self):
    super().__init__()
    path_config = PathConfig.load()
    # Assumes exactly one RefactoringMiner distribution is present in software_path.
    refactoring_miner_dir = os.listdir(path_config.software_path)[0]
    logger.debug(f"Using RefactoringMiner version {refactoring_miner_dir}")
    self.path = path_config.software_path / refactoring_miner_dir / "bin"
if __name__ == "__main__":
    bohr_repo = load_bohr_repo()
    task = bohr_repo.tasks["bugginess"]
    dataset = bohr_repo.datasets["1151-commits"]
    apply_heuristics(
        task,
        "heuristics.bugginess.main_heurstics",
        dataset,
        PathConfig.load(),
    )