def add_all_tasks_to_dvc_pipeline(
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> None:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    create_directories_if_necessary(bohr_repo)
    all_tasks = sorted(bohr_repo.tasks.values(), key=lambda x: x.name)
    logger.info(
        f"The following tasks are added to the pipeline: {list(map(lambda x: x.name, all_tasks))}"
    )
    all_keys = set()
    for keys in map(lambda t: t.datasets.keys(), all_tasks):
        all_keys.update(keys)
    all_datasets_used_in_tasks = list(map(lambda key: bohr_repo.datasets[key], all_keys))
    logger.info("Datasets used in tasks:")
    for dataset in all_datasets_used_in_tasks:
        linked_datasets = dataset.get_linked_datasets()
        logger.info(
            f"{dataset.name} {'-> ' + str(list(map(lambda d: d.name, linked_datasets))) if linked_datasets else ''}"
        )
    # "copy" and "7z" preprocessing stages are recorded as transient stages,
    # which repro() re-runs separately before the rest of the pipeline.
    transient_stages = []
    commands: List[DvcCommand] = []
    for dataset_name, dataset in bohr_repo.datasets.items():
        if dataset.preprocessor == "copy":
            copy_command = PreprocessCopyCommand(path_config, dataset)
            commands.append(copy_command)
            transient_stages.append(copy_command.get_name())
        elif dataset.preprocessor == "7z":
            extract_command = Preprocess7zCommand(path_config, dataset)
            commands.append(extract_command)
            transient_stages.append(extract_command.get_name())
        else:
            commands.append(PreprocessShellCommand(path_config, dataset))
    commands.append(ParseLabelsCommand(path_config))
    for task in all_tasks:
        for heuristic_group in task.heuristic_groups:
            for dataset_name, dataset in task.datasets.items():
                datasets = [dataset_name] + list(
                    map(lambda d: d.name, dataset.get_linked_datasets())
                )
                commands.append(
                    ApplyHeuristicsCommand(path_config, task, heuristic_group, datasets)
                )
        commands.append(CombineHeuristicsCommand(path_config, task))
        commands.append(TrainLabelModelCommand(path_config, task))
        for dataset_name in task.datasets:
            commands.append(LabelDatasetCommand(path_config, task, dataset_name))
    if path_config.manual_stages.exists():
        root, dirs, files = next(os.walk(path_config.manual_stages))
        for file in files:
            commands.append(ManualCommand(path_config, Path(root) / file))
    # Stop at the first command that fails.
    for command in commands:
        completed_process = command.run()
        if completed_process.returncode != 0:
            print(completed_process.stderr.decode())
            break
    save_transient_stages_to_config(transient_stages, path_config)
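# A minimal usage sketch (using the loaders already used in this module):
# rebuild the DVC pipeline after editing bohr.json. Calling with no arguments
# falls back to the same defaults that the function resolves internally.
#
#     path_config = PathConfig.load()
#     bohr_repo = load_bohr_repo(path_config.project_root)
#     add_all_tasks_to_dvc_pipeline(bohr_repo, path_config)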
def _load_output_matrix_and_weights(
    task_name: str,
    labeled_dataset: str,
    rev: Optional[str] = None,
    force_update: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    path_config = PathConfig.load()
    logging.disable(logging.WARNING)
    repo = (
        get_path_to_revision(path_config.project_root, rev, force_update=force_update)
        if rev is not None
        else None
    )
    with dvc.api.open(
        path_config.generated_dir / task_name / f"heuristic_matrix_{labeled_dataset}.pkl",
        repo,
        mode="rb",
    ) as f:
        matrix = pd.read_pickle(BytesIO(f.read()))
    with dvc.api.open(
        path_config.generated_dir / task_name / "label_model_weights.csv", repo
    ) as f:
        weights = pd.read_csv(f, index_col="heuristic_name")
    logging.disable(logging.NOTSET)
    return matrix, weights
def status(
    bohr_repo: Optional[BohrRepo] = None, path_config: Optional[PathConfig] = None
) -> str:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    bohr_repo.dump(path_config.project_root)
    refresh_if_necessary(path_config)
    return dvc.status(path_config)
def apply_heuristics(
    task: Task,
    heuristic_group: str,
    dataset: Dataset,
    path_config: Optional[PathConfig] = None,
) -> None:
    path_config = path_config or PathConfig.load()
    task_dir_generated, task_dir_metrics = create_dirs_if_necessary(
        task, path_config, heuristic_group=heuristic_group
    )
    heuristics = load_heuristics_from_module(task.top_artifact, heuristic_group)
    if not heuristics:
        raise ValueError(f"Heuristics not found for artifact: {task.top_artifact}")
    save_to_matrix = task_dir_generated / f"heuristic_matrix_{dataset.name}.pkl"
    save_to_metrics = task_dir_metrics / f"heuristic_metrics_{dataset.name}.json"
    labeling_functions = to_labeling_functions(heuristics, dataset.mapper, task.labels)
    artifact_df = dataset.load()
    apply_lf_matrix = apply_lfs_to_dataset(
        labeling_functions, artifact_df=artifact_df, save_to=save_to_matrix
    )
    label_series = (
        artifact_df[task.label_column_name]
        if task.label_column_name in artifact_df.columns
        else None
    )
    calculate_metrics(
        apply_lf_matrix, labeling_functions, label_series, save_to=save_to_metrics
    )
def load_transient_stages(path_config: Optional[PathConfig] = None) -> List[str]:
    path_config = path_config or PathConfig.load()
    transient_stages_file = path_config.project_root / ".bohr" / "transient_stages.json"
    if not transient_stages_file.exists():
        return []
    with transient_stages_file.open() as f:
        return json.load(f)
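# Round-trip sketch (the stage name is illustrative): save_transient_stages_to_config
# below writes the .bohr/transient_stages.json file that this function reads back,
# so in a project root with a writable .bohr directory:
#
#     save_transient_stages_to_config(["preprocess_copy_stage"], path_config)
#     assert load_transient_stages(path_config) == ["preprocess_copy_stage"]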
def dump(self, project_root: AbsolutePath) -> None:
    with open(project_root / "bohr.json", "w") as f:
        f.write(
            json.dumps(
                jsons.dump(self, data_dir=PathConfig.load(project_root).data_dir),
                indent=2,
            )
        )
def load(project_root: Optional[AbsolutePath] = None) -> "AppConfig":
    project_root = project_root or find_project_root()
    config_dict = load_config_dict_from_file(project_root)
    try:
        verbose_str = config_dict["core"]["verbose"]
        verbose = verbose_str == "true" or verbose_str == "True"
    except KeyError:
        verbose = False
    return AppConfig(verbose, PathConfig.load())
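# The "core"/"verbose" lookup above assumes load_config_dict_from_file returns
# a two-level mapping; the exact on-disk format is whatever that loader parses.
# A sketch of the expected shape:
#
#     {"core": {"verbose": "true"}}  ->  AppConfig(verbose=True, ...)
#     {} or {"core": {}}             ->  AppConfig(verbose=False, ...)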
def refresh_if_necessary(path_config: Optional[PathConfig] = None) -> None:
    path_config = path_config or PathConfig.load()
    if not bohr_up_to_date(path_config):
        logger.info("There are changes to the bohr config. Refreshing the workspace...")
        refresh(path_config)
    else:
        logger.info("Bohr config hasn't changed.")
def combine_applied_heuristics(
    task: Task, path_config: Optional[PathConfig] = None
) -> None:
    path_config = path_config or PathConfig.load()
    task_dir_generated = path_config.generated / task.name
    for dataset_name, dataset in task.datasets.items():
        all_heuristics_file = task_dir_generated / f"heuristic_matrix_{dataset_name}.pkl"
        matrix_list = []
        all_heuristics = []
        for heuristic_module_path in task.heuristic_groups:
            partial_heuristics_file = (
                task_dir_generated
                / heuristic_module_path
                / f"heuristic_matrix_{dataset_name}.pkl"
            )
            matrix = pd.read_pickle(str(partial_heuristics_file))
            matrix_list.append(matrix)
            heuristics = load_heuristics_from_module(
                task.top_artifact, heuristic_module_path
            )
            all_heuristics.extend(heuristics)
        labeling_functions = to_labeling_functions(
            all_heuristics, dataset.mapper, task.labels
        )
        all_heuristics_matrix = pd.concat(matrix_list, axis=1)
        if sum(all_heuristics_matrix.columns.duplicated()) != 0:
            # Find and report the first duplicated heuristic by name.
            s = set()
            for c in all_heuristics_matrix.columns:
                if c in s:
                    raise ValueError(f"Duplicate heuristics are present: {c}")
                s.add(c)
            # Unreachable: duplicated() found a duplicate, so the loop must raise.
            raise AssertionError()
        all_heuristics_matrix.to_pickle(str(all_heuristics_file))

        artifact_df = dataset.load()
        label_series = (
            artifact_df[task.label_column_name]
            if task.label_column_name in artifact_df.columns
            else None
        )
        save_csv_to = path_config.generated / task.name / f"analysis_{dataset_name}.csv"
        save_json_to = path_config.metrics / task.name / f"analysis_{dataset_name}.json"
        save_metrics_to = (
            path_config.metrics / task.name / f"heuristic_metrics_{dataset_name}.json"
        )
        run_analysis(
            all_heuristics_matrix.to_numpy(),
            labeling_functions,
            save_csv_to,
            save_json_to,
            label_series,
        )
        stats = calculate_metrics(
            all_heuristics_matrix.to_numpy(),
            labeling_functions,
            label_series,
            save_to=save_metrics_to,
        )
        pprint(stats)
def save_transient_stages_to_config(
    transient_stages: List[str], path_config: Optional[PathConfig] = None
) -> None:
    path_config = path_config or PathConfig.load()
    conf_dir = path_config.project_root / ".bohr"
    if not conf_dir.exists():
        conf_dir.mkdir()
    transient_stages_file = conf_dir / "transient_stages.json"
    with transient_stages_file.open("w") as f:
        json.dump(transient_stages, f)
def add(
    path: Path,
    artifact: str,
    name: Optional[str] = None,
    author: Optional[str] = None,
    description: Optional[str] = "",
    format: Optional[str] = None,
    preprocessor: Optional[str] = None,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> Dataset:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    destination_path = path_config.downloaded_data / path.name
    logger.info(f"Copying {path.name} to {destination_path} ...")
    shutil.copy(path, destination_path)
    dvc_output = dvc.add(destination_path, path_config.project_root)
    logger.info(dvc_output)
    file_name = path.name
    if preprocessor is None:
        file_name, preprocessor = extract_preprocessor_from_file_name(file_name)
    if format is None:
        file_name, format = extract_format_from_file_name(file_name)
    dataset_name = name or file_name
    if dataset_name in bohr_repo.datasets:
        message = f"Dataset with name {dataset_name} already exists."
        if name is None:
            message += (
                "\nAre you trying to add the same dataset twice?\n"
                "If not, please specify the `name` parameter explicitly."
            )
        raise ValueError(message)
    try:
        mapper = default_mappers[artifact_map[artifact]]
    except KeyError:
        mapper = load_class_by_full_path(artifact)
    path_preprocessed: RelativePath = get_preprocessed_path(
        None,
        relative_to_safe(destination_path, path_config.downloaded_data),
        path_config.data_dir,
        preprocessor,
    )
    dataset = Dataset(
        dataset_name,
        author,
        description,
        path_preprocessed=path_preprocessed,
        path_dist=path_config.downloaded_data_dir / path.name,
        dataloader=CsvDatasetLoader(path_preprocessed, mapper()),
        preprocessor=preprocessor,
    )
    bohr_repo.datasets[dataset.name] = dataset
    bohr_repo.dump(path_config.project_root)
    repro(bohr_repo=bohr_repo, path_config=path_config)
    return dataset
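# Usage sketch (path, author, and artifact key are illustrative): register a
# local CSV as a dataset. Name, format, and preprocessor are inferred from the
# file name unless given explicitly.
#
#     dataset = add(
#         path=Path("commits.csv"),
#         artifact="commit",
#         author="jane",
#         description="commits mined from project X",
#     )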
def create_directories_if_necessary(bohr_repo: Optional[BohrRepo] = None) -> None:
    bohr_repo = bohr_repo or load_bohr_repo()
    path_config = PathConfig.load()
    for task in bohr_repo.tasks.values():
        for heuristic_group in task.heuristic_groups:
            (path_config.generated / task.name / heuristic_group).mkdir(
                exist_ok=True, parents=True
            )
            (path_config.metrics / task.name / heuristic_group).mkdir(
                exist_ok=True, parents=True
            )
    path_config.labeled_data.mkdir(exist_ok=True, parents=True)
def pull(task: str, target: str, verbose: bool = False):
    try:
        with verbosity(verbose):
            path_config = PathConfig.load()
            refresh_if_necessary(path_config)
            path = api.pull(task, target, path_config=path_config)
            logger.info(f"The dataset is available at {path_config.project_root / path}")
    except BohrDatasetNotFound as ex:
        logger.error(ex, exc_info=logger.getEffectiveLevel() == logging.DEBUG)
        exit(404)
def parse_labels(path_config: Optional[PathConfig] = None) -> None:
    path_config = path_config or PathConfig.load()
    label_tree_list = load_label_tree(path_config.labels)

    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader(Path(__file__).parent.parent))
    template = env.get_template("resources/labels.template")
    s = template.render(
        hierarchies=[l for label_tree in label_tree_list for l in label_tree.flatten()]
    )
    with open("labels.py", "w") as f:
        f.write(s)
def add(
    name: str,
    artifact: str,
    labels: str,
    label_column: str,
    authors: str,
    description: str,
    use_all_datasets: bool,
    repro: bool,
    force: bool,
    verbose: bool,
) -> None:
    with verbosity(verbose):
        project_root = find_project_root()
        bohr_repo = load_bohr_repo(project_root)
        path_config = PathConfig.load(project_root)
        if name in bohr_repo.tasks and not force:
            logger.error(f"Task {name} is already defined")
            exit(400)
        try:
            artifact_type = artifact_map[artifact]
        except KeyError:
            logger.error(f"Artifact not found: {artifact}")
            exit(404)
        label_list = list(map(lambda s: s.strip(), labels.split(",")))
        if not use_all_datasets:
            train_datasets, test_datasets = {}, {}
        else:
            all_datasets = {
                n: d
                for n, d in bohr_repo.datasets.items()
                if d.artifact_type == artifact_type
            }
            train_datasets, test_datasets = train_and_test(all_datasets, label_column)
        heuristic_groups = get_heuristic_module_list(
            artifact_type, path_config.heuristics
        )
        task = Task(
            name,
            authors,
            description,
            artifact_type,
            label_list,
            train_datasets,
            test_datasets,
            label_column,
            heuristic_groups,
        )
        bohr_repo.tasks[name] = task
        bohr_repo.dump(project_root)
        if repro:
            logger.info("Re-running the pipeline ...")
            api.repro(name, bohr_repo=bohr_repo)
def train_label_model(task: str, target_dataset: str):
    from bohr.pipeline.train_label_model import train_label_model

    setup_loggers()
    bohr_repo = load_bohr_repo()
    path_config = PathConfig.load()
    task = bohr_repo.tasks[task]
    target_dataset = bohr_repo.datasets[target_dataset]
    stats = train_label_model(task, target_dataset, path_config)
    with open(path_config.metrics / task.name / "label_model_metrics.json", "w") as f:
        json.dump(stats, f)
    pprint(stats)
def __init__(
    self,
    task: str,
    labeled_dataset_name: str,
    rev: Optional[str] = "master",
    force_update: bool = False,
):
    path_config = PathConfig.load()
    path_to_old_revision = get_path_to_revision(
        path_config.project_root, rev, force_update
    )
    self.labeled_dataset_name = labeled_dataset_name
    logging.disable(logging.WARNING)
    labeled_dataset_path = (
        path_config.labeled_data_dir / task / f"{labeled_dataset_name}.labeled.csv"
    )
    with dvc.api.open(labeled_dataset_path, path_to_old_revision) as f:
        old_df = pd.read_csv(f)
    with dvc.api.open(labeled_dataset_path) as f:
        new_df = pd.read_csv(f)
    logging.disable(logging.NOTSET)
    self.is_test_set = "bug" in old_df.columns
    old_df_columns = ["prob_CommitLabel.BugFix"]
    if self.is_test_set:
        old_df_columns.append("bug")
    self.combined_df = pd.concat(
        [
            old_df[old_df_columns],
            new_df["prob_CommitLabel.BugFix"].rename("prob_CommitLabel.BugFix_new"),
        ],
        axis=1,
    )
    if self.is_test_set:
        # Positive when the new probability moved toward the ground-truth label.
        self.combined_df.loc[:, "improvement"] = (
            self.combined_df["prob_CommitLabel.BugFix_new"]
            - self.combined_df["prob_CommitLabel.BugFix"]
        ) * (self.combined_df["bug"] * 2 - 1)
    self.combined_df.loc[:, "certainty"] = (
        np.abs(self.combined_df["prob_CommitLabel.BugFix_new"] - 0.5) * 2
    )
    if self.is_test_set:
        self.combined_df.loc[:, "precision"] = 1 - np.abs(
            self.combined_df["prob_CommitLabel.BugFix_new"] - self.combined_df["bug"]
        )
    self.combined_df = pd.concat([self.combined_df, old_df["message"]], axis=1)
    if "url" in old_df.columns:
        self.combined_df["url"] = old_df["url"]
def deserialize_bohr_repo(
    dct, cls, path_config: Optional[PathConfig] = None, **kwargs
) -> BohrRepo:
    """
    >>> jsons.loads('{"bohr_framework_version": 0.1, "tasks": {}, "datasets": {}, "dataset-linkers": {}}',
    ...             BohrRepo, path_config={'project_root': '/'})
    BohrRepo(bohr_framework_version=0.1, tasks={}, datasets={}, linkers=[])
    """
    path_config = path_config or PathConfig.load()
    datasets: Dict[str, Dataset] = {}
    for dataset_name, dataset_object in dct["datasets"].items():
        datasets[dataset_name] = jsons.load(
            dataset_object,
            Dataset,
            dataset_name=dataset_name,
            downloaded_data_dir=path_config.downloaded_data_dir,
            data_dir=path_config.data_dir,
        )
    linkers = [
        jsons.load(
            dataset_linker_obj,
            DatasetLinker,
            datasets=datasets,
            data_dir=path_config.data_dir,
        )
        for dataset_linker_obj in dct["dataset-linkers"]
    ]
    # Reset the linkers of every dataset, then attach the linker list to the
    # mapper of each dataset that is the source of a linker.
    for dataset_name, dataset in datasets.items():
        dataset.mapper.linkers = []
    for linker in linkers:
        linker.from_.mapper.linkers = linkers
    tasks = dict()
    for task_name, task_json in dct["tasks"].items():
        tasks[task_name] = jsons.load(
            task_json,
            Task,
            task_name=task_name,
            heuristic_path=path_config.heuristics,
            datasets=datasets,
        )
    return BohrRepo(
        dct["bohr_framework_version"],
        tasks,
        datasets,
        linkers,
    )
def add_dataset(
    task: Task,
    dataset: Dataset,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> Dataset:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    is_test_set = dataset.is_column_present(task.label_column_name)
    logger.info(
        f'Adding dataset {dataset.name} as a {"test" if is_test_set else "train"} set'
    )
    task.add_dataset(dataset, is_test_set)
    bohr_repo.dump(path_config.project_root)
    return dataset
def pull(
    task: str,
    target: str,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> RelativePath:
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    path = path_config.labeled_data_dir / task / f"{target}.labeled.csv"
    if path.exists():
        logger.info(dvc.pull([str(path)]))
        return path
    else:
        raise BohrDatasetNotFound(
            f"Dataset {target} in task {task} not found! "
            f"Available datasets in this task: {list(bohr_repo.tasks[task].datasets.keys())}"
        )
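# Usage sketch (task and dataset names taken from the examples elsewhere in
# this repo): fetch the labeled output of a task. The returned path is
# relative to the project root, as in the CLI wrapper above.
#
#     path = pull("bugginess", "1151-commits")
#     df = pd.read_csv(path_config.project_root / path)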
def train_label_model(
    task: Task, target_dataset: Dataset, path_config: Optional[PathConfig] = None
) -> Dict[str, Any]:
    path_config = path_config or PathConfig.load()
    task_dir_generated = path_config.generated / task.name
    if not task_dir_generated.exists():
        task_dir_generated.mkdir()
    lines_train = pd.read_pickle(
        str(task_dir_generated / f"heuristic_matrix_{target_dataset.name}.pkl")
    )
    label_model = fit_label_model(lines_train.to_numpy())
    label_model.save(str(task_dir_generated / "label_model.pkl"))
    label_model.eval()
    label_model_weights_file = (
        path_config.generated / task.name / "label_model_weights.csv"
    )
    # One row per heuristic; the four columns are the learned mu parameters.
    df = pd.DataFrame(
        label_model.mu.cpu().detach().numpy().reshape(-1, 4),
        columns=["00", "01", "10", "11"],
        index=lines_train.columns,
    )
    df.to_csv(label_model_weights_file, index_label="heuristic_name")
    stats = {}
    for test_set_name, test_set in task._test_datasets.items():
        df = test_set.load()
        if task.label_column_name not in df.columns:
            raise GroundTruthColumnNotFound(
                f"Dataset {test_set_name} is added as a test set to the {task.name} task.\n"
                f"However, column with ground-truth labels '{task.label_column_name}' not found."
            )
        stats.update(
            calculate_metrics(
                label_model,
                test_set_name,
                df[task.label_column_name].values,
                save_to=task_dir_generated,
            )
        )
    return stats
def label_dataset(
    task: Task,
    dataset: Dataset,
    path_config: Optional[PathConfig] = None,
    debug: bool = False,
):
    path_config = path_config or PathConfig.load()
    applied_heuristics_df = pd.read_pickle(
        str(path_config.generated / task.name / f"heuristic_matrix_{dataset.name}.pkl")
    )
    label_model = LabelModel()
    label_model.load(str(path_config.generated / task.name / "label_model.pkl"))
    df = dataset.load()
    df_labeled = do_labeling(
        label_model, applied_heuristics_df.to_numpy(), df, task.labels
    )
    if debug:
        # Replace each vote with the heuristic's name (or "" for abstain, -1),
        # so that a per-row list of fired heuristics can be assembled.
        for (
            heuristic_name,
            applied_heuristic_series,
        ) in applied_heuristics_df.items():
            applied_heuristics_df[heuristic_name] = applied_heuristic_series.map(
                {0: heuristic_name, 1: heuristic_name, -1: ""}
            )
        col_lfs = applied_heuristics_df.apply(
            lambda row: ";".join([elm for elm in row if elm]), axis=1
        )
        df_labeled["lfs"] = col_lfs
    labeled_data_path = path_config.labeled_data / task.name
    if not labeled_data_path.exists():
        labeled_data_path.mkdir(parents=True)
    target_file = labeled_data_path / f"{dataset.name}.labeled.csv"
    df_labeled.to_csv(target_file, index=False)
    print(f"Labeled dataset has been written to {target_file}.")
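# Usage sketch (task and dataset names are illustrative): with debug=True,
# the written CSV gains an "lfs" column listing, per data point, the names of
# the heuristics that did not abstain.
#
#     task = bohr_repo.tasks["bugginess"]
#     dataset = bohr_repo.datasets["1151-commits"]
#     label_dataset(task, dataset, debug=True)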
def repro(
    task: Optional[str] = None,
    only_transient: bool = False,
    force: bool = False,
    bohr_repo: Optional[BohrRepo] = None,
    path_config: Optional[PathConfig] = None,
) -> None:
    """
    # >>> import tempfile
    # >>> with tempfile.TemporaryDirectory() as tmpdirname:
    # ...     with open(Path(tmpdirname) / 'bohr.json', 'w') as f:
    # ...         print(f.write('{"bohr_framework_version": "0.3.9-rc", "tasks": {}, "datasets": {}}'))
    # ...     get_dvc_commands_to_repro(None, False, load_config(Path(tmpdirname)))
    """
    path_config = path_config or PathConfig.load()
    bohr_repo = bohr_repo or load_bohr_repo(path_config.project_root)
    refresh_if_necessary(path_config)
    paths_to_pull = [str(d.path_dist) for d in bohr_repo.datasets.values()]
    if len(paths_to_pull) > 0:
        logger.info(dvc.pull(paths_to_pull))
    # TODO: run only task-related transient stages if task is passed
    transient_stages = load_transient_stages(path_config)
    if len(transient_stages) > 0:
        logger.info(dvc.repro(transient_stages, force=force, path_config=path_config))
    if not only_transient:
        glob = None
        if task:
            if task not in bohr_repo.tasks:
                raise ValueError(f"Task {task} not found in bohr.json")
            glob = f"{task}_*"
        logger.info(
            dvc.repro(pull=True, glob=glob, force=force, path_config=path_config)
        )
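# Usage sketches: reproduce the whole pipeline, or only the stages of one
# task (matched by the "<task>_*" stage-name glob), forcing re-execution:
#
#     repro()
#     repro(task="bugginess", force=True)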
import sys
from pathlib import Path

import pandas as pd

from bohr.config.pathconfig import PathConfig


def combine_labels(
    path_to_labeled_dataset: Path, path_to_transformer_labels: Path, output_path: Path
) -> None:
    labeled_dataset = pd.read_csv(path_to_labeled_dataset)
    transformer_labels = pd.read_csv(path_to_transformer_labels)["prediction"].rename(
        "transformer_preds"
    )
    combined = pd.concat([labeled_dataset, transformer_labels], axis=1)
    combined.to_csv(output_path)


if __name__ == "__main__":
    project_root = PathConfig.load().project_root
    combine_labels(
        project_root / Path(sys.argv[1]),
        project_root / Path(sys.argv[2]),
        project_root / Path(sys.argv[3]),
    )
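# Invocation sketch, assuming this script is saved as combine_labels.py;
# all three arguments are paths relative to the project root:
#
#     python combine_labels.py \
#         labeled-datasets/bugginess/commits.labeled.csv \
#         transformer-preds.csv \
#         combined.csv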
def refresh(path_config: Optional[PathConfig] = None) -> None:
    path_config = path_config or PathConfig.load()
    (path_config.project_root / "dvc.yaml").unlink(missing_ok=True)
    add_all_tasks_to_dvc_pipeline()
    update_lock(path_config)
def __init__(self):
    super().__init__()
    path_config = PathConfig.load()
    # Assumes exactly one RefactoringMiner distribution is present in software_path.
    refactoring_miner_dir = os.listdir(path_config.software_path)[0]
    logger.debug(f"Using RefactoringMiner version {refactoring_miner_dir}")
    self.path = path_config.software_path / refactoring_miner_dir / "bin"
if __name__ == "__main__":
    bohr_repo = load_bohr_repo()
    task = bohr_repo.tasks["bugginess"]
    dataset = bohr_repo.datasets["1151-commits"]
    apply_heuristics(
        task,
        "heuristics.bugginess.main_heurstics",
        dataset,
        PathConfig.load(),
    )