def extract_data_from_corpus(cls,
                             corpus_path: Path,
                             trainevals: List[str],
                             groups: List[str],
                             output_path: Path,
                             ):
    # 1. Prepare output path
    if output_path.is_dir():
        cls.logger.warning(f"{output_path} already exists, will overwrite the files.")
    elif output_path.is_file():
        LoggingUtils.log_and_raise(cls.logger, f"{output_path} already exists as a file. Aborting.", Exception)
    else:
        IOUtils.mk_dir(output_path)
    # end if

    assert all([traineval in Macros.DS_TRAINEVALS for traineval in trainevals])
    assert all([group in Macros.DS_GROUPS + [Macros.DS_GROUP_TA] for group in groups])

    data_mgr = FilesManager(corpus_path)

    # 2. Load lemmas and definitions
    lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED], IOUtils.Format.json, is_batched=True, clz=Lemma)
    definitions: List[Definition] = data_mgr.load_data([FilesManager.DEFINITIONS, "definitions.json"], IOUtils.Format.json, clz=Definition)

    # 3. Output to output_path for each combination of traineval and group
    for traineval in trainevals:
        for group in groups:
            IOUtils.mk_dir(output_path / f"{group}-{traineval}")
            # Assumption: the data indexes live under the corpus' "training" directory;
            # the original referenced an undefined `project_dir` and had a stray bracket.
            data_indexes = IOUtils.load(corpus_path / "training" / f"{group}-{traineval}.json", IOUtils.Format.json, clz=str)
            IOUtils.dump(output_path / f"{group}-{traineval}/lemmas.json", IOUtils.jsonfy([l for l in lemmas_filtered if l.data_index in data_indexes]), IOUtils.Format.json)
            IOUtils.dump(output_path / f"{group}-{traineval}/definitions.json", IOUtils.jsonfy([d for d in definitions if d.data_index in data_indexes]), IOUtils.Format.json)
        # end for
    # end for
    return
def write_seq_len_stat(num_pa, ref_modelname):
    stat_list = list()
    src_l = list()
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), f"data/vhdl/{ref_modelname}")
    for mode in ["train", "val", "test"]:
        src_l_file = os.path.join(data_dir, f"src.l.{mode}.txt")
        src_l += [l.split() for l in IOUtils.load(src_l_file, IOUtils.Format.txt).strip().splitlines()]
    stat_list.append(get_seq_len_stat(src_l))
    for i in range(num_pa):
        src_pa = list()
        result_list = list()
        for mode in ["train", "val", "test"]:
            src_pa_file = os.path.join(data_dir, f"src.prevassign{i}.{mode}.txt")
            src_pa += [l.split() for l in IOUtils.load(src_pa_file, IOUtils.Format.txt).strip().splitlines()]
        for j, pa in enumerate(src_pa):
            if pa != ["<empty>"]:
                src_l[j] = pa + src_l[j]
            result_list.append(src_l[j])
        stat_list.append(get_seq_len_stat(result_list))
    results_file = os.path.join("../slpproject/_results/vhdl/ALL/metrics", "lhs-pa-len-stat.json")
    IOUtils.dump(results_file, stat_list, IOUtils.Format.json)
    return
def test_format_yaml(self):
    """Tests for IOUtils.Format.yaml"""
    objs = [
        42.001,
        "aaa",
        [13, "24", 56.7],
        {"name": "K", "job": "Y"},
    ]
    exp_strs = [
        "42.001\n...\n",
        "aaa\n...\n",
        "- 13\n- '24'\n- 56.7\n",
        "job: Y\nname: K\n",  # dictionaries are dumped with keys sorted
    ]
    for obj, exp_str in zip(objs, exp_strs):
        path = Path(tempfile.mktemp())

        # Test dump
        IOUtils.dump(path, obj, IOUtils.Format.yaml)
        self.assertEqual(exp_str, self.load_plain(path))

        # Test load
        loaded = IOUtils.load(path, IOUtils.Format.yaml)
        self.assertEqual(obj, loaded)

        self.rm(path)
def data_cut(self, data_size: int):
    """Cut down the dataset to data_size, then save the projects list to data_dir."""
    collected_projects_file = Macros.data_dir / "projects-github.txt"
    self.collected_projects_list = list()
    if collected_projects_file.exists():
        self.collected_projects_list += IOUtils.load(collected_projects_file, IOUtils.Format.txt).splitlines()
    # end if
    project_name_list = list()
    for project_url in self.collected_projects_list:
        user_repo = self.parse_github_url(project_url)
        project_name_list.append(f"{user_repo[0]}_{user_repo[1]}")
    all_used_projects = [str(x).split("/")[-1] for x in Macros.repos_results_dir.iterdir() if x.is_dir()]

    # Find the overlapping projects and select the top data_size projects
    overall_project_num = 0
    reduced_project_list = list()
    for p in project_name_list:
        if p in all_used_projects and overall_project_num < data_size:
            # load the revision data
            filtered_methods = IOUtils.load(Macros.repos_results_dir / p / "collector" / "method-project-revision.json")
            new_method_ids = [delta_data["method_ids"] for delta_data in filtered_methods if delta_data["year"] == "2020_Jan_1"][0]
            if len(new_method_ids) > 0:
                reduced_project_list.append(p)
                overall_project_num += 1
            all_used_projects.remove(p)
    IOUtils.dump(Macros.data_dir / f"projects-github-{data_size}.json", reduced_project_list, IOUtils.Format.jsonNoSort)
def suggest_naming(self, file_path: Path, prj_root: Optional[Path] = None):
    """
    Processes a file to get its lemmas and runs the model to get predictions.
    """
    # Figure out which project we're at, and then load configs
    if prj_root is None:
        prj_root = RoosterizeDirUtils.auto_infer_project_root(file_path)
    self.load_configs(prj_root)

    # Infer SerAPI options
    serapi_options = self.infer_serapi_options(prj_root)

    # If user provided compile_cmd, first compile the project
    if self.compile_cmd is not None:
        with IOUtils.cd(prj_root):
            BashUtils.run(self.compile_cmd, expected_return_code=0)

    # Parse file
    data = self.parse_file(file_path, prj_root, serapi_options)

    # Load model
    self.load_local_model(prj_root)
    model = self.get_model()

    # Use the model to make predictions
    # Temp dirs for processed data and results
    temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

    # Dump lemmas & definitions
    temp_raw_data_dir = temp_data_dir / "raw"
    temp_raw_data_dir.mkdir()
    IOUtils.dump(
        temp_raw_data_dir / "lemmas.json",
        IOUtils.jsonfy(data.lemmas),
        IOUtils.Format.json,
    )
    IOUtils.dump(
        temp_raw_data_dir / "definitions.json",
        IOUtils.jsonfy(data.definitions),
        IOUtils.Format.json,
    )

    # Model-specific process
    temp_processed_data_dir = temp_data_dir / "processed"
    temp_processed_data_dir.mkdir()
    model.process_data_impl(temp_raw_data_dir, temp_processed_data_dir)

    # Invoke eval
    candidates_logprobs = model.eval_impl(
        temp_processed_data_dir,
        beam_search_size=self.beam_search_size,
        k=self.k,
    )

    # Clean up temp dirs
    IOUtils.rm_dir(temp_data_dir)

    # Report predictions
    self.report_predictions(data, candidates_logprobs)
    return
def split_project(self, method_file: Path, random_seed: int, debug: bool = False):
    """
    Split projects into train, val, test according to the project names.
    Produces 2 new files: project-list.json, project-split.json.
    """
    proj_list = set()
    with open(method_file, "r") as f:
        objects = ijson.items(f, "item")
        for o in objects:
            proj_list.add(o["prj_name"])
    num_proj = len(proj_list)
    proj_list = list(proj_list)
    if debug:
        output_dir = Path("/tmp/nlpast-data-10")
    else:
        output_dir = Path("/tmp/nlpast-data-880")
    IOUtils.dump(output_dir / "project-list.json", proj_list)

    random.seed(random_seed)
    random.shuffle(proj_list)
    train_index = round(num_proj * 0.8)
    valid_index = train_index + round(num_proj * 0.1)
    train_projs = proj_list[:train_index]
    valid_projs = proj_list[train_index:valid_index]
    test_projs = proj_list[valid_index:]
    project_split = {
        "train": train_projs,
        "val": valid_projs,
        "test": test_projs,
    }
    IOUtils.dump(output_dir / "project-split.json", project_split)
def process_data_impl(
        self,
        data_dir: Path,
        output_processed_data_dir: Path,
) -> NoReturn:
    lemmas: List[Lemma] = IOUtils.dejsonfy(
        IOUtils.load(data_dir / "lemmas.json", IOUtils.Format.json), List[Lemma])
    definitions: List[Definition] = IOUtils.dejsonfy(
        IOUtils.load(data_dir / "definitions.json", IOUtils.Format.json), List[Definition])

    docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

    # Inputs
    all_inputs: Dict[str, List[List[str]]] = self.get_all_inputs(lemmas, docs_sub_tokenizers)
    for input_type, src_sentences in all_inputs.items():
        IOUtils.dump(
            output_processed_data_dir / f"src.{input_type}.txt",
            "".join([" ".join(sent) + "\n" for sent in src_sentences]),
            IOUtils.Format.txt)
    # end for

    # Outputs
    IOUtils.dump(
        output_processed_data_dir / "tgt.txt",
        "".join([" ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas]),
        IOUtils.Format.txt)

    super().process_data_impl(data_dir, output_processed_data_dir)
    return
def process_data(self, project_dir):
    try:
        revision_data = IOUtils.load(project_dir / "collector" / "method-project-revision.json")
        method_data = IOUtils.load(project_dir / "collector" / "method-data.json")
        output_dir = project_dir / "collector"
        method_project_evo = []
        for year in BetaFilter.YEARS[:-1]:
            curr_time = f"{year}_Jan_1"
            curr_method_ids = [year_data["method_ids"] for year_data in revision_data if year_data["year"] == curr_time][0]
            next_time = f"{year + 1}_Jan_1"
            next_method_ids = [year_data["method_ids"] for year_data in revision_data if year_data["year"] == next_time][0]
            new_method_ids = list(set(next_method_ids) - set(curr_method_ids))
            filtered_method_ids = BetaFilter.beta_filter(new_method_ids, curr_method_ids, method_data)
            method_project_evo.append({
                "prj_name": revision_data[0]["prj_name"],
                "time": f"{curr_time}-{next_time}",
                "method_ids": filtered_method_ids,
            })
        IOUtils.dump(output_dir / "method-project-beta-filtered.json", IOUtils.jsonfy(method_project_evo), IOUtils.Format.json)
        return
    except:
        self.logger.info(f"Unexpected error: {sys.exc_info()[0]}")
        return
def main_val(modelname, ref_modelname):
    bleus, accs, exact_accs = [], [], []
    target_list = get_targets(ref_modelname, "val")
    preds_list = get_baseline_preds(ref_modelname, "val")
    for pred, target in zip(preds_list, target_list):
        pred_split = [t for t in pred.split(" ") if t != '']
        target_split = [t for t in target.split(" ") if t != '']
        bleu = get_bleu(target=target_split, pred=pred_split)
        acc = get_accuracy(target=target_split, pred=pred_split)
        exact_acc = get_exact_match_accuracy(target=target_split, pred=pred_split)
        bleus.append(bleu)
        accs.append(acc)
        exact_accs.append(exact_acc)
    avg_bleu = np.mean(bleus)
    avg_acc = np.mean(accs)
    avg_exact_acc = np.mean(exact_accs)
    print(f"Average BLEU: {avg_bleu:.3f}, average accuracy: {avg_acc:.3f}, average exact match accuracy: {avg_exact_acc:.3f}")
    results_file = os.path.join(SAVE_DIR, modelname, "testlog.val.assignments.baseline.log")
    results = {
        "bleu-AVG": avg_bleu,
        "acc-AVG": avg_acc,
        "exact-acc-AVG": avg_exact_acc,
        "bleu": bleus,
        "acc": accs,
        "exact-acc": exact_accs,
    }
    IOUtils.dump(results_file, results, IOUtils.Format.jsonNoSort)
    IOUtils.dump(
        os.path.join(SAVE_DIR, modelname, "pred.val.assignments.baseline.log"),
        "".join([pred.strip() + "\n" for pred in preds_list]),
        IOUtils.Format.txt)
    return
def dump(self, path: Path):
    d = dict()
    for f in ["word_to_index", "index_to_word", "next_index", "counter"]:
        d[f] = getattr(self, f)
    # end for
    IOUtils.dump(path, d, IOUtils.Format.jsonPretty)
    return
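# A minimal sketch of a matching load routine for the dump above, assuming the same
# four attribute names and the surrounding code's imports (Path, IOUtils); this helper
# is hypothetical and not part of the original code.
def load_vocab_fields(obj, path: Path):
    d = IOUtils.load(path, IOUtils.Format.json)
    for f in ["word_to_index", "index_to_word", "next_index", "counter"]:
        setattr(obj, f, d[f])
    return obj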
def dump_data(self,
              rel_path: Union[str, List[str]],
              data: Any,
              fmt: IOUtils.Format,
              is_batched: bool = False,
              per_batch: int = 100,
              exist_ok: bool = False,
              ):
    abs_path = self.data_dir / self.assemble_rel_path(rel_path)
    if abs_path.exists() and not exist_ok:
        LoggingUtils.log_and_raise(self.logger, f"Cannot rewrite existing data at {abs_path}", IOError)
    # end if

    abs_path.parent.mkdir(parents=True, exist_ok=True)

    if not is_batched:
        if self.is_json_format(fmt):
            data = IOUtils.jsonfy(data)
        # end if
        IOUtils.dump(abs_path, data, fmt)
    else:
        # In batched mode, the data need to be slice-able and sizable
        IOUtils.rm(abs_path)
        abs_path.mkdir(parents=True)
        for batch_i in tqdm(range(math.ceil(len(data) / per_batch))):
            data_batch = data[per_batch * batch_i : per_batch * (batch_i + 1)]
            if self.is_json_format(fmt):
                data_batch = IOUtils.jsonfy(data_batch)
            # end if
            IOUtils.dump(abs_path / f"batch-{batch_i}.{fmt.get_extension()}", data_batch, fmt)
        # end for
    # end if
    return
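# Hedged usage sketch for dump_data above: a hypothetical FilesManager instance writing
# a list of lemmas in batches of 100 JSON files. `corpus_path` and `lemmas` are
# placeholders; only FilesManager, LEMMAS_FILTERED, and IOUtils come from the code above.
mgr = FilesManager(corpus_path)
mgr.dump_data([FilesManager.LEMMAS_FILTERED], lemmas, IOUtils.Format.json,
              is_batched=True, per_batch=100)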
def convert_json2txt(config_dict, data_types: List[str] = None):
    if data_types is None:
        data_types = ["train", "val", "test"]
    for data_type in data_types:
        data_list = IOUtils.load(
            os.path.join(DATADIR, config_dict["intermediate_data_dir"], f"{data_type}.json"),
            IOUtils.Format.json)
        for src_type in config_dict["src_types"]:
            output_path = os.path.join(DATADIR, config_dict["save_dir"], f"src.{src_type}.{data_type}.txt")
            pa_i = int(config_dict["augment"])
            if src_type == "l":
                field = "l"
            elif src_type == "type":
                field = "l-type"
            elif src_type == "prevassign":
                field = f"pa{pa_i}"
            elif src_type == "patype":
                field = f"pa{pa_i}-type"
            else:
                raise ValueError(f"Unknown src_type {src_type}")
            # end if
            with open(output_path, "w") as f:
                for data in data_list:
                    if len(data[field]) == 0:
                        if field.endswith("-type"):
                            f.write("<pad>\n")
                        else:
                            f.write("<empty>\n")
                        # end if
                    else:
                        f.write(data[field] + "\n")
                    # end if
                # end for
            # end with
        # end for
        fn_output_path = os.path.join(DATADIR, config_dict["save_dir"], f"src.fn.{data_type}.txt")
        IOUtils.dump(fn_output_path, "".join([data["file_sha"] + "\n" for data in data_list]), IOUtils.Format.txt)
        tgt_output_path = os.path.join(DATADIR, config_dict["save_dir"], f"tgt.{data_type}.txt")
        # [3:-2]: remove prefix "<= " and suffix " ;"
        IOUtils.dump(tgt_output_path, "".join([data["r"][3:-2] + "\n" for data in data_list]), IOUtils.Format.txt)
    # end for
    print("Conversion into txt is done.")
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    exp_dir = self.work_dir
    for trial in trials:
        trial_dir = exp_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)
        model_dir = trial_dir / "models"
        IOUtils.mk_dir(model_dir)
        log_dir = trial_dir / "logs"
        IOUtils.mk_dir(log_dir)
        data = str(exp_dir / "data/code2seq")
        val_data = data + ".val.c2s"
        train_log = trial_dir / "training-trace.json"
        train_script_file = trial_dir / f"{Macros.train}.sh"

        # Copy config file
        BashUtils.run(f"cp {self.base_config_file} {trial_dir}/config.yaml", expected_return_code=0)
        output_file = trial_dir / "output_tmp.txt"
        reference_file = trial_dir / "ref_tmp.txt"
        config_file = trial_dir / "config.yaml"
        train_script = f"#!/bin/bash\n" \
                       f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                       f"conda activate {self.CONDA_ENV}\n" \
                       f"module load cuda/10.0 cudnn/7.6.2\n" \
                       f"cd {self.code_dir}\n" \
                       f"python -u code2seq.py --data {data} --test {val_data} --log {train_log} --config {config_file} " \
                       f"--pred_file {output_file} --ref_file {reference_file} " \
                       f"--save_prefix {model_dir}/model --gpu_id $1 &> {trial_dir}/train-log.txt"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

        for test_type in [Macros.test_common, Macros.test_standard]:
            test_data = exp_dir / "data" / f"code2seq.{test_type}.c2s"
            output_file = trial_dir / f"output_{test_type}.txt"
            reference_file = trial_dir / f"ref_{test_type}.txt"
            test_script_file = trial_dir / f"{test_type}.sh"
            test_script = f"#!/bin/bash\n" \
                          f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                          f"conda activate {self.CONDA_ENV}\n" \
                          f"module load cuda/10.0 cudnn/7.6.2\n" \
                          f"cd {self.code_dir}\n" \
                          f"python3 code2seq.py --load {model_dir}/model_best --test {test_data} --config {config_file} " \
                          f"--pred_file {output_file} --ref_file {reference_file} " \
                          f"--gpu_id $1 &> {trial_dir}/{test_type}-log.txt\n" \
                          f"python3 eval_utils.py {reference_file} {output_file} {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
    return
def get_eval_stats(pred_file: Path, ref_file: Path, result_dir: Path):
    true_positive, false_positive, false_negative = 0, 0, 0
    with open(pred_file, "r") as pf, open(ref_file, "r") as rf:
        pred_lines = pf.readlines()
        ref_lines = rf.readlines()
        true_positive, false_positive, false_negative = update_per_subtoken_statistics(
            zip(ref_lines, pred_lines), true_positive, false_positive, false_negative)
    precision, recall, f1 = calculate_results(true_positive, false_positive, false_negative)
    test_result = {"f1": f1, "precision": precision, "recall": recall}
    IOUtils.dump(result_dir, test_result, IOUtils.Format.jsonPretty)
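# Hedged sketch of what calculate_results presumably computes from the subtoken counts
# above: standard precision/recall/F1 with guards against division by zero. The actual
# helper is not shown here and may differ in details.
def calculate_results_sketch(true_positive: int, false_positive: int, false_negative: int):
    precision = true_positive / (true_positive + false_positive) if true_positive + false_positive > 0 else 0.0
    recall = true_positive / (true_positive + false_negative) if true_positive + false_negative > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, f1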
def install_coq_project(cls, project: Project, names_projects: Dict[str, Project]) -> None:
    """
    :requires: the project is cloned and checked-out to the desired version.
    """
    if not project.is_cloned:
        project.clone()
        project.checkout(project.data["sha"], is_forced=True)
    # end if

    # Check if the project is already compiled
    confirmation_file = "lpc-installed.txt"
    confirmation_content = project.revision + " " + BashUtils.run("opam list coq -s", expected_return_code=0).stdout.strip()
    if (project.checkout_dir / confirmation_file).is_file() and IOUtils.load(project.checkout_dir / confirmation_file, "txt") == confirmation_content:
        cls.logger.debug(f"Project {project.full_name} already installed")
        return
    # end if

    project.clean()

    # Install dependencies
    for dependency in project.data.get("dependencies", []):
        dependency_project = names_projects.get(dependency)
        if dependency_project is None:
            raise Exception(f"Cannot find dependency {dependency}")
        cls.logger.info(f"For Project {project.full_name}, installing dependency {dependency}")
        cls.install_coq_project(dependency_project, names_projects)
    # end for

    if "build_cmd" not in project.data:
        raise Exception(f"Project {project.full_name} does not have build_cmd")
    if "install_cmd" not in project.data:
        raise Exception(f"Project {project.full_name} does not have install_cmd")

    with IOUtils.cd(project.checkout_dir):
        # Build
        cls.logger.info(f"Project {project.full_name}: Building with {project.data['build_cmd']}")
        r = BashUtils.run(project.data["build_cmd"])
        if r.return_code != 0:
            raise Exception(f"Compilation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
        else:
            cls.logger.debug(f"Compilation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
        # end if

        # Install
        cls.logger.info(f"Project {project.full_name}: Installing with {project.data['install_cmd']}")
        r = BashUtils.run(project.data["install_cmd"])
        if r.return_code != 0:
            raise Exception(f"Installation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
        else:
            cls.logger.debug(f"Installation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
        # end if

        IOUtils.dump(project.checkout_dir / confirmation_file, confirmation_content, "txt")
    # end with
    return
def split_dataset_cross_file(
        cls,
        assignments_path: Path,
        output_dir: Path,
        seed,
        use_new_sub_tokenizer: bool,
):
    """
    Split the dataset such that assignments in the test set come from different files
    than those in the training/validation sets. Specifically, the entire set of files
    is partitioned into test/train/val. A worked example of the thresholds is sketched
    after this function.
    """
    # Load the assignments dataset, as a flattened list
    data_list = cls.load_data_list(assignments_path)
    file_list = cls.shuffle_data(cls.extract_file_list(data_list), seed)
    val_data_list = list()
    test_data_list = list()
    train_data_list = list()
    for fsha in file_list:
        if len(test_data_list) < int(len(data_list) * 0.1):
            test_data_list.extend(cls.extract_assignments_from(fsha, data_list))
        elif len(test_data_list) + len(train_data_list) < int(len(data_list) * 0.9):
            train_data_list.extend(cls.extract_assignments_from(fsha, data_list))
        else:
            val_data_list.extend(cls.extract_assignments_from(fsha, data_list))
    statistics = {
        "num-data": len(data_list),
        "num-data-train": len(train_data_list),
        "num-data-val": len(val_data_list),
        "num-data-test": len(test_data_list),
        "num-files": len(file_list),
    }
    IOUtils.mk_dir(output_dir)
    cls.dump_data_list(output_dir / "train.json", train_data_list)
    cls.dump_data_list(output_dir / "val.json", val_data_list)
    cls.dump_data_list(output_dir / "test.json", test_data_list)
    IOUtils.dump(output_dir / "statistics.json", statistics, IOUtils.Format.jsonNoSort)
    IOUtils.dump(output_dir / "files.json", file_list, IOUtils.Format.jsonNoSort)
    return
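# Worked example of the split thresholds above, with made-up sizes: files are assigned
# whole, so the realized split only approximates 10/80/10 by assignment count.
total = 1000                   # len(data_list)
test_cap = int(total * 0.1)    # 100: files go to test until it holds >= 100 assignments
train_cap = int(total * 0.9)   # 900: then to train until test+train hold >= 900
# the remaining files (roughly the last 10% of assignments) fall into val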
def parse_projects(cls, project_list_file):
    """
    Parse the project list file provided by DeepCom and write the GitHub URL file.
    """
    project_list = IOUtils.load(project_list_file, IOUtils.Format.txt).splitlines()
    git_urls = list()
    for project in project_list:
        project_name = project.split("_", 1)
        git_urls.append(f"https://github.com/{project_name[0]}/{project_name[1]}.git")
    IOUtils.dump(Macros.data_dir / "DeepCom-projects-github.txt",
                 "".join([url + "\n" for url in git_urls]),
                 IOUtils.Format.txt)
def prepare_configs_and_scripts(self, trials: List[int]):
    exp_dir = self.work_dir
    for trial in trials:
        trial_dir = exp_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)

        train_script_file = trial_dir / "train.sh"
        train_script = f"#!/bin/bash\n" \
                       f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                       f"module load cuda/10.1 cudnn/7.6.2\n" \
                       f"conda activate {self.CONDA_ENV}\n" \
                       f"cd {self.code_dir}\n" \
                       f"export MKL_SERVICE_FORCE_INTEL=1\n" \
                       f"python3 train.py " \
                       f"-data {self.data_dir}/transformer -save_model {trial_dir}/bestTransformer " \
                       f"-layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 " \
                       f"-encoder_type transformer -decoder_type transformer -position_encoding " \
                       f"-train_steps 50000 -max_generator_batches 2 -dropout 0.1 " \
                       f"-batch_size 4096 -batch_type tokens -normalization tokens -accum_count 2 " \
                       f"-optim adam -adam_beta2 0.998 -decay_method noam -warmup_steps 8000 -learning_rate 2 " \
                       f"-max_grad_norm 0 -param_init 0 -param_init_glorot -early_stopping 10 -keep_checkpoint 1 " \
                       f"-label_smoothing 0.1 -valid_steps 500 -save_checkpoint_steps 500 -report_every 500 " \
                       f"--world_size 1 --gpu_ranks 0 " \
                       f"&> {trial_dir}/train-log.txt\n"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

        for test_type in [Macros.test_common, Macros.test_standard]:
            test_script_file = trial_dir / f"{test_type}.sh"
            output_file = trial_dir / f"output_{test_type}.txt"
            test_script = f"#!/bin/bash\n" \
                          f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                          f"module load cuda/10.1 cudnn/7.6.2\n" \
                          f"conda activate {self.CONDA_ENV}\n" \
                          f"cd {self.code_dir}\n" \
                          f"export MKL_SERVICE_FORCE_INTEL=1\n" \
                          f"python3 translate.py " \
                          f"--model {trial_dir}/*.pt --output {output_file} --src {self.data_dir}/src-{test_type}.txt " \
                          f"&> {trial_dir}/{test_type}-log.txt\n" \
                          f"python3 eval_utils.py " \
                          f"{self.data_dir}/tgt-{test_type}.txt {output_file} {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
        # end for
    return
def convert_json2txt(config_dict, data_types: List[str] = None):
    if data_types is None:
        data_types = ["train", "val", "test"]
    for data_type in data_types:
        data_list = IOUtils.load(
            os.path.join(DATADIR, config_dict["intermediate_data_dir"], f"{data_type}.json"),
            IOUtils.Format.json)
        for src_type in config_dict["src_types"]:
            output_path = os.path.join(DATADIR, config_dict["save_dir"], f"src.{src_type}.{data_type}.txt")
            num_pa = int(config_dict["num_pa"])
            with open(output_path, "w") as f:
                for data in data_list:
                    if src_type == "l":
                        seq = data["l"]
                        for pa_i in range(num_pa):
                            seq += " " + data[f"pa{pa_i+1}"]
                        # end for
                    elif src_type == "type":
                        seq = data["l-type-each-token"]
                        for pa_i in range(num_pa):
                            seq += " " + data[f"pa{pa_i+1}-type"]
                        # end for
                    else:
                        raise ValueError(f"Unknown src_type {src_type}")
                    # end if
                    if len(seq) == 0:
                        if src_type == "type":
                            f.write("<pad>\n")
                        else:
                            f.write("<empty>\n")
                        # end if
                    else:
                        f.write(seq + "\n")
                    # end if
                # end for
            # end with
        # end for
        fn_output_path = os.path.join(DATADIR, config_dict["save_dir"], f"src.fn.{data_type}.txt")
        IOUtils.dump(fn_output_path, "".join([data["file_sha"] + "\n" for data in data_list]), IOUtils.Format.txt)
        tgt_output_path = os.path.join(DATADIR, config_dict["save_dir"], f"tgt.{data_type}.txt")
        # [3:-2]: remove prefix "<= " and suffix " ;"
        IOUtils.dump(tgt_output_path, "".join([data["r"][3:-2] + "\n" for data in data_list]), IOUtils.Format.txt)
    # end for
    print("Conversion into txt is done.")
    return
def process_data_impl(self,
                      data_dir: Path,
                      output_processed_data_dir: Path,
                      ) -> NoReturn:
    lemmas: List[Lemma] = IOUtils.dejsonfy(IOUtils.load(data_dir / "lemmas.json", IOUtils.Format.json), List[Lemma])
    definitions: List[Definition] = IOUtils.dejsonfy(IOUtils.load(data_dir / "definitions.json", IOUtils.Format.json), List[Definition])

    docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

    # Put data in serialized files
    IOUtils.dump(output_processed_data_dir / "src.txt",
                 "".join([" ".join(self.get_input(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas]),
                 IOUtils.Format.txt)
    IOUtils.dump(output_processed_data_dir / "tgt.txt",
                 "".join([" ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas]),
                 IOUtils.Format.txt)
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    with open(self.base_config_file, "r") as f:
        # pass an explicit Loader; bare yaml.load is deprecated in newer PyYAML
        base_config = yaml.load(f, Loader=yaml.FullLoader)
    exp_dir = self.work_dir
    for trial in trials:
        seed = random.randint(0, 9)
        trial_dir = exp_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)
        config = copy.copy(base_config)
        config["data"] = str(self.data_dir / "biLSTM")
        config["save_model"] = str(trial_dir / "bestLSTM")
        config_file = trial_dir / "config.yaml"
        with open(config_file, "w+") as f:
            yaml.dump(config, f)

        train_script_file = trial_dir / "train.sh"
        train_script = f"#!/bin/bash\n" \
                       f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                       f"module load cuda/10.1 cudnn/7.6.2\n" \
                       f"conda activate {self.CONDA_ENV}\n" \
                       f"cd {self.code_dir}\n" \
                       f"export MKL_SERVICE_FORCE_INTEL=1\n" \
                       f"python3 train.py --config {config_file} --world_size 1 --gpu_ranks 0 -keep_checkpoint 1 " \
                       f"--seed {seed} &> {trial_dir}/train-log.txt\n"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

        for test_type in [Macros.test_common, Macros.test_standard]:
            test_script_file = trial_dir / f"{test_type}.sh"
            output_file = trial_dir / f"output_{test_type}.txt"
            test_script = f"#!/bin/bash\n" \
                          f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                          f"module load cuda/10.1 cudnn/7.6.2\n" \
                          f"conda activate {self.CONDA_ENV}\n" \
                          f"cd {self.code_dir}\n" \
                          f"export MKL_SERVICE_FORCE_INTEL=1\n" \
                          f"python3 translate.py " \
                          f"--model {trial_dir}/*.pt --output {output_file} --src {self.data_dir}/src-{test_type}.txt " \
                          f"&> {trial_dir}/{test_type}-log.txt\n" \
                          f"python3 eval_utils.py " \
                          f"{self.data_dir}/tgt-{test_type}.txt {output_file} {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
        # end for
    return
def test_format_txt_list(self):
    """Tests for IOUtils.Format.txtList"""
    obj = ["abcde", "12345", "x y z"]
    path = Path(tempfile.mktemp())
    expected = "abcde\n12345\nx y z\n"

    # Test dump
    IOUtils.dump(path, obj, IOUtils.Format.txtList)
    self.assertEqual(expected, self.load_plain(path))

    # Test load
    loaded = IOUtils.load(path, IOUtils.Format.txtList)
    self.assertEqual(obj, loaded)

    self.rm(path)
def test_format_json_list(self):
    """Tests for IOUtils.Format.jsonList"""
    obj = ["abcde", [1, 2, 3], {"abc": "def"}]
    path = Path(tempfile.mktemp())
    expected = '"abcde"\n[1, 2, 3]\n{"abc": "def"}\n'

    # Test dump
    IOUtils.dump(path, obj, IOUtils.Format.jsonList)
    self.assertEqual(expected, self.load_plain(path))

    # Test load
    loaded = IOUtils.load(path, IOUtils.Format.jsonList)
    self.assertEqual(obj, loaded)

    self.rm(path)
def compute_bleu(cls, references: str, hypotheses: str, test_result_file: str) -> float:
    with open(references, 'r') as fr, open(hypotheses, 'r') as fh:
        refs = fr.readlines()
        hyps = fh.readlines()
    bleu_4_sentence_scores = []
    for ref, hyp in zip(refs, hyps):
        if len(hyp.strip().split()) < 2:
            bleu_4_sentence_scores.append(0)
        else:
            bleu_4_sentence_scores.append(
                sentence_bleu([ref.strip().split()], hyp.strip().split(),
                              smoothing_function=SmoothingFunction().method2,
                              auto_reweigh=True))
    score = 100 * sum(bleu_4_sentence_scores) / float(len(bleu_4_sentence_scores))
    result = {"bleu": score}
    IOUtils.dump(test_result_file, result)
    return score
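# Hedged toy example of the per-sentence scoring used above: NLTK sentence_bleu with
# method2 smoothing and auto_reweigh, averaged over sentences and scaled to 0-100.
# The sentences are made up for illustration.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

refs = ["return the sum of a and b", "open the file and read lines"]
hyps = ["return sum of a and b", "open file and read all lines"]
scores = [
    sentence_bleu([r.split()], h.split(),
                  smoothing_function=SmoothingFunction().method2,
                  auto_reweigh=True)
    for r, h in zip(refs, hyps)
]
print(100 * sum(scores) / len(scores))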
def process_data(self, method_data_list: List[MethodData], data_type: str, output_dir: Path, split: bool = True):
    Environment.require_collector()

    log_file = output_dir / "collector-log.txt"
    data_file = output_dir / "method-data.json"
    IOUtils.dump(data_file, IOUtils.jsonfy(method_data_list), IOUtils.Format.json)

    config = {
        "transform": True,
        "model": "BiLSTM",
        "dataType": data_type,
        "dataFile": str(data_file),
        "logFile": str(log_file),
        "outputDir": str(output_dir),
    }
    config_file = output_dir / "collector-config.json"
    IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

    self.logger.info(f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}")
    rr = BashUtils.run(f"java -jar {Environment.collector_jar} {config_file}", expected_return_code=0)
    if rr.stdout:
        self.logger.warning(f"Stdout of collector:\n{rr.stdout}")
    # end if
    if rr.stderr:
        self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
    # end if

    # Build raw dataset
    if split:
        self.tokenizeFile(output_dir / f"{data_type}.raw.txt", data_type)
    else:
        self.noSplit(output_dir / f"{data_type}.raw.txt", data_type)
    error_ids = IOUtils.load(str(output_dir) + "-error-ids.json")
    print(f"Number of error ids: {len(error_ids)}")
    # BashUtils.run(f"rm {output_dir}-error-ids.json", expected_return_code=0)
    return error_ids
def collect_lat_results(self, model: str, metrics: List[str], debug: bool = False):
    lat_results = {k: 0 for k in metrics}
    model_work_dir = Macros.data_dir / "models-work" / f"{model}-latest" if not debug \
        else Macros.data_dir / "models-work" / f"{model}-latest-debug"
    for trial in range(Macros.trials):
        trial_dir = model_work_dir / "latest" / f"trial-{trial}"
        result_file = f"{trial_dir}/test_result.json"
        metrics_dict = IOUtils.load(result_file)
        for k, v in metrics_dict.items():
            lat_results[k.lower()] += v
    for k, v in lat_results.items():
        lat_results[k] = round(v / Macros.trials, 2)
    output_dir = Macros.results_dir / "metrics"
    IOUtils.dump(output_dir / f"{model}-latest-results.json", lat_results, IOUtils.Format.jsonPretty)
def write_results(targets: List[List[str]], preds: List[List[str]], results_dir: Path, mode="test"):
    bleu_scores = list()
    acc_scores = list()
    exact_acc_scores = list()
    results_preds = ""
    for t, p in zip(targets, preds):
        bleu_score = get_bleu(t, p)
        acc_score = get_accuracy(t, p)
        exact_acc_score = get_exact_match_accuracy(t, p)
        bleu_scores.append(bleu_score)
        acc_scores.append(acc_score)
        exact_acc_scores.append(exact_acc_score)
        results_preds += " ".join(p)
        results_preds += "\n"
    # end for
    avg_bleu = np.mean(bleu_scores)
    avg_acc = np.mean(acc_scores)
    avg_exact_acc = np.mean(exact_acc_scores)
    print(f"Average BLEU: {avg_bleu:.3f}, average accuracy: {avg_acc:.3f}, average exact match accuracy: {avg_exact_acc:.3f}")
    results = {
        "bleu-AVG": avg_bleu,
        "acc-AVG": avg_acc,
        "exact-acc-AVG": avg_exact_acc,
        "bleu": bleu_scores,
        "acc": acc_scores,
        "exact-acc": exact_acc_scores,
    }
    isval = ".assignments" if mode == "test" else ".val.assignments"
    results_dir.mkdir(parents=True, exist_ok=True)
    results_file: Path = results_dir / f"testlog{isval}.ngram.log"
    pred_file: Path = results_dir / f"pred{isval}.ngram.log"
    IOUtils.dump(results_file, results, IOUtils.Format.jsonNoSort)
    IOUtils.dump(pred_file, results_preds, IOUtils.Format.txt)
    return
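# Hedged sketches of the two accuracy metrics used above, under the assumption that
# "accuracy" is position-wise token overlap and "exact match" is 1.0 only when the whole
# sequences are identical; the actual get_accuracy/get_exact_match_accuracy helpers are
# not shown here and may differ. Assumes the surrounding code's typing imports (List).
def get_accuracy_sketch(target: List[str], pred: List[str]) -> float:
    if len(target) == 0:
        return 0.0
    matches = sum(1 for t, p in zip(target, pred) if t == p)
    return matches / len(target)

def get_exact_match_accuracy_sketch(target: List[str], pred: List[str]) -> float:
    return 1.0 if target == pred else 0.0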
def get_available_projects(self) -> List[str]:
    project_urls = IOUtils.load(Macros.data_dir / "projects.txt", IOUtils.Format.txt).splitlines()
    project_names = DataCollector.urls_to_names(project_urls)
    project_names_in_db = self.database.ls_projects()
    projects_not_collected = [p for p in project_names if p not in project_names_in_db]
    if len(projects_not_collected) > 0:
        self.logger.warning(f"Ignoring {len(projects_not_collected)} projects whose data is not collected.")
        IOUtils.dump(self.output_dir / "projects-not-collected.txt",
                     "".join([p + "\n" for p in projects_not_collected]),
                     IOUtils.Format.txt)
    # end if
    project_names = [p for p in project_names if p in project_names_in_db]
    return project_names
def collect_all_results(self, model: str, metrics: List[str]):
    # Mapping of eval_setting-year -> test_set -> metric -> [trials]
    all_results: Dict[str, Dict[str, Dict[str, List[Any]]]]

    # Load existing results, if any
    results_file = Macros.results_dir / "metrics" / f"results-trials-{model}.json"
    if results_file.exists():
        self.logger.info(f"Loading existing metrics from {results_file}")
        all_results = IOUtils.load(results_file)
    else:
        all_results = {}

    model_work_dir = Macros.data_dir / "models-work" / model
    for eval_setting in self.EVAL_SETTINGS:
        for year in self.YEARS:
            exp = f"{eval_setting}-{year}"
            exp_results = all_results.setdefault(exp, {})
            for test_set in [Macros.test_common, Macros.test_standard]:
                set_results = exp_results.setdefault(test_set, {})
                for trial in range(Macros.trials):
                    trial_dir = model_work_dir / exp / f"trial-{trial}"
                    cur_results_file = trial_dir / f"results_{test_set}.json"
                    if not cur_results_file.exists():
                        self.logger.warning(f"Results not found at {cur_results_file}")
                        # Set default value for set_results[mname], but don't touch existing results if any
                        for mname in metrics:
                            set_results.setdefault(mname, [None] * Macros.trials)
                    else:
                        results = IOUtils.load(cur_results_file)
                        for mname in metrics:
                            metric = results[mname]
                            set_results.setdefault(mname, [None] * Macros.trials)[trial] = metric

    # Save extracted/updated results
    IOUtils.dump(results_file, all_results, IOUtils.Format.jsonPretty)
    return
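# Illustrative shape of the nested results dictionary written above. The setting, year,
# test-set, and metric names as well as the numbers are made up; the real keys come from
# self.EVAL_SETTINGS, self.YEARS, Macros.test_common/test_standard, and `metrics`.
example_all_results = {
    "mixedprj-2020": {
        "test_common": {"bleu": [31.2, 30.8, None], "xmatch": [18.5, 18.1, None]},
        "test_standard": {"bleu": [29.4, 29.9, None], "xmatch": [17.2, 17.6, None]},
    },
}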
def prepare_configs_and_scripts(self, trials: List[int]):
    data_dir = self.work_dir / "data"
    base_config = IOUtils.load(self.base_config_file, IOUtils.Format.jsonPretty)
    for trial in trials:
        trial_dir = self.work_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)

        config = copy.copy(base_config)
        config["data_dir"] = str(data_dir)
        config["model_dir"] = str(trial_dir / "model")
        config["output"] = str(trial_dir / "output.txt")
        config_file = trial_dir / "config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        training_trace_file = trial_dir / "training-trace.json"
        train_script_file = trial_dir / f"{Macros.train}.sh"
        # The gpu-id argument is necessary for tensorflow, even if we are using CUDA_VISIBLE_DEVICES
        train_script = f"#!/bin/bash\n" \
                       f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                       f"conda activate {self.CONDA_ENV}\n" \
                       f"module load cuda/10.0 cudnn/7.6.2\n" \
                       f"cd {self.code_dir}/translate\n" \
                       f"python3 __main__.py {config_file} --train -v --train-log {training_trace_file} --gpu-id $1 &> {trial_dir}/log-{Macros.train}.txt\n"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

        for test_type in [Macros.test_common, Macros.test_standard]:
            output_file = trial_dir / f"output_{test_type}.txt"
            config["output"] = str(output_file)
            test_config_file = trial_dir / f"config_{test_type}.json"
            IOUtils.dump(test_config_file, config, IOUtils.Format.jsonPretty)

            test_script_file = trial_dir / f"{test_type}.sh"
            test_script = f"#!/bin/bash\n" \
                          f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                          f"conda activate {self.CONDA_ENV}\n" \
                          f"module load cuda/10.0 cudnn/7.6.2\n" \
                          f"cd {self.code_dir}/translate\n" \
                          f"python3 __main__.py {test_config_file} --eval {data_dir}/{test_type}/test.token.code {data_dir}/{test_type}/test.token.sbt {data_dir}/{test_type}/test.token.nl --gpu-id $1 &> {trial_dir}/log-{test_type}.txt\n" \
                          f"python3 Bleu.py {data_dir}/{test_type}/test.token.nl {trial_dir}/output_{test_type}.txt {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
    return