def suggest_lemmas(**options):
    """
    Extract lemmas from a project, run a trained model on them and print its suggestions.

    Keys read from ``options``:
      "project": path of the project to analyze (required).
      "output": directory receiving all intermediate and final files (required).
      "model-dir": directory holding the model and its "spec.json" (required).
      "files", "exclude-files", "exclude-pattern", "serapi-options": optional
        values forwarded unchanged to ``DataMiner.extract_data_project``.

    The text of "eval-result/suggestions.txt" under the output directory is
    printed to stdout at the end.
    """
    from roosterize.data.DataMiner import DataMiner
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    project_path = Path(options["project"]).absolute()
    files = Utils.get_option_as_list(options, "files", None)
    exclude_files = Utils.get_option_as_list(options, "exclude-files", None)
    exclude_pattern = options.get("exclude-pattern", None)
    serapi_options = options.get("serapi-options", "")
    output_dir = Path(options["output"]).absolute()
    model_dir = Path(options["model-dir"]).absolute()

    # Every stage communicates with the next one through a sub-directory of output_dir
    raw_data_dir = output_dir / "raw-data"
    processed_data_dir = output_dir / "eval-processed-data"
    eval_result_dir = output_dir / "eval-result"

    print(">>>>> Extracting lemmas ...")
    DataMiner.extract_data_project(
        project_path,
        files,
        exclude_files,
        exclude_pattern,
        serapi_options,
        raw_data_dir,
    )

    print(">>>>> Initializing model ...")
    spec_json = IOUtils.load(model_dir / "spec.json", IOUtils.Format.json)
    model_spec = IOUtils.dejsonfy(spec_json, ModelSpec)
    model = MLModels.get_model(model_dir, model_spec, is_eval=True)

    print(">>>>> Processing data ...")
    model.process_data(raw_data_dir, processed_data_dir)

    print(">>>>> Applying model ...")
    model.eval(processed_data_dir, eval_result_dir)

    print(">>>>> Suggestions:")
    suggestions = IOUtils.load(eval_result_dir / "suggestions.txt", IOUtils.Format.txt)
    print(suggestions)
def data_cut(self, data_size: int):
    """Cut down the dataset to data_size projects, then save the projects list to data_dir.

    Project URLs are read from "projects-github.txt" in ``Macros.data_dir``
    (if that file exists) and stored on ``self.collected_projects_list``.
    A project is kept when it has a directory under
    ``Macros.repos_results_dir`` and its "2020_Jan_1" revision entry lists at
    least one method id.  The first ``data_size`` such projects, in the order
    of the URL file, are written to "projects-github-{data_size}.json".

    A project whose revision file has no "2020_Jan_1" entry is skipped.
    """
    collected_projects_file = Macros.data_dir / "projects-github.txt"
    self.collected_projects_list = list()
    if collected_projects_file.exists():
        self.collected_projects_list += IOUtils.load(
            collected_projects_file, IOUtils.Format.txt).splitlines()

    project_name_list = []
    for project_url in self.collected_projects_list:
        user_repo = self.parse_github_url(project_url)
        project_name_list.append(f"{user_repo[0]}_{user_repo[1]}")

    # Path.name does not depend on the platform's path separator; a set gives
    # constant-time membership tests.
    unvisited_projects = {
        x.name for x in Macros.repos_results_dir.iterdir() if x.is_dir()
    }

    reduced_project_list = []
    for p in project_name_list:
        if len(reduced_project_list) >= data_size:
            # Enough projects selected, nothing further can be added
            break
        if p not in unvisited_projects:
            continue
        # Each project is examined at most once, even if its URL is listed twice
        unvisited_projects.discard(p)

        filtered_methods = IOUtils.load(
            Macros.repos_results_dir / p / "collector" / "method-project-revision.json")
        new_method_ids = next(
            (delta_data["method_ids"] for delta_data in filtered_methods
             if delta_data["year"] == "2020_Jan_1"),
            None,
        )
        if new_method_ids:
            reduced_project_list.append(p)

    IOUtils.dump(Macros.data_dir / f"projects-github-{data_size}.json",
                 reduced_project_list, IOUtils.Format.jsonNoSort)
def write_seq_len_stat(num_pa, ref_modelname):
    """
    Compute sequence-length statistics of the lhs with 0..num_pa previous assignments prepended.

    Reads "src.l.{mode}.txt" and "src.prevassign{i}.{mode}.txt" (train, val
    and test concatenated in that order) from "data/vhdl/{ref_modelname}"
    next to this file.  The first statistic covers the bare lhs; statistic
    i+1 covers the lhs after previous assignments 0..i have been prepended
    (lines equal to "<empty>" are not prepended).  The list of statistics is
    dumped to "lhs-pa-len-stat.json".
    """
    modes = ["train", "val", "test"]
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            f"data/vhdl/{ref_modelname}")

    def read_all_modes(prefix):
        # Token lists of every line of "{prefix}.{mode}.txt", all modes concatenated
        token_lines = list()
        for mode in modes:
            text = IOUtils.load(os.path.join(data_dir, f"{prefix}.{mode}.txt"),
                                IOUtils.Format.txt)
            token_lines += [line.split() for line in text.strip().splitlines()]
        return token_lines

    src_l = read_all_modes("src.l")
    stat_list = [get_seq_len_stat(src_l)]

    for i in range(num_pa):
        result_list = list()
        for j, pa in enumerate(read_all_modes(f"src.prevassign{i}")):
            if pa != ["<empty>"]:
                # Accumulates across i: src_l[j] keeps the assignments prepended so far
                src_l[j] = pa + src_l[j]
            result_list.append(src_l[j])
        stat_list.append(get_seq_len_stat(result_list))

    results_file = os.path.join("../slpproject/_results/vhdl/ALL/metrics",
                                "lhs-pa-len-stat.json")
    IOUtils.dump(results_file, stat_list, IOUtils.Format.json)
def load_data(num_pa, ref_modelname):
    """
    Load the token sequences of the train/val/test splits of one reference model.

    For each mode, line j of the result is the concatenation of
    previous assignments num_pa-1 .. 0, the lhs tokens, the token "<=" and
    the rhs tokens, read from "src.prevassign{i}.{mode}.txt",
    "src.l.{mode}.txt" and "tgt.{mode}.txt" in "data/vhdl/{ref_modelname}"
    next to this file.

    :param num_pa: number of previous-assignment files to prepend.
    :param ref_modelname: name of the sub-directory holding the data files.
    :return: mapping of mode ("train", "val", "test") -> list of token lists.
    """
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            f"data/vhdl/{ref_modelname}")

    def read_token_lines(file_name):
        # One whitespace-split token list per line of data_dir/file_name
        text = IOUtils.load(os.path.join(data_dir, file_name), IOUtils.Format.txt)
        return [line.split() for line in text.strip().splitlines()]

    src_dict = dict()
    for mode in ["train", "val", "test"]:
        src_l = read_token_lines(f"src.l.{mode}.txt")
        src_r = read_token_lines(f"tgt.{mode}.txt")
        src_seq = [l + ["<="] + r for l, r in zip(src_l, src_r)]

        for i in range(num_pa):
            src_pa = read_token_lines(f"src.prevassign{i}.{mode}.txt")
            for j, pa in enumerate(src_pa):
                # Later files end up further to the left of the sequence
                src_seq[j] = pa + src_seq[j]

        src_dict[mode] = src_seq
    return src_dict
def process_data(self, project_dir):
    """
    Beta-filter the methods newly added between consecutive yearly snapshots of a project.

    Reads "method-project-revision.json" and "method-data.json" from
    ``project_dir / "collector"``.  For every year in
    ``BetaFilter.YEARS[:-1]`` the method ids present on Jan 1 of the next
    year but not on Jan 1 of that year are passed to
    ``BetaFilter.beta_filter``; the results are written to
    "method-project-beta-filtered.json" in the same directory.

    Best effort: any ``Exception`` is logged (with the project directory and
    the error) and swallowed, so one broken project does not abort a batch.
    Interpreter-level signals such as KeyboardInterrupt are not intercepted.
    """
    try:
        output_dir = project_dir / "collector"
        revision_data = IOUtils.load(output_dir / "method-project-revision.json")
        method_data = IOUtils.load(output_dir / "method-data.json")

        # Index the snapshots once; setdefault keeps the first entry of a given time
        snapshot_by_time = {}
        for year_data in revision_data:
            snapshot_by_time.setdefault(year_data["year"], year_data)

        method_project_evo = []
        for year in BetaFilter.YEARS[:-1]:
            curr_time = f"{year}_Jan_1"
            next_time = f"{year + 1}_Jan_1"
            curr_method_ids = snapshot_by_time[curr_time]["method_ids"]
            next_method_ids = snapshot_by_time[next_time]["method_ids"]
            new_method_ids = list(set(next_method_ids) - set(curr_method_ids))
            filtered_method_ids = BetaFilter.beta_filter(
                new_method_ids, curr_method_ids, method_data)
            method_project_evo.append({
                "prj_name": revision_data[0]["prj_name"],
                "time": f"{curr_time}-{next_time}",
                "method_ids": filtered_method_ids,
            })

        IOUtils.dump(output_dir / "method-project-beta-filtered.json",
                     IOUtils.jsonfy(method_project_evo), IOUtils.Format.json)
    except Exception as e:
        self.logger.warning(
            f"Unexpected error while processing {project_dir}: {type(e).__name__}: {e}")
    return
def load_configs(self, prj_root: Optional[Path] = None, force_reload: bool = False):
    """
    Load configs into this user interface: the global config first, then the project-local one.

    :param prj_root: root of the current project; when None only the global config is loaded.
    :param force_reload: reload even if the configs of prj_root were already loaded.
    """
    same_project = prj_root is not None and prj_root == self.loaded_config_prj
    if same_project and not force_reload:
        # Configs of this project are already in place
        return

    # Nothing project-local is considered loaded until the end of this method
    self.loaded_config_prj = None

    global_config_file = RoosterizeDirUtils.get_global_config_file()
    if global_config_file.exists():
        self.set_configs_from_dict(
            IOUtils.load(global_config_file, IOUtils.Format.yaml),
            self.GLOBAL_CONFIGS,
        )

    if prj_root is None:
        return

    local_config_file = RoosterizeDirUtils.get_local_config_file(prj_root)
    if local_config_file.exists():
        self.set_configs_from_dict(
            IOUtils.load(local_config_file, IOUtils.Format.yaml),
            self.LOCAL_CONFIGS,
        )

    self.loaded_config_prj = prj_root
def process_data_impl(
        self,
        data_dir: Path,
        output_processed_data_dir: Path,
) -> None:
    """
    Serialize lemmas into one source file per input type plus one target file.

    Reads "lemmas.json" and "definitions.json" from ``data_dir``.  For every
    input type returned by ``self.get_all_inputs`` a file
    "src.{input_type}.txt" is written; "tgt.txt" holds the outputs of
    ``self.get_output``.  Each line is one lemma, tokens separated by a space.
    The parent implementation is invoked afterwards.

    The return annotation is ``None`` (not ``NoReturn``) because the method
    returns normally.
    """
    lemmas: List[Lemma] = IOUtils.dejsonfy(
        IOUtils.load(data_dir / "lemmas.json", IOUtils.Format.json),
        List[Lemma])
    definitions: List[Definition] = IOUtils.dejsonfy(
        IOUtils.load(data_dir / "definitions.json", IOUtils.Format.json),
        List[Definition])

    docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

    # Inputs
    all_inputs: Dict[str, List[List[str]]] = self.get_all_inputs(lemmas, docs_sub_tokenizers)
    for input_type, src_sentences in all_inputs.items():
        IOUtils.dump(
            output_processed_data_dir / f"src.{input_type}.txt",
            "".join([" ".join(sent) + "\n" for sent in src_sentences]),
            IOUtils.Format.txt)

    # Outputs
    IOUtils.dump(
        output_processed_data_dir / "tgt.txt",
        "".join([
            " ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n"
            for lemma in lemmas
        ]),
        IOUtils.Format.txt)

    super().process_data_impl(data_dir, output_processed_data_dir)
    return
def load_data(self,
        rel_path: Union[str, List[str]],
        fmt: IOUtils.Format,
        is_batched: bool = False,
        clz = None,
) -> Any:
    """
    Load data stored under ``self.data_dir``.

    :param rel_path: path relative to the data directory, turned into a path by ``self.assemble_rel_path``.
    :param fmt: format handed to ``IOUtils.load``.
    :param is_batched: if True, rel_path is a directory of "batch-{number}.{extension}" files
        which are loaded in increasing batch number and concatenated into one list.
    :param clz: class handed to ``IOUtils.dejsonfy`` when fmt is a json format; a warning is
        logged when fmt is a json format and clz is None.
    :return: the loaded data.
    :raises IOError: if nothing exists at the resolved path.
    """
    is_json = self.is_json_format(fmt)
    if is_json and clz is None:
        self.logger.warning(f"Load data from {rel_path} with json format, but did not specify clz (at {traceback.format_stack()})")

    abs_path = self.data_dir / self.assemble_rel_path(rel_path)
    if not abs_path.exists():
        LoggingUtils.log_and_raise(self.logger, f"Cannot find data at {abs_path}", IOError)

    needs_dejsonfy = is_json and clz is not None

    def load_one(path):
        # Load a single file and convert it to clz when applicable
        loaded = IOUtils.load(path, fmt)
        return IOUtils.dejsonfy(loaded, clz) if needs_dejsonfy else loaded

    if is_batched:
        # Batch numbers come from the "batch-{number}" file stems
        batch_numbers = [int(str(entry.stem).split("-")[1]) for entry in abs_path.iterdir()]
        batch_numbers.sort()
        merged = list()
        for batch_number in tqdm(batch_numbers):
            merged.extend(load_one(abs_path / f"batch-{batch_number}.{fmt.get_extension()}"))
        return merged

    return load_one(abs_path)
def read_data_preds(data_dir: Path, pred_file: Path, target_file: Path) -> Tuple[List[List[str]], List[List[str]], List[List[str]]]:
    """
    Read the test inputs, the predictions and the targets as token lists.

    :param data_dir: directory containing "src.l.test.txt" and the target file.
    :param pred_file: file with one prediction per line.
    :param target_file: target file, joined onto data_dir.
    :return: (inputs, preds, targets), each a list with one whitespace-split token list per line.
    """
    def read_token_lines(path) -> List[List[str]]:
        text = IOUtils.load(path, IOUtils.Format.txt)
        return [line.split() for line in text.splitlines()]

    inputs = read_token_lines(data_dir / "src.l.test.txt")  # input lhs
    preds = read_token_lines(pred_file)  # predicted rhs
    targets = read_token_lines(data_dir / target_file)  # target rhs
    return inputs, preds, targets
def clean_comgen_data(**options):
    """
    Clean the method data of every project for the comment generation task.

    Keys read from ``options``:
      "config": name of the filter config file inside ``Macros.config_dir``.
      "proj_file": project list file; defaults to "projects-github-CG-100.json" in ``Macros.data_dir``.

    For each project, every method's code and comment summary go through
    ``DataFilter.data_filter``; methods for which either comes back empty are
    dropped.  Three files are written to the project's "collector" directory:
    "clean-method-idx.json" (ids of the kept methods),
    "method-project-CG-filtered.json" and "method-project-CG-revision.json"
    (the alpha-filtered and revision data restricted to the kept ids).
    """
    from csevo.filter.DataFilter import DataFilter

    config_file = Macros.config_dir / options.get("config")
    df = DataFilter(config_file)
    project_file = options.get("proj_file", Macros.data_dir / "projects-github-CG-100.json")
    projects = IOUtils.load(project_file)

    for proj in tqdm(projects):
        collector_dir = Macros.repos_results_dir / proj / "collector"

        # Data filtering and cleaning; the records are updated in place
        clean_method_id_list = list()
        for method in IOUtils.load(collector_dir / "method-data.json"):
            method["code"], method["comment_summary"] = df.data_filter(
                method["code"], method["comment_summary"])
            if method["code"] != "" and method["comment_summary"] != "":
                clean_method_id_list.append(method["id"])

        # Clean method index for the comment generation task
        IOUtils.dump(collector_dir / "clean-method-idx.json",
                     clean_method_id_list, IOUtils.Format.jsonNoSort)

        # Restrict the alpha-filtered data and the revision data to the clean ids
        restricted_outputs = [
            ("method-project-alpha-filtered.json", "method-project-CG-filtered.json"),
            ("method-project-revision.json", "method-project-CG-revision.json"),
        ]
        for in_name, out_name in restricted_outputs:
            records = IOUtils.load(collector_dir / in_name)
            for record in records:
                kept_ids = set(record["method_ids"]).intersection(clean_method_id_list)
                record["method_ids"] = list(kept_ids)
            IOUtils.dump(collector_dir / out_name, records, IOUtils.Format.jsonNoSort)
def load_config(self) -> None:
    """
    Merge the content of ``self.config_file`` into ``self.config_dict``.

    The return annotation is ``None`` (not ``NoReturn``) because the method
    returns normally on success.

    :raises ValueError: if no config file has been set.
    """
    if self.config_file is None:
        raise ValueError("Config file not set!")

    self.config_dict.update(IOUtils.load(self.config_file, IOUtils.Format.jsonPretty))
    return
def get_model_results_all_trials(cls, model: str) -> Dict[str, Dict[str, list]]:
    """
    Gets the model's results, on each exp, of each metric, on test_common set, combining all trials.

    Trials recorded as None are skipped.  Individual values that are NaN
    (a float NaN, or the string "NaN") are dropped.  NaN never compares equal
    to anything, itself included, so it has to be detected with
    ``math.isnan`` rather than with ``!=``.

    Returns: mapping of exp -> (mapping of metric -> list of results)
    """
    import math

    def is_nan(value) -> bool:
        if isinstance(value, str):
            return value == "NaN"
        return isinstance(value, float) and math.isnan(value)

    results = IOUtils.load(Macros.results_dir / "metrics" / f"results-trials-{model}.json")

    results_all_trials = dict()
    for exp, exp_results in results.items():
        exp_results_all_trials = dict()
        for test_set, set_results in exp_results.items():
            # Only use test_common set
            if test_set != Macros.test_common:
                continue

            for metric, trials_results in set_results.items():
                # Merge the results from all trials
                metric_results_all_trials = list()
                for trial_results in trials_results:
                    if trial_results is None:
                        continue
                    metric_results_all_trials += [n for n in trial_results if not is_nan(n)]
                exp_results_all_trials[metric] = metric_results_all_trials

        results_all_trials[exp] = exp_results_all_trials
    return results_all_trials
def test_format_yaml(self):
    """
    Tests for IOUtils.Format.yaml

    Each object is dumped to a file inside a private temporary directory,
    the file text is compared with the expected yaml, and the object is
    loaded back.  The directory is removed even when an assertion fails.
    """
    objs = [
        42.001,
        "aaa",
        [13, "24", 56.7],
        {
            "name": "K",
            "job": "Y"
        },
    ]
    exp_strs = [
        "42.001\n...\n",
        "aaa\n...\n",
        "- 13\n- '24'\n- 56.7\n",
        "job: Y\nname: K\n",  # dictionary are forced to be sorted
    ]

    # tempfile.mktemp only returns a name (deprecated, race-prone); a temporary
    # directory owned by this test avoids that and guarantees cleanup.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for i, (obj, exp_str) in enumerate(zip(objs, exp_strs)):
            path = Path(tmp_dir) / f"obj-{i}"

            # Test dump
            IOUtils.dump(path, obj, IOUtils.Format.yaml)
            self.assertEqual(exp_str, self.load_plain(path))

            # Test load
            loaded = IOUtils.load(path, IOUtils.Format.yaml)
            self.assertEqual(obj, loaded)
def make_numbers_timewise_filtered_dataset_metrics(self, dataset: str = "large", filter: str = "beta"):
    """
    Write the "num-methods" entries of the time-wise filtered dataset stats as latex macros.

    Reads "time-wise-{filter}-filtered-{dataset}-dataset-stats.json" from the
    metrics directory and saves
    "numbers-time-wise-{filter}-filtered-{dataset}-dataset-metrics.tex" in
    ``self.tables_dir``, with one macro "{dataset}-{filter}-{t}-num-methods"
    per top-level key t of the stats.
    """
    tex_path = self.tables_dir / f"numbers-time-wise-{filter}-filtered-{dataset}-dataset-metrics.tex"
    file = latex.File(tex_path)

    stats_path = Macros.results_dir / "metrics" / f"time-wise-{filter}-filtered-{dataset}-dataset-stats.json"
    metrics = IOUtils.load(stats_path, IOUtils.Format.json)

    for period, period_stats in metrics.items():
        for key, value in period_stats.items():
            if key != "num-methods":
                # TODO: change back -- the other stats used to be emitted too,
                # formatted with "{:.1f}".format(v)
                continue
            file.append_macro(
                latex.Macro(f"{dataset}-{filter}-{period}-{key}", f"{value}"))

    file.save()
def parse_file(self, file_path: Path, prj_root: Path, serapi_options: str):
    """
    Parse one file with sercomp/sertok and collect its lemmas and definitions.

    :param file_path: the file to parse; must be located under prj_root.
    :param prj_root: project root; the external commands are run from this directory.
    :param serapi_options: options inserted verbatim into the sercomp and sertok command lines.
    :return: a ProcessedFile bundling the source code, the parsed document, both
        s-expression lists, the unicode offsets, the lemmas and the definitions.
    """
    source_code = IOUtils.load(file_path, IOUtils.Format.txt)
    unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

    with IOUtils.cd(prj_root):
        rel_path = file_path.relative_to(prj_root)

        # NOTE(review): rel_path is interpolated unquoted into the command lines -- confirm
        # that project paths never contain spaces or shell metacharacters
        sercomp_cmd = f"sercomp {serapi_options} --mode=sexp -- {rel_path}"
        ast_sexp_str = BashUtils.run(sercomp_cmd, expected_return_code=0).stdout
        sertok_cmd = f"sertok {serapi_options} -- {rel_path}"
        tok_sexp_str = BashUtils.run(sertok_cmd, expected_return_code=0).stdout

        ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
        tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

        doc = CoqParser.parse_document(
            source_code,
            ast_sexp_list,
            tok_sexp_list,
            unicode_offsets=unicode_offsets,
        )
        doc.file_name = str(rel_path)

        # Lemmas first, then definitions
        lemmas: List[Lemma] = DataMiner.collect_lemmas_doc(doc, ast_sexp_list, serapi_options)
        definitions: List[Definition] = DataMiner.collect_definitions_doc(doc, ast_sexp_list)

    return ProcessedFile(
        file_path,
        source_code,
        doc,
        ast_sexp_list,
        tok_sexp_list,
        unicode_offsets,
        lemmas,
        definitions,
    )
def collect_data(cls, **options) -> None:
    """
    Entry point dispatching one data collection task.

    Keys read from ``options``:
      "task": one of the ``cls.TASK_*`` constants (required).
      "corpus": path of the projects file; defaults to "projects-standalone-8.10.yml" in ``cls.dataset_dir``.
      "files": optional list of files, used by the coq-documents and lemma tasks.
      "verify-tokenizer": boolean, used by the coq-documents task.

    The return annotation is ``None`` (not ``NoReturn``) because the method
    returns normally after the task finishes.

    Raises ValueError (through ``LoggingUtils.log_and_raise``) for an unknown task.
    """
    data_mgr = FilesManager(cls.dataset_dir)
    task = options["task"]

    # NOTE(review): the default corpus file has a ".yml" extension but is loaded with the
    # "json" format -- confirm this is intended
    projects_path = Path(options.get("corpus", cls.dataset_dir / "projects-standalone-8.10.yml"))
    projects: List[Project] = IOUtils.dejsonfy(IOUtils.load(projects_path, "json"), Project)

    if task == cls.TASK_COQ_DOCUMENTS:
        files = Utils.get_option_as_list(options, "files", None)
        is_verifying_tokenizer = Utils.get_option_as_boolean(options, "verify-tokenizer")
        cls.collect_coq_documents_projects(data_mgr, projects, files, is_verifying_tokenizer)
    elif task == cls.TASK_DATA_INDEXES:
        cls.collect_data_indexes(data_mgr, projects)
    elif task == cls.TASK_DEFINITIONS:
        cls.collect_definitions(data_mgr)
    elif task == cls.TASK_INSTALL_COQ_PROJECTS:
        cls.install_coq_projects(projects)
    elif task == cls.TASK_LEMMA:
        files = Utils.get_option_as_list(options, "files", None)
        cls.collect_lemmas(data_mgr, projects, files)
    elif task == cls.TASK_LEMMA_BACKEND_SEXP_TRANSFORMATIONS:
        cls.collect_lemmas_backend_sexp_transformations(data_mgr)
    elif task == cls.TASK_LEMMA_FILTERED:
        cls.filter_lemmas(data_mgr)
    elif task == cls.TASK_LEMMA_FOREEND_SEXP_TRANSFORMATIONS:
        cls.collect_lemmas_foreend_sexp_transformations(data_mgr)
    else:
        LoggingUtils.log_and_raise(cls.logger, f"Unknown task {task}", ValueError)
    return
def process_data_impl(self,
        data_dir: Path,
        output_processed_data_dir: Path,
) -> None:
    """
    Serialize lemmas into a source file and a target file.

    Reads "lemmas.json" and "definitions.json" from ``data_dir`` and writes
    "src.txt" (from ``self.get_input``) and "tgt.txt" (from
    ``self.get_output``) to ``output_processed_data_dir``.  Each line is one
    lemma, tokens separated by a space.

    The return annotation is ``None`` (not ``NoReturn``) because the method
    returns normally.
    """
    lemmas: List[Lemma] = IOUtils.dejsonfy(IOUtils.load(data_dir/"lemmas.json", IOUtils.Format.json), List[Lemma])
    definitions: List[Definition] = IOUtils.dejsonfy(IOUtils.load(data_dir/"definitions.json", IOUtils.Format.json), List[Definition])

    docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

    # Put data in serialized files: all inputs first, then all outputs
    src_text = "".join([" ".join(self.get_input(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas])
    IOUtils.dump(output_processed_data_dir/"src.txt", src_text, IOUtils.Format.txt)

    tgt_text = "".join([" ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas])
    IOUtils.dump(output_processed_data_dir/"tgt.txt", tgt_text, IOUtils.Format.txt)
    return
def eval_impl(self,
        processed_data_dir: Path,
        model_dir: Path,
        beam_search_size: int,
        k: int
) -> List[List[Tuple[str, float]]]:
    """
    Translate the processed source file with the checkpoint recorded as best step.

    :param processed_data_dir: directory containing "src.txt" and "tgt.txt".
    :param model_dir: directory containing "best-step.json" and "models/ckpt_step_{best_step}.pt";
        the translator output file is set to "last-pred.txt" in this directory.
    :param beam_search_size: assigned to the translation option beam_size.
    :param k: assigned to the translation option n_best.
    :return: the third value returned by the translator for every shard, concatenated,
        with the tokens of each candidate joined by the empty string.
    """
    # Imported here rather than at module level
    from roosterize.ml.onmt.CustomTranslator import CustomTranslator
    from onmt.utils.misc import split_corpus
    from onmt.utils.parse import ArgumentParser
    from translate import _get_parser as translate_get_parser

    src_path = processed_data_dir/"src.txt"
    tgt_path = processed_data_dir/"tgt.txt"

    best_step = IOUtils.load(model_dir/"best-step.json", IOUtils.Format.json)
    self.logger.info(f"Taking best step at {best_step}")

    candidates_logprobs: List[List[Tuple[List[str], float]]] = list()

    # The translator is set up and run from within self.open_nmt_path
    with IOUtils.cd(self.open_nmt_path):
        parser = translate_get_parser()
        # The options are passed as one command-line-like string
        # NOTE(review): presumably the parser accepts a plain string here -- confirm
        opt = parser.parse_args(
            f" -model {model_dir}/models/ckpt_step_{best_step}.pt"
            f" -src {src_path}"
            f" -tgt {tgt_path}"
        )
        opt.output = f"{model_dir}/last-pred.txt"
        opt.beam_size = beam_search_size
        # Device 0 when CUDA is available, otherwise -1
        opt.gpu = 0 if torch.cuda.is_available() else -1
        opt.n_best = k
        opt.block_ngram_repeat = 1
        opt.ignore_when_blocking = ["_"]

        # translate.main
        ArgumentParser.validate_translate_opts(opt)

        translator = CustomTranslator.build_translator(opt, report_score=False)
        src_shards = split_corpus(opt.src, opt.shard_size)
        # Without a target file, every source shard is paired with None
        tgt_shards = split_corpus(opt.tgt, opt.shard_size) if opt.tgt is not None else repeat(None)
        shard_pairs = zip(src_shards, tgt_shards)

        for i, (src_shard, tgt_shard) in enumerate(shard_pairs):
            self.logger.info("Translating shard %d." % i)
            _, _, candidates_logprobs_shard = translator.translate(
                src=src_shard,
                tgt=tgt_shard,
                src_dir=opt.src_dir,
                batch_size=opt.batch_size,
                attn_debug=opt.attn_debug
            )
            candidates_logprobs.extend(candidates_logprobs_shard)
        # end for
    # end with

    # Reformat candidates: join the tokens of each candidate into one string
    candidates_logprobs: List[List[Tuple[str, float]]] = [[("".join(c), l) for c, l in cl] for cl in candidates_logprobs]
    return candidates_logprobs
def get_baseline_preds(ref_modelname, data_mode):
    """
    Read the baseline predictions, i.e. the rhs of the previous assignment of each example.

    Every line of "src.prevassign.{data_mode}.txt" is stripped; a line equal
    to "<empty>" is kept as is, any other line is reduced to the text between
    the first "<= " and the following ";".

    :return: one prediction string per line of the file.
    """
    filename = f"{DATA_DIR}/{ref_modelname}/src.prevassign.{data_mode}.txt"
    raw_lines = IOUtils.load(filename, IOUtils.Format.txt).splitlines()

    def extract_rhs(assignment: str) -> str:
        if assignment == "<empty>":
            return assignment
        after_arrow = assignment.split("<= ")[1]
        return after_arrow.split(";")[0].strip()

    return [extract_rhs(raw_line.strip()) for raw_line in raw_lines]
def convert_json2txt(config_dict, data_types: List[str] = None):
    """
    Convert the intermediate json data into one text file per field.

    For every data type (default: "train", "val", "test") this writes, under
    ``DATADIR / config_dict["save_dir"]``:
      "src.{src_type}.{data_type}.txt" for every entry of config_dict["src_types"]
        ("l", "type", "prevassign" or "patype"); empty values become "<pad>"
        for type fields and "<empty>" otherwise;
      "src.fn.{data_type}.txt" with the "file_sha" of every example;
      "tgt.{data_type}.txt" with the "r" of every example, minus its first 3
        and last 2 characters.

    :raises ValueError: on an unknown src_type.
    """
    if data_types is None:
        data_types = ["train", "val", "test"]

    for data_type in data_types:
        json_path = os.path.join(DATADIR, config_dict["intermediate_data_dir"], f"{data_type}.json")
        data_list = IOUtils.load(json_path, IOUtils.Format.json)

        for src_type in config_dict["src_types"]:
            output_path = os.path.join(DATADIR, config_dict["save_dir"], f"src.{src_type}.{data_type}.txt")

            pa_i = int(config_dict["augment"])
            field_by_src_type = {
                "l": "l",
                "type": "l-type",
                "prevassign": f"pa{pa_i}",
                "patype": f"pa{pa_i}-type",
            }
            if src_type not in field_by_src_type:
                raise ValueError(f"Unknown src_type {src_type}")
            field = field_by_src_type[src_type]

            # Placeholder line written for examples whose field is empty
            placeholder = "<pad>\n" if field.endswith("-type") else "<empty>\n"

            with open(output_path, "w") as f:
                for data in data_list:
                    value = data[field]
                    f.write(placeholder if len(value) == 0 else value + "\n")

        fn_output_path = os.path.join(DATADIR, config_dict["save_dir"], f"src.fn.{data_type}.txt")
        fn_text = "".join([data["file_sha"] + "\n" for data in data_list])
        IOUtils.dump(fn_output_path, fn_text, IOUtils.Format.txt)

        tgt_output_path = os.path.join(DATADIR, config_dict["save_dir"], f"tgt.{data_type}.txt")
        # [3:-2]: remove prefix "<= " and suffix " ;"
        tgt_text = "".join([data["r"][3:-2] + "\n" for data in data_list])
        IOUtils.dump(tgt_output_path, tgt_text, IOUtils.Format.txt)

    print("Conversion into txt is done.")
def collect_all_results(self, model: str, metrics: List[str]):
    """
    Collect the per-trial values of the given metrics into one results file for the model.

    The results file "results-trials-{model}.json" in the metrics directory maps
        exp ("{eval_setting}-{year}") -> test_set -> metric -> [one value per trial]
    Existing content of that file is loaded first and only updated: a trial
    whose "results_{test_set}.json" is missing keeps its previous value (None
    if there was none), and a warning is logged.

    :param model: name of the model; also the name of its directory under "models-work".
    :param metrics: names of the metrics to extract from every trial's results file.
    """
    results_file = Macros.results_dir / "metrics" / f"results-trials-{model}.json"

    # Load existing results, if any
    all_results: Dict[str, Dict[str, Dict[str, list]]]
    if results_file.exists():
        self.logger.info(f"Loading existing metrics from {results_file}")
        all_results = IOUtils.load(results_file)
    else:
        all_results = {}

    model_work_dir = Macros.data_dir / "models-work" / model

    for eval_setting in self.EVAL_SETTINGS:
        for year in self.YEARS:
            exp = f"{eval_setting}-{year}"
            exp_results = all_results.setdefault(exp, {})
            for test_set in [Macros.test_common, Macros.test_standard]:
                set_results = exp_results.setdefault(test_set, {})
                for trial in range(Macros.trials):
                    trial_dir = model_work_dir / exp / f"trial-{trial}"
                    cur_results_file = trial_dir / f"results_{test_set}.json"

                    if not cur_results_file.exists():
                        self.logger.warning(f"Results not found at {cur_results_file}")
                        # Set default value for set_results[mname], but don't touch existing results if any
                        for mname in metrics:
                            set_results.setdefault(mname, [None] * Macros.trials)
                        continue

                    results = IOUtils.load(cur_results_file)
                    for mname in metrics:
                        metric = results[mname]
                        set_results.setdefault(mname, [None] * Macros.trials)[trial] = metric

    # Save extracted/updated results
    IOUtils.dump(results_file, all_results, IOUtils.Format.jsonPretty)
    return
def load(cls, path: Path) -> "Vocabulary":
    """
    Rebuild a Vocabulary from its json file.

    The pad and unk words passed to the constructor are looked up in
    "index_to_word" under the pad and unk indexes.  Two fields are converted
    after loading: the keys of "index_to_word" go back to int (json keys can
    only be strings) and "counter" becomes a collections.Counter again.
    """
    raw = IOUtils.load(path, IOUtils.Format.json)
    raw_index_to_word = raw["index_to_word"]

    vocab = Vocabulary(
        raw_index_to_word[str(VocabularyConsts.PAD_INDEX)],
        raw_index_to_word[str(VocabularyConsts.UNK_INDEX)],
    )
    vocab.word_to_index = raw["word_to_index"]
    vocab.index_to_word = {int(index): word for index, word in raw_index_to_word.items()}
    vocab.next_index = raw["next_index"]
    vocab.counter = collections.Counter(raw["counter"])
    return vocab
def load_data_list(cls, assignments_path: Path) -> List[Dict[str, List[str]]]:
    """
    Load the assignments file and flatten it into one record per assignment.

    The file/entity structure is removed.  Every record holds: "file_sha"
    (a singleton list), "l", "l-type" (a singleton list), "l-type-each-token",
    "l-raw-type", "r", and for i in 1..Macros.MAX_PA_IN_MODEL the keys "pa{i}"
    and "pa{i}-type", where pa1 is the last previous assignment, pa2 the one
    before it, and so on (an empty list when there is none).
    """
    assignments = IOUtils.load(assignments_path, IOUtils.Format.json)

    data_list: List[Dict[str, List[str]]] = list()
    for file_record in assignments:
        # "fn" currently is: "{sha}.asg, {sha}.typ"; take the second name without its 4-char suffix
        file_sha = file_record["fn"].split()[1][:-4]

        for entity in file_record["entity"]:
            var_types = entity["type"]
            var_raw_types = entity["raw_type"]

            for assignment in entity["agn"]:
                lhs = assignment["l"]
                data = {
                    # a singleton list rather than string, to be consistent with other fields
                    "file_sha": [file_sha],
                    "l": lhs,
                    # One type for entire lhs
                    "l-type": [cls.get_one_type_token(lhs, var_types)],
                    # Type for each token in lhs, used by the concat model
                    "l-type-each-token": cls.get_type_tokens(lhs, var_types),
                    "l-raw-type": cls.get_raw_type_tokens(lhs, var_raw_types),
                    "r": assignment["r"],
                }

                pas = assignment["prevassign"]
                # Hack: [[""]] is actually fully empty
                if len(pas) == 1 and len(pas[0]) == 1 and len(pas[0][0]) == 0:
                    pas = []

                for rank in range(1, Macros.MAX_PA_IN_MODEL + 1):
                    pa_key = f"pa{rank}"
                    if rank <= len(pas):
                        # Hack: remove empty token ("") in pa
                        data[pa_key] = [t for t in pas[-rank] if t != ""]
                    else:
                        data[pa_key] = []
                    data[f"{pa_key}-type"] = cls.get_type_tokens(data[pa_key], var_types)

                data_list.append(data)

    return data_list
def install_coq_project(cls, project: Project, names_projects: Dict[str, Project]) -> None:
    """
    Build and install a Coq project after recursively installing its dependencies.

    If the project is not cloned yet, it is cloned and force-checked-out to
    project.data["sha"].  A confirmation file "lpc-installed.txt" in the
    checkout directory records the project revision together with the output
    of "opam list coq -s"; when its content matches, the project is
    considered installed and nothing is done.

    :param project: the project to install.
    :param names_projects: mapping of dependency name -> project, used to resolve
        the names listed in project.data["dependencies"].
    :raises Exception: if a dependency is unknown, if "build_cmd" or "install_cmd"
        is missing from project.data, or if either command has a non-zero return code.
    """
    if not project.is_cloned:
        project.clone()
        project.checkout(project.data["sha"], is_forced=True)
    # end if

    # Check if the project is already compiled
    confirmation_file = "lpc-installed.txt"
    confirmation_content = project.revision + " " + BashUtils.run("opam list coq -s", expected_return_code=0).stdout.strip()
    if (project.checkout_dir/confirmation_file).is_file() and IOUtils.load(project.checkout_dir/confirmation_file, "txt") == confirmation_content:
        cls.logger.debug(f"Project {project.full_name} already installed")
        return
    # end if

    project.clean()

    # Install dependencies (recursively, before building this project)
    for dependency in project.data.get("dependencies", []):
        dependency_project = names_projects.get(dependency)
        if dependency_project is None:  raise Exception(f"Cannot find dependency {dependency}")
        cls.logger.info(f"For Project {project.full_name}, installing dependency {dependency}")
        cls.install_coq_project(dependency_project, names_projects)
    # end for

    if "build_cmd" not in project.data:  raise Exception(f"Project {project.full_name} does not have build_cmd")
    if "install_cmd" not in project.data:  raise Exception(f"Project {project.full_name} does not have install_cmd")

    # Both commands are run from within the checkout directory
    with IOUtils.cd(project.checkout_dir):
        # Build
        cls.logger.info(f"Project {project.full_name}: Building with {project.data['build_cmd']}")
        r = BashUtils.run(project.data["build_cmd"])
        if r.return_code != 0:
            raise Exception(f"Compilation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
        else:
            cls.logger.debug(f"Compilation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
        # end if

        # Install
        cls.logger.info(f"Project {project.full_name}: Installing with {project.data['install_cmd']}")
        r = BashUtils.run(project.data["install_cmd"])
        if r.return_code != 0:
            raise Exception(f"Installation failed! Return code is {r.return_code}! stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
        else:
            cls.logger.debug(f"Installation finished. Return code is {r.return_code}. stdout:\n{r.stdout}\n; stderr:\n{r.stderr}")
        # end if

        # Written only after both steps succeeded; checked at the top of this method
        IOUtils.dump(project.checkout_dir / confirmation_file, confirmation_content, "txt")
    # end with
    return
def make_numbers_dataset_metrics(self):
    """
    Write the dataset metrics of every task as latex macros.

    For each task in ``Macros.tasks``, "numbers-{task}-dataset-metrics.tex" is
    saved in ``self.tables_dir``.  It holds the macros "ds-{task}-{k}" taken
    from "{task}-dataset.json" followed by the macros "raw-ds-{task}-{k}"
    taken from "{task}-raw-dataset.json".  Values of type int are formatted
    with ",d", all other values with ",.2f".
    """
    # (macro name prefix, metrics file name suffix), in emission order
    sources = (("ds", "dataset"), ("raw-ds", "raw-dataset"))

    for task in Macros.tasks:
        file = latex.File(self.tables_dir / f"numbers-{task}-dataset-metrics.tex")

        for macro_prefix, file_suffix in sources:
            metrics_path = Macros.results_dir / "metrics" / f"{task}-{file_suffix}.json"
            loaded_metrics = IOUtils.load(metrics_path, IOUtils.Format.json)
            for key, value in loaded_metrics.items():
                if type(value) is int:
                    fmt = ",d"
                else:
                    fmt = ",.2f"
                file.append_macro(latex.Macro(f"{macro_prefix}-{task}-{key}", format(value, fmt)))

        file.save()
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Generate the config files and the train/test shell scripts of every trial.

    For each trial, the directory "trial-{trial}" is created under
    self.work_dir and receives:
      "config.json": the base config with "data_dir", "model_dir" and "output" overridden;
      "{Macros.train}.sh": executable training script;
      for each test type (Macros.test_common, Macros.test_standard):
        "config_{test_type}.json" (same config, with "output" pointing to
        "output_{test_type}.txt") and the executable script "{test_type}.sh",
        which runs the evaluation and then Bleu.py.

    Every script takes the gpu id as its first argument ($1).

    :param trials: the trial numbers to prepare.
    """
    data_dir = self.work_dir / "data"
    base_config = IOUtils.load(self.base_config_file, IOUtils.Format.jsonPretty)

    for trial in trials:
        trial_dir = self.work_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)

        # Shallow copy: only top-level keys are overridden below
        config = copy.copy(base_config)
        config["data_dir"] = str(data_dir)
        config["model_dir"] = str(trial_dir / "model")
        config["output"] = str(trial_dir / "output.txt")
        config_file = trial_dir / "config.json"
        IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

        training_trace_file = trial_dir / "training-trace.json"
        train_script_file = trial_dir / f"{Macros.train}.sh"
        # The gpu-id argument is necessary for tensorflow, even if we are using CUDA_VISIBLE_DEVICES
        train_script = f"#!/bin/bash\n" \
                       f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                       f"conda activate {self.CONDA_ENV}\n" \
                       f"module load cuda/10.0 cudnn/7.6.2\n" \
                       f"cd {self.code_dir}/translate\n" \
                       f"python3 __main__.py {config_file} --train -v --train-log {training_trace_file} --gpu-id $1 &> {trial_dir}/log-{Macros.train}.txt\n"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

        for test_type in [Macros.test_common, Macros.test_standard]:
            # The same config object is reused; only its "output" changes per test type
            output_file = trial_dir / f"output_{test_type}.txt"
            config["output"] = str(output_file)
            test_config_file = trial_dir / f"config_{test_type}.json"
            IOUtils.dump(test_config_file, config, IOUtils.Format.jsonPretty)

            test_script_file = trial_dir / f"{test_type}.sh"
            test_script = f"#!/bin/bash\n" \
                          f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                          f"conda activate {self.CONDA_ENV}\n" \
                          f"module load cuda/10.0 cudnn/7.6.2\n" \
                          f"cd {self.code_dir}/translate\n" \
                          f"python3 __main__.py {test_config_file} --eval {data_dir}/{test_type}/test.token.code {data_dir}/{test_type}/test.token.sbt {data_dir}/{test_type}/test.token.nl --gpu-id $1 &> {trial_dir}/log-{test_type}.txt\n" \
                          f"python3 Bleu.py {data_dir}/{test_type}/test.token.nl {trial_dir}/output_{test_type}.txt {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)

    return
def parse_projects(cls, project_list_file):
    """
    Turn the project list file provided by DeepCom into a file of github urls.

    Every line of the list is split at its first "_" into two parts, which
    become "https://github.com/{first}/{second}.git".  The urls are written,
    one per line, to "DeepCom-projects-github.txt" in ``Macros.data_dir``.
    Nothing is returned.
    """
    project_lines = IOUtils.load(project_list_file, IOUtils.Format.txt).splitlines()

    url_lines = list()
    for project_line in project_lines:
        name_parts = project_line.split("_", 1)
        url_lines.append(f"https://github.com/{name_parts[0]}/{name_parts[1]}.git" + "\n")

    IOUtils.dump(Macros.data_dir / "DeepCom-projects-github.txt",
                 "".join(url_lines), IOUtils.Format.txt)
def process_data_concurrent(self, proj_list: Path):
    """
    Process data concurrently.

    Loads the project names from ``proj_list`` and runs ``self.process_data``
    on ``Macros.repos_results_dir / proj`` for each of them, using a pool of
    8 threads.  A progress bar advances as the tasks complete.

    :param proj_list: file holding the list of project names.
    """
    projects = IOUtils.load(proj_list)

    with ThreadPoolExecutor(8) as executor:
        futures = [
            executor.submit(self.process_data, Macros.repos_results_dir / proj)
            for proj in projects
        ]
        # Draining as_completed only drives the progress bar; the futures'
        # results are not inspected.
        for _ in tqdm(as_completed(futures), total=len(projects)):
            pass
def make_numbers_timewise_dataset_metrics(self, dataset: str = "large"):
    """
    Write every entry of the time-wise dataset stats as a latex macro.

    Reads "time-wise-{dataset}-dataset-stats.json" from the metrics directory
    and saves "numbers-time-wise-{dataset}-dataset-metrics.tex" in
    ``self.tables_dir``, with one macro "{dataset}-{t}-{k}" per entry k of
    every top-level key t.
    """
    tex_path = self.tables_dir / f"numbers-time-wise-{dataset}-dataset-metrics.tex"
    file = latex.File(tex_path)

    stats_path = Macros.results_dir / "metrics" / f"time-wise-{dataset}-dataset-stats.json"
    metrics = IOUtils.load(stats_path, IOUtils.Format.json)

    for period, period_stats in metrics.items():
        for key, value in period_stats.items():
            file.append_macro(latex.Macro(f"{dataset}-{period}-{key}", f"{value}"))

    file.save()
def load_local_model(self, prj_root: Path) -> None:
    """
    Load the project-local model into ``self.model`` when possible.

    Nothing happens if a model is already loaded or if the local model
    directory of prj_root does not exist.
    """
    if self.model is not None:
        return

    local_model_dir = RoosterizeDirUtils.get_local_model_dir(prj_root)
    if not local_model_dir.is_dir():
        return

    spec_json = IOUtils.load(local_model_dir / "spec.json", IOUtils.Format.json)
    model_spec = IOUtils.dejsonfy(spec_json, ModelSpec)
    self.model = MLModels.get_model(local_model_dir, model_spec, is_eval=True)