def get_cur_cluster(cls) -> str:
    """
    Detects which TACC cluster the current machine belongs to.

    The decision is based on the suffix of the output of the ``hostname``
    command.

    :return: ``cls.stampede2`` or ``cls.maverick2``; when the host matches
        neither cluster a warning is logged and ``cls.maverick2`` is returned
        as the fallback.
    """
    host = BashUtils.run("hostname").stdout.strip()

    if host.endswith("stampede2.tacc.utexas.edu"):
        return cls.stampede2

    # Everything else maps to maverick2; only warn when it is a fallback.
    if not host.endswith("maverick2.tacc.utexas.edu"):
        cls.logger.warning("Currently not on TACC")
    return cls.maverick2
def test_propagate_env(self):
    """
    Checks that a variable exported inside the command shows up in
    ``os.environ`` after ``BashUtils.run(..., is_update_env=True)``.
    """
    key, value = self.TEST_ENV_A_KEY, self.TEST_ENV_A_VALUE

    # Start from a state where the variable is not defined in this process.
    del os.environ[key]
    self.assertNotIn(key, os.environ)

    command = f"export {key}={value}; echo -n ${key}"
    result = BashUtils.run(command, is_update_env=True)

    self.assertEqual(value, result.stdout)
    self.assertEqual(value, os.environ[key])
def require_special_repo(cls, directory: Path, branch: str):
    """
    Makes sure ``directory`` holds an up-to-date checkout of the repository
    returned by ``cls.get_git_url()``.

    * If ``directory`` does not exist, it is created and ``branch`` is cloned
      into it (single-branch clone).
    * If it exists and contains a ``.git`` directory, ``git pull`` is run in it.
    * If it exists but is not such a directory, the error is reported through
      ``LoggingUtils.log_and_raise`` with ``Exception``.

    :param directory: where the repository should live.
    :param branch: the branch to clone.
    """
    cls.logger.info(f"Updating {directory} to {branch} branch")

    if not directory.exists():
        # Fresh clone into the newly created directory.
        IOUtils.mk_dir(directory)
        with IOUtils.cd(directory):
            BashUtils.run(
                f"git clone --single-branch -b {branch} -- {cls.get_git_url()} .",
                expected_return_code=0)
        return

    looks_like_repo = directory.is_dir() and (directory / ".git").is_dir()
    if not looks_like_repo:
        LoggingUtils.log_and_raise(
            cls.logger,
            f"Path {directory} already exists but is not a proper git repository!",
            Exception)

    with IOUtils.cd(directory):
        BashUtils.run("git pull", expected_return_code=0)
def tacc_submit_jobs(cls,
        submit_script: Path,
        titles: List[str],
        scripts: List[Path],
        timeouts: List[str],
        output_dir: Path,
        submit_cd: int = 600,
        max_jobs: int = 4):
    """
    Submits the scripts one by one, waiting whenever the job limit is reached
    or a submission fails.

    For each job, ``submit_script`` is invoked with the job's title, the output
    directory, the script and the timeout as quoted arguments.  A job is only
    considered submitted when that command returns 0; otherwise the same job
    is retried after ``submit_cd`` seconds.  A keyboard interrupt during
    submission stops the whole loop.

    :param submit_script: the executable used to submit one job.
    :param titles: job titles, indexed in parallel with ``scripts``.
    :param scripts: the scripts to submit.
    :param timeouts: job timeouts, indexed in parallel with ``scripts``.
    :param output_dir: the output directory passed to ``submit_script``.
    :param submit_cd: cool-down in seconds before retrying.
    :param max_jobs: no submission happens while ``cls.tacc_get_num_jobs()``
        is at or above this number.
    """

    def retry_time() -> str:
        # %z prints the real UTC offset of the local time; the previous
        # hard-coded "+0000" mislabeled local time as UTC.
        return time.strftime("%a, %d %b %Y %H:%M:%S %z", time.localtime(time.time() + submit_cd))

    job_i = 0
    while job_i < len(scripts):
        if cls.tacc_get_num_jobs() >= max_jobs:
            cls.logger.warning(
                f"Number of running jobs reach limit {max_jobs}, will retry after {submit_cd} seconds at {retry_time()}")
            time.sleep(submit_cd)
            continue
        # end if

        title = titles[job_i]
        script = scripts[job_i]
        timeout = timeouts[job_i]
        cls.logger.info(f"Submitting script {script}")

        try:
            BashUtils.run(
                f'{submit_script} "{title}" "{output_dir}" "{script}" "{timeout}"',
                expected_return_code=0)
        except KeyboardInterrupt:
            cls.logger.warning("Keyboard interrupt!")
            break
        except Exception:
            # Only ordinary failures are retried; SystemExit and friends
            # are no longer swallowed by a bare except.
            cls.logger.warning(
                f"Failed to submit, will retry after {submit_cd} seconds at {retry_time()}")
            time.sleep(submit_cd)
            continue
        # end try

        # Submit successfully
        job_i += 1
    # end while
    return
def prepare_data(self):
    """
    Rebuilds ``self.data_dir`` from the processed data under
    ``self.model_data_dir``.

    The directory is removed and recreated, then the ``biLSTM*`` files of the
    training set and the source/target test files of both test sets are
    copied in with ``cp``; every copy must return 0.
    """
    data_prefix = f"{self.eval_setting}-{self.year}"

    IOUtils.rm_dir(self.data_dir)
    IOUtils.mk_dir(self.data_dir)

    # build dataset used by Open-NMT
    BashUtils.run(
        f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/biLSTM* {self.data_dir}/",
        expected_return_code=0)

    # src/tgt test files are renamed after the test set they belong to
    for test_type in (Macros.test_common, Macros.test_standard):
        for side in ("src", "tgt"):
            origin = f"{self.model_data_dir}/{data_prefix}-{test_type}/{side}-test.txt"
            destination = f"{self.data_dir}/{side}-{test_type}.txt"
            BashUtils.run(f"cp {origin} {destination}", expected_return_code=0)
        # end for
    # end for
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Creates, for every trial, the directory layout, a copy of the base config
    and the executable train/test shell scripts under
    ``self.work_dir / "trial-<n>"``.

    Each trial gets ``models`` and ``logs`` sub-directories, ``config.yaml``,
    one ``<Macros.train>.sh`` script and one script per test type
    (``Macros.test_common`` and ``Macros.test_standard``).  The generated
    scripts take the GPU id as ``$1``.

    :param trials: the trial numbers to prepare; nothing is done when empty.
    """
    if not trials:
        return

    exp_dir = self.work_dir

    # The cluster does not change between scripts: resolve it once instead of
    # three times per trial (get_cur_cluster is called for every lookup).
    conda_init = TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]
    script_header = f"#!/bin/bash\n" \
                    f"source {conda_init}\n" \
                    f"conda activate {self.CONDA_ENV}\n" \
                    f"module load cuda/10.0 cudnn/7.6.2\n" \
                    f"cd {self.code_dir}\n"

    for trial in trials:
        trial_dir = exp_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)

        model_dir = trial_dir / "models"
        IOUtils.mk_dir(model_dir)
        log_dir = trial_dir / "logs"
        IOUtils.mk_dir(log_dir)

        data = str(exp_dir / "data/code2seq")
        val_data = data + ".val.c2s"
        train_log = trial_dir / "training-trace.json"

        # Copy config file
        BashUtils.run(f"cp {self.base_config_file} {trial_dir}/config.yaml", expected_return_code=0)
        config_file = trial_dir / "config.yaml"

        # Training script (validation predictions go to the *_tmp files)
        output_file = trial_dir / "output_tmp.txt"
        reference_file = trial_dir / "ref_tmp.txt"
        train_script_file = trial_dir / f"{Macros.train}.sh"
        train_script = script_header + \
            f"python -u code2seq.py --data {data} --test {val_data} --log {train_log} --config {config_file} " \
            f"--pred_file {output_file} --ref_file {reference_file} " \
            f"--save_prefix {model_dir}/model --gpu_id $1 &> {trial_dir}/train-log.txt"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)

        # One test script per test set; each also runs the evaluation
        for test_type in [Macros.test_common, Macros.test_standard]:
            test_data = exp_dir / "data" / f"code2seq.{test_type}.c2s"
            output_file = trial_dir / f"output_{test_type}.txt"
            reference_file = trial_dir / f"ref_{test_type}.txt"
            test_script_file = trial_dir / f"{test_type}.sh"
            test_script = script_header + \
                f"python3 code2seq.py --load {model_dir}/model_best --test {test_data} --config {config_file} " \
                f"--pred_file {output_file} --ref_file {reference_file} " \
                f"--gpu_id $1 &> {trial_dir}/{test_type}-log.txt\n" \
                f"python3 eval_utils.py {reference_file} {output_file} {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
        # end for
    # end for
    return
def process_data(self, method_data_list: List[MethodData], data_type: str, output_dir: Path, split: bool = True):
    """
    Runs the Java collector on the given methods and builds the raw dataset.

    The method data and a collector config are written to ``output_dir``, the
    collector jar is executed (must return 0), and the produced
    ``<data_type>.raw.txt`` is post-processed with ``self.tokenizeFile`` or
    ``self.noSplit``.

    :param method_data_list: the methods to process.
    :param data_type: the data type; also used as the raw file's base name.
    :param output_dir: where inputs, config, log and outputs are stored.
    :param split: selects ``self.tokenizeFile`` (True) or ``self.noSplit`` (False).
    :return: the content loaded from ``<output_dir>-error-ids.json``.
    """
    Environment.require_collector()

    # Inputs for the collector
    data_file = output_dir / "method-data.json"
    log_file = output_dir / "collector-log.txt"
    IOUtils.dump(data_file, IOUtils.jsonfy(method_data_list), IOUtils.Format.json)

    config_file = output_dir / "collector-config.json"
    IOUtils.dump(
        config_file,
        {
            "transform": True,
            "model": "BiLSTM",
            "dataType": data_type,
            "dataFile": str(data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
        },
        IOUtils.Format.jsonPretty)

    self.logger.info(f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}")
    rr = BashUtils.run(f"java -jar {Environment.collector_jar} {config_file}", expected_return_code=0)

    # Any output of the collector is surfaced as a warning
    for stream_name, stream_text in (("Stdout", rr.stdout), ("Stderr", rr.stderr)):
        if stream_text:
            self.logger.warning(f"{stream_name} of collector:\n{stream_text}")
        # end if
    # end for

    # build raw dataset
    raw_file = output_dir / f"{data_type}.raw.txt"
    build_dataset = self.tokenizeFile if split else self.noSplit
    build_dataset(raw_file, data_type)

    error_ids = IOUtils.load(str(output_dir) + "-error-ids.json")
    print(f"Number of error id is: {len(error_ids)}")
    return error_ids
def prepare_code(self):
    """
    Recreates ``self.code_dir`` as a fresh clone of ``self.REPO_URL`` checked
    out at ``self.REPO_SHA``, then copies the local ``eval/eval_utils.py``
    into it.

    Every shell command is run with ``expected_return_code=0`` (as the other
    commands in this file do), so a failing step is not silently ignored.
    """
    IOUtils.rm_dir(self.code_dir)
    IOUtils.mk_dir(self.code_dir.parent)

    with IOUtils.cd(self.code_dir.parent):
        BashUtils.run(f"git clone {self.REPO_URL} {self.code_dir.name}", expected_return_code=0)
    # end with

    with IOUtils.cd(self.code_dir):
        BashUtils.run(f"git checkout {self.REPO_SHA}", expected_return_code=0)
    # end with

    # copy eval code; previously the only command whose return code was not
    # checked, so a missing eval_utils.py went unnoticed at this point
    BashUtils.run(f"cp {Macros.this_dir}/eval/eval_utils.py {self.code_dir}/", expected_return_code=0)
    return
def process(self, model: str, output_dir: Path, task: str, year: int, eval_setting: str):
    """
    Main entry for processors of different models.

    :param model: the model name; accepted values are "DeepCom",
        "DeepCom-Preorder", "Bi-LSTM", "no-split-Bi-LSTM", "Transformer",
        "ASTAttendGRU" and "Code2Seq".
    :param output_dir: the output directory (usually data/models)
    :param task: the task name, must be a key of ``self.TASKS``
    :param year: the year that the testing data should be on; must be the
        last entry of ``self.EVO_YEARS``
    :param eval_setting: the evaluation setting, one of {"evo", "crossproj", "mixedproj"}
    :raises ValueError: if ``model`` is not one of the accepted names.
    """
    assert year == self.EVO_YEARS[-1]  # TODO: Only support the latest year for now
    assert task in self.TASKS.keys()

    model_data_dir = output_dir / model
    data_prefix = f"{eval_setting}-{year}"

    # Processors are imported lazily, only for the requested model
    if model in ("DeepCom", "DeepCom-Preorder"):
        from csevo.processor.DeepComProcessor import DeepComProcessor
        processor = DeepComProcessor()
    elif model in ("Bi-LSTM", "no-split-Bi-LSTM"):
        from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
        processor = BiLSTMProcessor()
    elif model == "Transformer":
        # Transformer works directly on the model data dir and is done here
        from csevo.processor.TransformerProcessor import TransformerProcessor
        processor = TransformerProcessor()
        processor.process_data(model_data_dir, data_prefix)
        return
    elif model == "ASTAttendGRU":
        from csevo.processor.ASTAttendGRUProcessor import ASTAttendGRUProcessor
        processor = ASTAttendGRUProcessor()
    elif model == "Code2Seq":
        from csevo.processor.Code2SeqProcessor import Code2SeqProcessor
        processor = Code2SeqProcessor()
    else:
        raise ValueError(f"Illegal model {model}")
    # end if

    # How process_data is called per model.
    # NOTE(review): "ASTAttendGRU" is in neither group, so its processor is
    # created but never invoked below -- confirm this is intended.
    error_reporting_models = ("DeepCom", "DeepCom-Preorder", "Code2Seq")
    open_nmt_models = ("Bi-LSTM", "no-split-Bi-LSTM")
    traversal_args = {"DeepCom": ("sbt",), "DeepCom-Preorder": ("Preorder",)}

    error_ids = None

    # Load dataset after split (from shared directory)
    shared_data_dir = output_dir / f"{task}-shared"
    self.logger.info(f"Loading dataset from {shared_data_dir}")

    dataset_files: Dict[str, str] = {Macros.test_common: f"{year}-{Macros.test_common}.json"}
    for dt in (Macros.train, Macros.val, Macros.test_standard):
        dataset_files[dt] = f"{data_prefix}-{dt}.json"
    # end for
    data_type_2_data_list: Dict[str, List] = {
        dt: IOUtils.load(shared_data_dir / file_name, IOUtils.Format.json)
        for dt, file_name in dataset_files.items()
    }

    # Process each set
    for data_type, data_list in data_type_2_data_list.items():
        sub_dir_name = f"{data_prefix}-{data_type}"

        # Both test sets are handed to the processor as the generic test type
        is_test_set = data_type in [Macros.test_common, Macros.test_standard]
        data_type_tvt = Macros.test if is_test_set else data_type

        model_dt_output_dir = model_data_dir / sub_dir_name
        IOUtils.mk_dir(model_dt_output_dir)

        if model in error_reporting_models:
            error_ids = processor.process_data(
                data_list, data_type_tvt, model_dt_output_dir, *traversal_args.get(model, ()))
        elif model in open_nmt_models:
            # The return value is not used for the Bi-LSTM variants
            split_kwargs = {"split": False} if model == "no-split-Bi-LSTM" else {}
            processor.process_data(data_list, data_type_tvt, model_dt_output_dir, **split_kwargs)
        # end if

        if error_ids is not None:
            self.logger.warning(f"Error data count: {len(error_ids)}")
            IOUtils.dump(model_data_dir / f"error-ids-{sub_dir_name}.json", error_ids, IOUtils.Format.json)
        # end if
    # end for

    # extra step for Open-NMT data
    if model in open_nmt_models:
        train_dir = f"{model_data_dir}/{data_prefix}-{Macros.train}"
        val_dir = f"{model_data_dir}/{data_prefix}-{Macros.val}"
        # build dataset used by Open-NMT
        BashUtils.run(
            f"onmt_preprocess -train_src {train_dir}/src-train.txt "
            f"-train_tgt {train_dir}/tgt-train.txt "
            f"-valid_src {val_dir}/src-val.txt "
            f"-valid_tgt {val_dir}/tgt-val.txt "
            f"-save_data {train_dir}/biLSTM --src_seq_length 200 --src_seq_length_trunc 200",
            expected_return_code=0)
    # end if
    return
def prepare_data(self):
    """
    Copies the processed data needed for training into ``self.work_dir``.

    With ``self.use_latest`` a single ``latest`` experiment directory is
    rebuilt from ``<model_data_dir>/latest``.  Otherwise one
    ``<t><t+1>-train`` directory is rebuilt for each ``t`` in 13..17, taking
    train data and vocab from the ``20t-20(t+1)`` set, validation data from
    the following year's set and test data from the one after that.
    """
    if self.use_latest:
        exp_dir = self.work_dir / "latest"
        IOUtils.rm_dir(exp_dir)
        IOUtils.mk_dir(exp_dir)

        latest_dir = f"{self.model_data_dir}/latest"
        # Copy train / valid / test data
        for subset in ("train", "valid", "test"):
            BashUtils.run(f"cp -r {latest_dir}/{subset} {exp_dir}/", expected_return_code=0)
        # end for
        # Copy vocab
        BashUtils.run(f"cp {latest_dir}/vocab* {exp_dir}/", expected_return_code=0)
        return
    # end if

    for t in range(13, 18):
        exp_dir = self.work_dir / f"{t}{t+1}-train"
        self.logger.info(f"Preparing the data for {t}-{t+1} at {exp_dir}")
        IOUtils.rm_dir(exp_dir)
        IOUtils.mk_dir(exp_dir)

        copy_commands = [
            # train data
            f"cp -r {self.model_data_dir}/20{t}-20{t+1}-train/train {exp_dir}/",
            # val and test data
            f"cp -r {self.model_data_dir}/20{t+1}-20{t+2}-val/valid {exp_dir}/",
            f"cp -r {self.model_data_dir}/20{t+2}-20{t+3}-test/test {exp_dir}/",
            # vocab
            f"cp {self.model_data_dir}/20{t}-20{t+1}-train/vocab* {exp_dir}/",
        ]
        for command in copy_commands:
            BashUtils.run(command, expected_return_code=0)
        # end for
    # end for
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Writes, for every experiment and trial, a ``config.json`` derived from the
    base config and the executable ``train.sh``, ``test.sh`` and ``val.sh``.

    The experiments are the single ``latest`` directory when
    ``self.use_latest`` is set, otherwise ``<t><t+1>-train`` for ``t`` in
    13..17.  Both cases share the same per-trial logic (it used to be
    duplicated verbatim in two branches).

    :param trials: the trial numbers to prepare; no files are written when empty.
    """
    base_config = IOUtils.load(self.base_config_file, IOUtils.Format.jsonPretty)

    if self.use_latest:
        exp_dirs = [self.work_dir / "latest"]
    else:
        exp_dirs = [self.work_dir / f"{t}{t+1}-train" for t in range(13, 18)]
    # end if

    if not trials:
        return

    # The cluster is the same for every script: resolve it once instead of
    # calling get_cur_cluster for each generated file.
    conda_init = TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]
    script_header = f"#!/bin/bash\n" \
                    f"source {conda_init}\n" \
                    f"conda activate {self.CONDA_ENV}\n" \
                    f"module load cuda/10.0 cudnn/7.6.2\n" \
                    f"cd {self.code_dir}/translate\n"

    for exp_dir in exp_dirs:
        for trial in trials:
            trial_dir = exp_dir / f"trial-{trial}"
            IOUtils.mk_dir(trial_dir)

            output_file = trial_dir / "output.txt"

            # Shallow copy is enough: only top-level keys are overwritten
            config = copy.copy(base_config)
            config["data_dir"] = str(exp_dir)
            config["model_dir"] = str(trial_dir / "model")
            config["output"] = str(output_file)
            config_file = trial_dir / "config.json"
            IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

            scripts = [
                # training; the GPU id is taken from $1
                (trial_dir / "train.sh",
                 script_header +
                 f"python3 __main__.py {config_file} --train -v --gpu-id $1 &> {trial_dir}/log-train.txt\n"),
                # testing
                (trial_dir / "test.sh",
                 script_header +
                 f"python3 __main__.py {config_file} --eval {exp_dir}/test/test.token.code {exp_dir}/test/test.token.sbt {exp_dir}/test/test.token.nl &> {trial_dir}/log-test.txt"),
                # evaluation of the test output
                (trial_dir / "val.sh",
                 script_header +
                 f"python3 Bleu.py {exp_dir}/test/test.token.nl {trial_dir}/output.txt {trial_dir}\n"),
            ]
            for script_file, script_content in scripts:
                IOUtils.dump(script_file, script_content, IOUtils.Format.txt)
                BashUtils.run(f"chmod +x {script_file}", expected_return_code=0)
            # end for
        # end for
    # end for
    return
def submit_script(cls,
        cluster: str,
        name: str,
        log_path: Path,
        script: str,
        queue: str = None,
        timeout: str = None,
        require_conda: bool = True,
        conda_env: str = None,
        modules: List[str] = None,
) -> int:
    """
    Wraps ``script`` into an sbatch job file, submits it with ``sbatch`` and
    returns the job id.

    :param cluster: key used to look up the defaults in ``TACCRunnerConsts``.
    :param name: the job name (``#SBATCH -J``).
    :param log_path: directory for ``<jobid>.stdout`` / ``<jobid>.stderr``;
        the submitted job file is moved there as ``<jobid>.sh``.
    :param script: the shell commands to run, executed after changing to
        ``Macros.python_dir``.
    :param queue: the queue name; defaults to ``TACCRunnerConsts.queue[cluster]``.
    :param timeout: the max run time; defaults to ``TACCRunnerConsts.timeout[cluster]``.
    :param require_conda: if True, the job sources the conda init script of
        the cluster and activates ``conda_env`` before running ``script``.
    :param conda_env: defaults to ``TACCRunnerConsts.conda_env[cluster]``.
    :param modules: modules to ``module load``; defaults to
        ``TACCRunnerConsts.modules[cluster]``.
    :return: the job id, parsed as the last whitespace-separated token on the
        last line of the ``sbatch`` output.
    """
    # Get default values
    if modules is None:
        modules = TACCRunnerConsts.modules[cluster]
    # end if
    if queue is None:
        queue = TACCRunnerConsts.queue[cluster]
    # end if
    if timeout is None:
        timeout = TACCRunnerConsts.timeout[cluster]
    # end if
    if conda_env is None:
        conda_env = TACCRunnerConsts.conda_env[cluster]
    # end if

    # Prepare submit script
    IOUtils.mk_dir(log_path)

    # Job header with the SBATCH directives.
    # NOTE(review): the mail directive below looks like a redacted address
    # rather than a valid "--mail-user=..." option -- confirm against the
    # real file before relying on it.
    s = f"""#!/bin/bash
#SBATCH -J {name} # Job name
#SBATCH -o {log_path}/%j.stdout # Name of stdout output file(%j expands to jobId)
#SBATCH -e {log_path}/%j.stderr # Name of stderr output file(%j expands to jobId)
#SBATCH -p {queue} # Queue name
#SBATCH -N 1 # Total number of nodes requested
#SBATCH -n 1 # Total number of mpi tasks requested
#SBATCH -t {timeout} # Max run time (hh:mm:ss)
#SBATCH [email protected]
#SBATCH --mail-type=ALL
# The next line is required if the user has more than one project
#SBATCH -A {TACCRunnerConsts.allocation} # Allocation name to charge job against
module reset
module unload python2
"""
    # One "module load" line per requested module
    for m in modules:
        s += f"module load {m}\n"
    # end for

    s += f"""
module list
echo "START: $(date)"
# Launch serial code...
# Do not use ibrun or any other MPI launcher
"""

    # Optional conda activation inside the job
    if require_conda:
        s += f"""
unset PYTHONPATH
source {TACCRunnerConsts.conda_init_path[cluster]}
conda activate {conda_env}
"""

    # The payload itself, bracketed by the START/END timestamps
    s += f"""
cd {Macros.python_dir}
{script}
echo "END: $(date)"
"""

    # Submit the script
    submit_script = BashUtils.get_temp_file()
    IOUtils.dump(submit_script, s, IOUtils.Format.txt)
    receipt = BashUtils.run(f"sbatch {submit_script}", expected_return_code=0).stdout

    # Get job id as the last number in output
    job_id = int(receipt.splitlines()[-1].split()[-1])

    # Save the script at log_path as well
    # (return code not checked: the job is already submitted at this point)
    BashUtils.run(f"mv {submit_script} {log_path}/{job_id}.sh")
    return job_id
def collect_lemmas_doc(
        cls,
        doc: CoqDocument,
        ast_sexp_list: List[SexpNode],
        serapi_options: str,
) -> List[Lemma]:
    """
    Collects the lemmas declared in one Coq document.

    The sentences of the document are scanned together with their AST
    s-expressions; module begin/end sentences maintain a stack of module
    names, and each lemma sentence yields a ``Lemma`` whose qualified name is
    built from the document's qualified prefix, the current module stack and
    the lemma name.  Afterwards ``sername`` is run on all qualified names to
    obtain the backend s-expressions.

    :param doc: the parsed document; ``doc.file_name`` is expected to end
        with ".v" (the last two characters are dropped).
    :param ast_sexp_list: the AST s-expressions, indexed in parallel with
        ``doc.sentences``.
    :param serapi_options: the SerAPI options; also searched with
        ``cls.RE_PATH_TO_QUALIFIED_PREFIX`` to map the file path to its
        qualified prefix.
    :return: the lemmas whose ``backend_sexp`` is not None after the
        ``sername`` output has been matched.
    """
    lemmas_doc: List[Lemma] = list()
    data_index = doc.get_data_index()

    # Maintain a stack of module
    modules: List[str] = list()

    # Prepare qualified name prefix
    qprefix_this_doc = "./" + doc.file_name[:-2]  # Remove .v
    for m in cls.RE_PATH_TO_QUALIFIED_PREFIX.finditer(serapi_options):
        path = m.group("path")
        if path != ".":
            path = "./" + path
        qprefix = m.group("qprefix")

        # The first matching path mapping wins
        if qprefix_this_doc.startswith(path):
            qprefix_this_doc = qprefix + qprefix_this_doc[len(path):]
            break
        # end if
    # end for
    # No mapping applied (or it left the "./"): strip it, then turn the path into a dotted name
    if qprefix_this_doc.startswith("./"):
        qprefix_this_doc = qprefix_this_doc[len("./"):]
    qprefix_this_doc = qprefix_this_doc.replace("/", ".")

    for sent_i, sent in enumerate(doc.sentences):
        ast_sexp = ast_sexp_list[sent_i]
        vernac = SexpAnalyzer.analyze_vernac(ast_sexp)

        if vernac.vernac_type in cls.VTYPES_MODULE_BEG:
            # (VernacExpr()(VernacDefineModule() ( ( v ( Id <module name>)) ...
            # 0 1 2 20 21 22 220 2201 22011
            module_name = vernac.vernac_sexp[2][2][0][1][1].content_no_quote
            modules.append(module_name)
        elif vernac.vernac_type in cls.VTYPES_MODULE_END:
            # (VernacExpr()(VernacEndSegment ( ( v ( Id <module name>)) ...
            # 0 1 2 20 21 210 2101 21011
            try:
                module_name = vernac.vernac_sexp[2][1][0][1][1].content_no_quote
            except:
                # Show the unexpected s-expression before propagating the error
                print(vernac.vernac_sexp.pretty_format())
                raise
            # end try

            # Only pop when the closed segment is the innermost open module
            if len(modules) > 0 and module_name == modules[-1]:
                modules.pop()  # EndModule and EndSection share the same vernac type
        elif vernac.vernac_type in cls.VTYPES_LEMMA:
            # (VernacExpr()(VernacStartTheoremProof Lemma ( ( ( ( ( v ( Id <lemma name>))
            # 0 1 2 20 21 22 2200000 2200001 22000011
            lemma = Lemma()
            lemma.data_index = data_index

            lemma.name = vernac.vernac_sexp[2][2][0][0][0][0][1][1].content_no_quote
            lemma.qname = qprefix_this_doc + "." + ".".join(modules + [lemma.name])

            # Find lemma content, after the first token matching the lemma name
            tok_i = 0
            for tok in sent.tokens:
                if tok.content == lemma.name:
                    break
                tok_i += 1
            # end for
            if tok_i == len(sent.tokens):
                LoggingUtils.log_and_raise(cls.logger, f"Lemma name {lemma.name} didn't appear in the source code {sent.str_with_space()}", Exception)

            # Tokens before the name form the command, tokens after it the statement
            lemma.vernac_command = sent.tokens[:tok_i]
            lemma.statement = sent.tokens[tok_i + 1:]
            lemma.ast_sexp = vernac.vernac_sexp

            lemmas_doc.append(lemma)
        # end if
    # end for

    # Use sername to get the backend representations
    # (one qualified name per line, passed through a temporary file)
    lemma_qnames: str = "".join([l.qname + "\n" for l in lemmas_doc])
    lemma_qnames_file = BashUtils.get_temp_file()
    IOUtils.dump(lemma_qnames_file, lemma_qnames, IOUtils.Format.txt)

    lemma_qnames_backend_sexps_str: str = BashUtils.run(f"sername {serapi_options} --require-lib={qprefix_this_doc} {lemma_qnames_file}", expected_return_code=0).stdout
    IOUtils.rm(lemma_qnames_file)

    # Each output line is "<qname>:<backend sexp>"; attach it to the first lemma with that qname
    for qname_backend_sexp_str in lemma_qnames_backend_sexps_str.splitlines():
        qname, backend_sexp_str = qname_backend_sexp_str.split(":", 1)
        backend_sexp = SexpParser.parse(backend_sexp_str)

        for lemma in lemmas_doc:
            if lemma.qname == qname:
                lemma.backend_sexp = backend_sexp
                break
            # end if
        # end for
    # end for

    # Keep only the lemmas that have a backend sexp
    # (presumably Lemma() starts with backend_sexp = None -- TODO confirm)
    lemmas_doc = [l for l in lemmas_doc if l.backend_sexp is not None]
    return lemmas_doc
def test_inherit_env(self):
    """
    Checks that a variable set in ``os.environ`` is visible to the command
    executed by ``BashUtils.run``.
    """
    key, value = self.TEST_ENV_A_KEY, self.TEST_ENV_A_VALUE
    os.environ[key] = value

    echoed = BashUtils.run(f"echo -n ${key}").stdout
    self.assertEqual(value, echoed)
    return
def collect_coq_documents_project(
        cls,
        data_mgr: FilesManager,
        project: Project,
        names_projects: Dict[str, Project],
        files: List[str] = None,
        is_verifying_tokenizer: bool = False,
) -> List[CoqDocument]:
    """
    Parses the Coq files of one project into ``CoqDocument`` objects.

    The project is cloned, checked out at ``project.data["sha"]`` and
    installed; then every ``*.v`` file found in the checkout is processed
    with ``sercomp`` and ``sertok``, the raw outputs and the parsed document
    are saved through ``data_mgr``, and the document is collected.  A file
    that fails is logged and skipped.

    :param data_mgr: receives the original file, the raw s-expressions and
        the printed parsed document.
    :param project: the project to process.
    :param names_projects: all known projects by name, passed on to
        ``cls.install_coq_project``.
    :param files: if given, only files whose path (relative to the checkout,
        without the leading "./") is in this list are processed.
    :param is_verifying_tokenizer: if True, ``cls.verify_tokenizer`` is run on
        each file and a mismatch is treated as a failure of that file.
    :return: the documents of the files that were processed successfully.
    """
    coq_documents: List[CoqDocument] = list()

    # Clone and checkout repo
    project.clone()
    project.checkout(project.data["sha"], is_forced=True)

    # Build the project
    cls.install_coq_project(project, names_projects)

    # For each file, parse code to tokens
    with IOUtils.cd(project.checkout_dir):
        # The output ends with a newline, hence the last (empty) entry is dropped
        coq_files: List[str] = BashUtils.run(f"find -name '*.v' -type f").stdout.split("\n")[:-1]
        if files is not None:
            coq_files = [f for f in coq_files if f[2:] in files]  # [2:] is to remove the ./
        # end if
        re_ignore_path = re.compile(project.data["ignore_path_regex"]) if "ignore_path_regex" in project.data else None

        for i, coq_file in enumerate(coq_files):
            try:
                coq_file = coq_file[2:]
                cls.logger.debug(f"File {i + 1}/{len(coq_files)}: {coq_file}")

                # Check if file is ignored
                if re_ignore_path is not None and re_ignore_path.fullmatch(coq_file):
                    cls.logger.info(f"Ignoring file {coq_file}")
                    continue
                # end if

                # Read file
                # (newline="" disables newline translation, so the content is read as stored)
                with open(coq_file, "r", newline="") as f:
                    source_code = f.read()
                # end with

                # Get unicode offsets
                unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

                # Save original file to original_files
                data_mgr.dump_data([FilesManager.ORIGINAL_FILES, project.full_name, coq_file], source_code, IOUtils.Format.txt)

                # Call SerAPI
                serapi_options = project.data.get("serapi_options", "")
                ast_sexp_str: str = BashUtils.run(f"sercomp {serapi_options} --mode=sexp -- {coq_file}", expected_return_code=0).stdout
                tok_sexp_str: str = BashUtils.run(f"sertok {serapi_options} -- {coq_file}", expected_return_code=0).stdout

                # Save ast sexp to dataset (.ast.sexp)
                data_mgr.dump_data([FilesManager.RAW_FILES, project.full_name, coq_file[:-2] + ".ast.sexp"], ast_sexp_str, IOUtils.Format.txt)

                # Save tok sexp to dataset (.tok.sexp)
                data_mgr.dump_data([FilesManager.RAW_FILES, project.full_name, coq_file[:-2] + ".tok.sexp"], tok_sexp_str, IOUtils.Format.txt)

                # Parse ast sexp
                ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
                tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

                # Verify the tokenizer if requested
                if is_verifying_tokenizer:
                    if not cls.verify_tokenizer(tok_sexp_list, source_code, unicode_offsets):
                        LoggingUtils.log_and_raise(cls.logger, "Tokenized content doesn't match original file!", Exception)
                    # end if
                # end if

                # Parse the document
                coq_document = CoqParser.parse_document(source_code, ast_sexp_list, tok_sexp_list, unicode_offsets=unicode_offsets)

                # Save the parsed document (printed format) to raw_files
                data_mgr.dump_data([FilesManager.RAW_FILES, project.full_name, coq_file], coq_document.str_with_space(), IOUtils.Format.txt)

                # Set meta data
                coq_document.file_name = coq_file
                coq_document.project_name = project.full_name
                coq_document.revision = project.revision

                coq_documents.append(coq_document)
            except KeyboardInterrupt:
                # User abort is never treated as a per-file failure
                cls.logger.warning("Keyboard interrupt!")
                raise
            except:
                # Best effort: a failing file is reported and skipped.
                # NOTE(review): the bare except also catches SystemExit.
                cls.logger.warning(f"File {coq_file} failed! Exception was: {traceback.format_exc()}")
                continue
            # end try
        # end for
    # end with

    return coq_documents
def install_coq_project(cls, project: Project, names_projects: Dict[str, Project]) -> None:
    """
    Builds and installs a Coq project together with its dependencies.

    If the project is not cloned yet, it is cloned and checked out at
    ``project.data["sha"]``.  A marker file ``lpc-installed.txt`` in the
    checkout records the project revision and the output of
    ``opam list coq -s``; when the marker matches, nothing is done.  Otherwise
    the project is cleaned, its dependencies are installed recursively, and
    ``build_cmd`` and ``install_cmd`` from ``project.data`` are run inside the
    checkout before the marker is written.

    :param project: the project to install.
    :param names_projects: all known projects by name, used to resolve the
        names listed in ``project.data["dependencies"]``.
    :raises Exception: if a dependency is unknown, if ``build_cmd`` or
        ``install_cmd`` is missing, or if one of them returns non-zero.
    """
    if not project.is_cloned:
        project.clone()
        project.checkout(project.data["sha"], is_forced=True)
    # end if

    # Check if the project is already compiled
    confirmation_file = "lpc-installed.txt"
    coq_version = BashUtils.run("opam list coq -s", expected_return_code=0).stdout.strip()
    confirmation_content = project.revision + " " + coq_version

    marker = project.checkout_dir / confirmation_file
    if marker.is_file() and IOUtils.load(marker, "txt") == confirmation_content:
        cls.logger.debug(f"Project {project.full_name} already installed")
        return
    # end if

    project.clean()

    # Install dependencies
    for dependency in project.data.get("dependencies", []):
        dependency_project = names_projects.get(dependency)
        if dependency_project is None:
            raise Exception(f"Cannot find dependency {dependency}")
        cls.logger.info(f"For Project {project.full_name}, installing dependency {dependency}")
        cls.install_coq_project(dependency_project, names_projects)
    # end for

    for required_key in ("build_cmd", "install_cmd"):
        if required_key not in project.data:
            raise Exception(f"Project {project.full_name} does not have {required_key}")
    # end for

    # (command key, verb for the info log, noun for the result messages)
    steps = (
        ("build_cmd", "Building", "Compilation"),
        ("install_cmd", "Installing", "Installation"),
    )
    with IOUtils.cd(project.checkout_dir):
        for cmd_key, action, stage in steps:
            command = project.data[cmd_key]
            cls.logger.info(f"Project {project.full_name}: {action} with {command}")

            r = BashUtils.run(command)
            outputs = f"stdout:\n{r.stdout}\n; stderr:\n{r.stderr}"
            if r.return_code != 0:
                raise Exception(f"{stage} failed! Return code is {r.return_code}! {outputs}")
            cls.logger.debug(f"{stage} finished. Return code is {r.return_code}. {outputs}")
        # end for

        # Record the successful installation
        IOUtils.dump(project.checkout_dir / confirmation_file, confirmation_content, "txt")
    # end with
    return
def get_num_running_jobs(cls) -> int:
    """
    Counts the jobs that ``squeue`` lists for ``TACCRunnerConsts.user``.

    :return: the number of lines printed by ``squeue -u <user>`` minus one
        (presumably to discount the header line -- TODO confirm).
    """
    listing = BashUtils.run(f"squeue -u {TACCRunnerConsts.user} | wc -l", expected_return_code=0)
    num_lines = int(listing.stdout)
    return num_lines - 1
def tacc_get_num_jobs(cls) -> int:
    """
    Counts the jobs that ``squeue`` lists for the user named by the ``USER``
    environment variable.

    :return: the number of lines printed by ``squeue -u <user>`` minus one
        (presumably to discount the header line -- TODO confirm).
    """
    user = os.getenv('USER')
    listing = BashUtils.run(f"squeue -u {user} | wc -l")
    num_lines = int(listing.stdout)
    return num_lines - 1
def collect_project(self, project_name: str, project_url: str):
    """
    Collects the data of one project: clones it, records its revisions, runs
    the Java collector on it and finally removes the clone.

    Results are written to ``self.repos_results_dir / project_name`` (any
    previous results there are removed first).

    :param project_name: the name of the project; also the directory name of
        its clone and of its results.
    :param project_url: the git url to clone from.
    """
    Environment.require_collector()

    downloads_dir = self.repos_downloads_dir / project_name
    results_dir = self.repos_results_dir / project_name

    # Start from an empty results directory
    IOUtils.rm_dir(results_dir)
    IOUtils.mk_dir(results_dir)

    # 0. Download the repo unless a clone is already there (at most 300s)
    if not downloads_dir.exists():
        with IOUtils.cd(self.repos_downloads_dir), TimeUtils.time_limit(300):
            BashUtils.run(f"git clone {project_url} {project_name}", expected_return_code=0)
        # end with
    # end if

    project_data = ProjectData.create()
    project_data.name = project_name
    project_data.url = project_url

    with IOUtils.cd(downloads_dir):
        # 1. Revisions and their parents, limited to the first MAX_REVISIONS log entries
        log_output = BashUtils.run("git log --pretty=format:'%H %P'", expected_return_code=0).stdout
        for log_line in log_output.splitlines()[:self.MAX_REVISIONS]:
            hashes = log_line.split()
            revision = hashes[0]
            project_data.revisions.append(revision)
            project_data.parent_revisions[revision] = hashes[1:]
        # end for

        # 2. The last revision before Jan 1 of each year
        for year in self.YEARS:
            rev_output = BashUtils.run(
                f'git rev-list -1 --before="Jan 1 {year}" origin',
                expected_return_code=0).stdout
            project_data.year_revisions[str(year) + "_Jan_1"] = rev_output.rstrip()
        # end for
    # end with

    project_data_file = results_dir / "project.json"
    IOUtils.dump(project_data_file, IOUtils.jsonfy(project_data), IOUtils.Format.jsonPretty)

    # 3. Run the java collector with a freshly written config
    log_file = results_dir / "collector-log.txt"
    output_dir = results_dir / "collector"
    config_file = results_dir / "collector-config.json"
    IOUtils.dump(
        config_file,
        {
            "collect": True,
            "projectDir": str(downloads_dir),
            "projectDataFile": str(project_data_file),
            "logFile": str(log_file),
            "outputDir": str(output_dir),
            "year": True  # To indicate whether to collect all evo data or yearly data
        },
        IOUtils.Format.jsonPretty)

    self.logger.info(f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}")
    rr = BashUtils.run(f"java -jar {Environment.collector_jar} {config_file}", expected_return_code=0)
    if rr.stderr:
        self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
    # end if

    # 4. In some cases, save collected data to appropriate location or database
    # TODO private info
    # On luzhou server for user pynie, move it to a dedicated location at /user/disk2
    if BashUtils.run("hostname").stdout.strip() == "luzhou" \
            and BashUtils.run("echo $USER").stdout.strip() == "pynie":
        alter_results_dir = Path("/home/disk2/pynie/csevo-results") / project_name
        IOUtils.rm_dir(alter_results_dir)
        IOUtils.mk_dir(alter_results_dir.parent)
        BashUtils.run(f"mv {results_dir} {alter_results_dir}")
        self.logger.info(f"Results moved to {alter_results_dir}")
    # end if

    # -1. Remove repo
    IOUtils.rm_dir(downloads_dir)
    return
def process_data(self, method_data_list, data_type, output_dir, traversal) -> List[int]:
    """
    Processes the method data in Macros.multi_processing parallel worker processes
    (each running self.process_data_mp), then merges the per-process outputs.

    :param method_data_list: the data to process; it is split into contiguous
        slices, one per worker process.
    :param data_type: one of Macros.train, Macros.val, Macros.test or "debug";
        mapped to the names "train", "valid", "test", "debug" used in file names.
    :param output_dir: directory for the outputs; must support the `/` operator.
        Merged files go to output_dir/<data_type name>/, vocab files (train only)
        directly into output_dir.
    :param traversal: passed through unchanged to self.process_data_mp.
    :return: the sorted content of the error_ids list shared with the workers.
    """
    self.logger.info(f"Start processing")

    # Use DeepCom's required names
    data_type = {
        Macros.train: "train",
        Macros.val: "valid",
        Macros.test: "test",
        "debug": "debug",
    }[data_type]

    # Initialize vocab, error_ids (shared between processes)
    manager = multiprocessing.Manager()
    code_vocab = manager.dict()
    nl_vocab = manager.dict()
    sbt_vocab = manager.dict()
    vocabs_lock = manager.Lock()
    error_ids = manager.list()
    error_ids_lock = manager.Lock()

    # Multi-processing, split the tasks evenly
    # (the +1 rounds up, so trailing workers may receive a shorter or empty slice)
    tasks_each_process = len(method_data_list) // Macros.multi_processing + 1
    processes = list()
    for pid in range(Macros.multi_processing):
        beg = pid * tasks_each_process
        method_data_list_p = method_data_list[beg:beg + tasks_each_process]
        output_dir_p = output_dir / str(pid)
        IOUtils.mk_dir(output_dir_p)
        process = multiprocessing.Process(
            target=self.process_data_mp,
            args=(method_data_list_p, data_type, output_dir_p, pid, beg,
                  code_vocab, nl_vocab, sbt_vocab, vocabs_lock,
                  error_ids, error_ids_lock, traversal))
        process.start()
        processes.append(process)
    # end for

    for process in processes:
        process.join()
    # end for

    # Merge results: append each worker's files, in pid order, to the merged files
    code_file_name = data_type + ".token.code"
    nl_file_name = data_type + ".token.nl"
    sbt_file_name = data_type + ".token.sbt"

    data_type_output_dir = output_dir / data_type
    IOUtils.mk_dir(data_type_output_dir)
    for pid in range(Macros.multi_processing):
        for fname in [code_file_name, nl_file_name, sbt_file_name]:
            BashUtils.run(
                f"cat {output_dir}/{pid}/{fname} >> {data_type_output_dir}/{fname}"
            )
        # end for
        IOUtils.rm_dir(output_dir / str(pid))
    # end for
    error_ids.sort()

    # Build vocab
    if data_type == "train":
        special_tokens = [
            '<S>', '</S>', '<UNK>', '<KEEP>', '<DEL>', '<INS>', '<SUB>', '<NONE>'
        ]
        vocab_outputs = [
            (output_dir / "vocab.code", code_vocab),
            (output_dir / "vocab.nl", nl_vocab),
            (output_dir / "vocab.sbt", sbt_vocab),
        ]
        for vocab_file, vocab in vocab_outputs:
            # Keep the first MAX_VOCAB keys in the dict's own iteration order.
            # NOTE(review): no frequency-based sorting happens here; if the vocab
            # is meant to be frequency-filtered, that has to come from how the
            # workers populate the dicts -- confirm.
            tokens = special_tokens + list(vocab.keys())[:self.MAX_VOCAB]
            # The context manager closes the file even if a write fails
            with open(vocab_file, "w+") as f:
                f.writelines(token + "\n" for token in tokens)
            # end with
        # end for
    # end if

    return list(error_ids)
def get_git_url(cls):
    """
    Looks up the url configured for the "origin" remote of the git repository
    at Macros.project_dir.

    :return: the stdout of `git config --get remote.origin.url`, stripped of
        surrounding whitespace.
    """
    command = "git config --get remote.origin.url"
    with IOUtils.cd(Macros.project_dir):
        # The command is run with expected_return_code=0
        run_result = BashUtils.run(command, expected_return_code=0)
        url = run_result.stdout.strip()
    # end with
    return url
def extract_data_project(
        cls,
        project_path: Path,
        files: Optional[List[str]],
        exclude_files: Optional[List[str]],
        exclude_pattern: Optional[str],
        serapi_options: str,
        output_path: Path,
):
    """
    Parses the Coq files (*.v) of a project with SerAPI and saves the extracted
    lemmas and definitions to output_path/lemmas.json and output_path/definitions.json.

    Files that fail to be read, serialized or parsed are logged and skipped.

    :param project_path: root directory of the project; `find` and SerAPI are run from it.
    :param files: if not None, only the files (paths relative to project_path) in this list are kept.
    :param exclude_files: if not None, files in this list are dropped.
    :param exclude_pattern: if not None, files whose relative path fully matches this regex are dropped.
    :param serapi_options: options inserted verbatim into the sercomp/sertok command lines.
    :param output_path: directory to save the results; created if it does not exist.
    :raises Exception: (via LoggingUtils.log_and_raise) if output_path exists as a file.
    """
    # Local import: only needed to quote file names interpolated into shell commands.
    import shlex

    # 1. Prepare output path
    # NOTE(review): output_path is checked/created here, before `cd project_path`, but
    # written to after it; a relative output_path would resolve differently in the two
    # places. Presumably callers pass an absolute path -- confirm.
    if output_path.is_dir():
        cls.logger.warning(f"{output_path} already exists, will overwrite the files.")
    elif output_path.is_file():
        LoggingUtils.log_and_raise(
            cls.logger,
            f"{output_path} already exists as a file. Aborting.",
            Exception)
    else:
        IOUtils.mk_dir(output_path)
    # end if

    # 2. Extract documents, tok.sexp and ast.sexp
    coq_documents: Dict[str, CoqDocument] = collections.OrderedDict()
    ast_sexp_lists: Dict[str, List[SexpNode]] = dict()
    tok_sexp_lists: Dict[str, List[SexpNode]] = dict()

    with IOUtils.cd(project_path):
        coq_files: List[str] = BashUtils.run(
            f"find -name '*.v' -type f").stdout.split("\n")[:-1]
        # Drop the leading "./" that find puts in front of every path
        coq_files = [coq_file[2:] for coq_file in coq_files]

        if files is not None:
            coq_files = [f for f in coq_files if f in files]
        # end if

        if exclude_files is not None:
            coq_files = [f for f in coq_files if f not in exclude_files]
        # end if

        if exclude_pattern is not None:
            re_exclude_pattern = re.compile(exclude_pattern)
            coq_files = [
                f for f in coq_files if not re_exclude_pattern.fullmatch(f)
            ]
        # end if

        for coq_file in tqdm(coq_files):
            try:
                # Read file
                with open(coq_file, "r", newline="") as f:
                    source_code = f.read()
                # end with

                # Get unicode offsets
                unicode_offsets = ParserUtils.get_unicode_offsets(source_code)

                # Call SerAPI; the file name is quoted so that paths with spaces
                # or shell metacharacters are passed as a single argument
                quoted_coq_file = shlex.quote(coq_file)
                ast_sexp_str: str = BashUtils.run(
                    f"sercomp {serapi_options} --mode=sexp -- {quoted_coq_file}",
                    expected_return_code=0).stdout
                tok_sexp_str: str = BashUtils.run(
                    f"sertok {serapi_options} -- {quoted_coq_file}",
                    expected_return_code=0).stdout

                # Parse ast sexp
                ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
                tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)

                # Parse the document
                coq_document = CoqParser.parse_document(
                    source_code,
                    ast_sexp_list,
                    tok_sexp_list,
                    unicode_offsets=unicode_offsets)

                # Set meta data
                coq_document.file_name = coq_file
                coq_document.project_name = project_path.name

                coq_documents[coq_file] = coq_document
                ast_sexp_lists[coq_file] = ast_sexp_list
                tok_sexp_lists[coq_file] = tok_sexp_list
            except KeyboardInterrupt:
                cls.logger.warning("Keyboard interrupt!")
                raise
            except Exception:
                # Best-effort: skip this file, but let SystemExit etc. propagate
                cls.logger.warning(
                    f"File {coq_file} failed! Exception was: {traceback.format_exc()}"
                )
                continue
            # end try
        # end for

        # 3. Extract and save lemmas and definitions
        lemmas: List[Lemma] = list()
        definitions: List[Definition] = list()

        # Increase recursion limit because the backend sexps are CRAZZZZY deep
        # (never lower a limit that was already set higher elsewhere)
        sys.setrecursionlimit(max(sys.getrecursionlimit(), 10000))

        for file_path, doc in tqdm(coq_documents.items()):
            ast_sexp_list = ast_sexp_lists[file_path]
            lemmas_doc = cls.collect_lemmas_doc(doc, ast_sexp_list, serapi_options)
            lemmas.extend(lemmas_doc)
            definitions_doc = cls.collect_definitions_doc(doc, ast_sexp_list)
            definitions.extend(definitions_doc)
        # end for

        IOUtils.dump(output_path / "lemmas.json", IOUtils.jsonfy(lemmas),
                     IOUtils.Format.json)
        IOUtils.dump(output_path / "definitions.json",
                     IOUtils.jsonfy(definitions), IOUtils.Format.json)
    # end with
    return
def run_models_local(
        self,
        mode: str,
        models: List[str],
        exps: List[str],
        trials: List[int],
        timeout_hour: Optional[int],
        beg: int = 0,
        cnt: int = -1,
):
    """
    Runs the <mode>.sh scripts of the selected model/exp/trial combinations on the
    local machine, one after another.

    Before a script is run, its "module" lines are commented out and TACC work
    directory paths (/work/<digits>/$USER/maverick2) are replaced with $HOME, both
    in the script and in the trial's config*.json files. The config files are
    restored to their original content after each script, even if it fails.

    :param mode: one of Macros.train, Macros.test_common, Macros.test_standard.
    :param models: names of the models, e.g., DeepCom; sorted in place.
    :param exps: names of the experiments, e.g., evolution-evolution; sorted in place.
    :param trials: trial numbers, e.g., 0, 1, 2; sorted in place.
    :param timeout_hour: not used by this method.
    :param beg: index of the first job to run, counted over the sorted
        (model, exp, trial) traversal.
    :param cnt: number of jobs to run starting from beg; values <= 0 mean no upper limit.
    :raises ValueError: if mode is not one of the accepted values.
    """
    if mode not in [Macros.train, Macros.test_common, Macros.test_standard]:
        raise ValueError(f"mode has to be one of {Macros.train}, {Macros.test_common}, {Macros.test_standard}")
    # end if

    assert beg >= 0
    assert cnt >= -1

    # Sort the models, exps, and trials lists to ensure the traversal order is stable
    models.sort()
    exps.sort()
    trials.sort()

    user = os.getenv("USER")
    home = os.getenv("HOME")
    re_work_dir = re.compile(rf"/work/\d+/{user}/maverick2")

    # Assuming each model uses one GPU
    total_script_cnt = 0
    for model in models:
        model_work_dir = self.work_dir / model
        for exp in exps:
            for trial in trials:
                # Only output the jobs whose indexes are in the interval [beg, beg+cnt)
                if (total_script_cnt < beg) or (cnt > 0 and total_script_cnt >= beg + cnt):
                    total_script_cnt += 1
                    continue
                # end if

                trial_dir = model_work_dir / exp / f"trial-{trial}"

                # Modify the script to remove TACC stuff
                script = IOUtils.load(trial_dir / f"{mode}.sh", IOUtils.Format.txt)
                script = script.replace("\nmodule", "\n# module")
                script = re_work_dir.sub(home, script)

                orig_configs = dict()
                script_file = None
                try:
                    # Replace the paths in config files as well
                    for config_file in trial_dir.glob("config*.json"):
                        config_content = IOUtils.load(config_file, IOUtils.Format.txt)
                        orig_configs[config_file] = config_content
                        config_content = re_work_dir.sub(home, config_content)
                        IOUtils.dump(config_file, config_content, IOUtils.Format.txt)
                    # end for

                    # Try to execute the script
                    try:
                        self.logger.info(f"Executing: {script}")
                        fd, script_file = tempfile.mkstemp(suffix=".sh")
                        # Close the descriptor right away; the file is written by path
                        os.close(fd)
                        IOUtils.dump(script_file, script, IOUtils.Format.txt)
                        BashUtils.run(f"chmod +x {script_file}", expected_return_code=0)
                        BashUtils.run(f"{script_file} 0\n", expected_return_code=0)
                    except RuntimeError:
                        traceback.print_exc()
                    # end try
                finally:
                    # Remove the temporary script (best effort)
                    if script_file is not None:
                        try:
                            os.remove(script_file)
                        except OSError:
                            self.logger.warning(f"Could not remove temporary script {script_file}")
                        # end try
                    # end if

                    # Revert the config files; done here so that they are restored
                    # even if the run is interrupted
                    for config_file, config_content in orig_configs.items():
                        IOUtils.dump(config_file, config_content, IOUtils.Format.txt)
                    # end for
                # end try

                total_script_cnt += 1
            # end for
        # end for
    # end for
    return