def prepare_data(self):
    data_prefix = f"{self.eval_setting}-{self.year}"
    data_dir = self.work_dir / "data"
    self.logger.info(f"Preparing the data for {self.eval_setting} {self.year} at {data_dir}")
    IOUtils.rm_dir(data_dir)
    IOUtils.mk_dir(data_dir)

    # Copy train/val/test_common/test_standard data
    BashUtils.run(f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.train}/train {data_dir}/train", expected_return_code=0)
    BashUtils.run(f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.val}/valid {data_dir}/valid", expected_return_code=0)
    BashUtils.run(f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_common}/test {data_dir}/{Macros.test_common}", expected_return_code=0)
    BashUtils.run(f"cp -r {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/test {data_dir}/{Macros.test_standard}", expected_return_code=0)

    # Copy vocab
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/vocab* {data_dir}/", expected_return_code=0)
    return

def suggest_naming(self, file_path: Path, prj_root: Optional[Path] = None):
    """
    Processes a file to get its lemmas and runs the model to get predictions.
    """
    # Figure out which project we're at, then load configs
    if prj_root is None:
        prj_root = RoosterizeDirUtils.auto_infer_project_root(file_path)
    self.load_configs(prj_root)

    # Infer SerAPI options
    serapi_options = self.infer_serapi_options(prj_root)

    # If the user provided a compile_cmd, compile the project first
    if self.compile_cmd is not None:
        with IOUtils.cd(prj_root):
            BashUtils.run(self.compile_cmd, expected_return_code=0)

    # Parse file
    data = self.parse_file(file_path, prj_root, serapi_options)

    # Load model
    self.load_local_model(prj_root)
    model = self.get_model()

    # Use the model to make predictions
    # Temp dirs for processed data and results
    temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))

    # Dump lemmas & definitions
    temp_raw_data_dir = temp_data_dir / "raw"
    temp_raw_data_dir.mkdir()
    IOUtils.dump(
        temp_raw_data_dir / "lemmas.json",
        IOUtils.jsonfy(data.lemmas),
        IOUtils.Format.json,
    )
    IOUtils.dump(
        temp_raw_data_dir / "definitions.json",
        IOUtils.jsonfy(data.definitions),
        IOUtils.Format.json,
    )

    # Model-specific processing
    temp_processed_data_dir = temp_data_dir / "processed"
    temp_processed_data_dir.mkdir()
    model.process_data_impl(temp_raw_data_dir, temp_processed_data_dir)

    # Invoke eval
    candidates_logprobs = model.eval_impl(
        temp_processed_data_dir,
        beam_search_size=self.beam_search_size,
        k=self.k,
    )

    # Clean up temp dirs
    IOUtils.rm_dir(temp_data_dir)

    # Report predictions
    self.report_predictions(data, candidates_logprobs)
    return

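# Hypothetical usage sketch of suggest_naming (the entry-point class name and
# the file path below are assumptions, not part of the original code): run the
# pipeline on one Coq file and let the project root be auto-inferred.
#
#   ui = UserInterface()
#   ui.suggest_naming(Path("theories/Lists.v"))
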
def prepare_data(self):
    data_prefix = f"{self.eval_setting}-{self.year}"
    data_dir = self.work_dir / "data"
    self.logger.info(f"Preparing the data for {self.eval_setting} {self.year} at {data_dir}")
    IOUtils.rm_dir(data_dir)
    IOUtils.mk_dir(data_dir)

    # Copy train/val/test_common/test_standard data
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.train.c2s {data_dir}/", expected_return_code=0)
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.val}/code2seq.val.c2s {data_dir}/", expected_return_code=0)
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/code2seq.test.c2s {data_dir}/code2seq.{Macros.test_common}.c2s", expected_return_code=0)
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/code2seq.test.c2s {data_dir}/code2seq.{Macros.test_standard}.c2s", expected_return_code=0)

    # Copy vocab
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/code2seq.dict.c2s {data_dir}/", expected_return_code=0)
    return

def download_global_model(self, force_yes: bool = False):
    """
    Downloads a global Roosterize model.
    """
    global_model_dir = RoosterizeDirUtils.get_global_model_dir()
    if global_model_dir.exists():
        # Check force_yes first so the user is not prompted when the answer is forced
        if force_yes:
            ans = True
        else:
            ans = self.ask_for_confirmation(
                f"A Roosterize model already exists at {global_model_dir}. "
                f"Do you want to delete it and download again?")
        if not ans:
            return
        IOUtils.rm_dir(global_model_dir)

    self.show_message("Downloading Roosterize model...")

    # Download and unpack
    temp_model_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
    urllib.request.urlretrieve(self.model_url, str(temp_model_dir / "model.tgz"))
    with IOUtils.cd(temp_model_dir):
        BashUtils.run("tar xzf model.tgz", expected_return_code=0)

        # Move the unpacked model to the global model location
        shutil.move(str(Path.cwd() / "model"), global_model_dir)

    # Delete temp dir
    IOUtils.rm_dir(temp_model_dir)

    self.show_message("Finished downloading Roosterize model.")

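# A minimal sketch of the ask_for_confirmation helper used above, assuming a
# plain stdin prompt; the real implementation may route through an editor or
# other UI instead.
def ask_for_confirmation(self, message: str) -> bool:
    self.show_message(message)
    return input("[y/N] > ").strip().lower() in ("y", "yes")
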
def train(
        self,
        train_processed_data_dir: Path,
        val_processed_data_dir: Path,
        force_retrain: bool = False,
) -> NoReturn:
    """
    Trains the model on the training data. The trained model is saved to model_dir.
    This function saves a training-completed.txt file at the end as proof that training completed.

    :param train_processed_data_dir: the directory containing the processed train data
    :param val_processed_data_dir: the directory containing the processed val data
    :param force_retrain: if set to True, re-train the model even if it was already trained
        (removes the previously trained model)
    """
    if force_retrain or not self.is_training_completed():
        self.logger.info(self.logging_prefix + f"Training model at {self.model_dir}; "
                         f"train: {train_processed_data_dir}, val: {val_processed_data_dir}")
        IOUtils.rm_dir(self.model_dir)
        IOUtils.mk_dir(self.model_dir)

        # Save spec & configs of this model
        IOUtils.dump(self.model_dir / "config-dict.json", IOUtils.jsonfy(self.config), IOUtils.Format.jsonPretty)
        IOUtils.dump(self.model_dir / "spec.json", IOUtils.jsonfy(self.spec), IOUtils.Format.jsonPretty)

        self.train_impl(train_processed_data_dir, val_processed_data_dir)

        IOUtils.dump(self.model_dir / self.TRAINING_COMPLETED_FILE_NAME, str(time.time_ns()), IOUtils.Format.txt)
    # end if
    return

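# A plausible sketch of the is_training_completed check that train() relies
# on, assuming (as the final dump above suggests) that the marker file's
# existence in model_dir is the proof of a finished run; the actual
# implementation may also validate the file's contents.
def is_training_completed(self) -> bool:
    return (self.model_dir / self.TRAINING_COMPLETED_FILE_NAME).exists()
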
def __init__(self, database: Database):
    self.database = database
    self.output_dir = Macros.data_dir / "split"
    IOUtils.rm_dir(self.output_dir)
    IOUtils.mk_dir(self.output_dir)
    self.statistics = dict()
    return

def prepare_code(self):
    IOUtils.rm_dir(self.code_dir)
    IOUtils.mk_dir(self.code_dir.parent)
    with IOUtils.cd(self.code_dir.parent):
        BashUtils.run(f"git clone {self.REPO_URL} {self.code_dir.name}", expected_return_code=0)
    # end with

    with IOUtils.cd(self.code_dir):
        BashUtils.run(f"git checkout {self.REPO_SHA}", expected_return_code=0)
    # end with
    return

def prepare_code(self):
    IOUtils.rm_dir(self.code_dir)
    IOUtils.mk_dir(self.code_dir.parent)
    with IOUtils.cd(self.code_dir.parent):
        BashUtils.run(f"git clone {self.REPO_URL} {self.code_dir.name}", expected_return_code=0)
    # end with

    with IOUtils.cd(self.code_dir):
        BashUtils.run(f"git checkout {self.REPO_SHA}", expected_return_code=0)
    # end with

    # Copy eval code
    BashUtils.run(f"cp {Macros.this_dir}/eval/eval_utils.py {self.code_dir}/")
    return

def prepare_data(self):
    data_prefix = f"{self.eval_setting}-{self.year}"
    IOUtils.rm_dir(self.data_dir)
    IOUtils.mk_dir(self.data_dir)

    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/transformer.* {self.data_dir}/", expected_return_code=0)
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/src-test.txt {self.data_dir}/src-{Macros.test_common}.txt", expected_return_code=0)
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_common}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_common}.txt", expected_return_code=0)
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/src-test.txt {self.data_dir}/src-{Macros.test_standard}.txt", expected_return_code=0)
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.test_standard}/tgt-test.txt {self.data_dir}/tgt-{Macros.test_standard}.txt", expected_return_code=0)
    return

def prepare_data(self):
    if not self.use_latest:
        for t in range(13, 18):
            exp_dir = self.work_dir / f"{t}{t+1}-train"
            self.logger.info(f"Preparing the data for {t}-{t+1} at {exp_dir}")
            IOUtils.rm_dir(exp_dir)
            IOUtils.mk_dir(exp_dir)

            # Copy train data
            BashUtils.run(f"cp -r {self.model_data_dir}/20{t}-20{t+1}-train/train {exp_dir}/", expected_return_code=0)

            # Copy val and test data
            BashUtils.run(f"cp -r {self.model_data_dir}/20{t+1}-20{t+2}-val/valid {exp_dir}/", expected_return_code=0)
            BashUtils.run(f"cp -r {self.model_data_dir}/20{t+2}-20{t+3}-test/test {exp_dir}/", expected_return_code=0)

            # Copy vocab
            BashUtils.run(f"cp {self.model_data_dir}/20{t}-20{t+1}-train/vocab* {exp_dir}/", expected_return_code=0)
        # end for
    else:
        exp_dir = self.work_dir / "latest"
        IOUtils.rm_dir(exp_dir)
        IOUtils.mk_dir(exp_dir)

        # Copy train/val/test data
        BashUtils.run(f"cp -r {self.model_data_dir}/latest/train {exp_dir}/", expected_return_code=0)
        BashUtils.run(f"cp -r {self.model_data_dir}/latest/valid {exp_dir}/", expected_return_code=0)
        BashUtils.run(f"cp -r {self.model_data_dir}/latest/test {exp_dir}/", expected_return_code=0)

        # Copy vocab
        BashUtils.run(f"cp {self.model_data_dir}/latest/vocab* {exp_dir}/", expected_return_code=0)
    return

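# Illustration of the year windows built by the loop above (derived from its
# f-strings, not an addition to the logic): for t in 13..17,
#   t=13 -> train on 2013-2014, validate on 2014-2015, test on 2015-2016
#   ...
#   t=17 -> train on 2017-2018, validate on 2018-2019, test on 2019-2020
# i.e., each window trains on one year span and evaluates on the two
# subsequent spans, each in its own "{t}{t+1}-train" exp_dir.
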
def improve_project_model(self, prj_root: Optional[Path]):
    if prj_root is None:
        prj_root = RoosterizeDirUtils.auto_infer_project_root()

    # Deactivate loaded model
    self.model = None

    # Delete existing local model
    local_model_dir = RoosterizeDirUtils.get_local_model_dir(prj_root)
    if local_model_dir.exists():
        ans = self.ask_for_confirmation(
            f"A Roosterize model already exists at {local_model_dir}. "
            f"Do you want to delete it and train again?")
        if not ans:
            return
        IOUtils.rm_dir(local_model_dir)

    # Copy global model to local model, but remove the "training completed" marker
    global_model_dir = RoosterizeDirUtils.get_global_model_dir()
    if not global_model_dir.exists():
        raise Exception("Global Roosterize model not found! Please download the model first.")
    shutil.copytree(global_model_dir, local_model_dir)

    # Load local model
    self.load_local_model(prj_root)
    model = self.get_model()

    # Collect all lemmas in this project
    temp_data_dir = Path(tempfile.mkdtemp(prefix="roosterize"))
    DataMiner.extract_data_project(
        prj_root,
        files=None,
        exclude_files=self.exclude_files,
        exclude_pattern=self.exclude_pattern,
        serapi_options=self.infer_serapi_options(prj_root),
        output_path=temp_data_dir,
    )

    # TODO: Split data into train/val set, then process each data (no pre-processing / rebuilding vocab!)
    # TODO: Train model

    # Delete temp dir
    IOUtils.rm_dir(temp_data_dir)

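# A hedged sketch of the first TODO above: a random lemma-level train/val
# split. The 80/20 ratio, the fixed seed, and the function name are
# assumptions; the real implementation might split by file instead.
import random
from typing import List, Tuple


def split_train_val(lemmas: List, val_ratio: float = 0.2, seed: int = 0) -> Tuple[List, List]:
    shuffled = list(lemmas)
    random.Random(seed).shuffle(shuffled)
    n_val = int(len(shuffled) * val_ratio)
    # Return (train, val); the first n_val shuffled items form the val set
    return shuffled[n_val:], shuffled[:n_val]
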
def process_data(
        self,
        data_dir: Path,
        output_processed_data_dir: Path,
        is_train: bool = False,
) -> NoReturn:
    """
    Processes the data to the intermediate format.
    """
    self.logger.info(self.logging_prefix + f"Processing data from {data_dir} to {output_processed_data_dir}")
    IOUtils.rm_dir(output_processed_data_dir)
    IOUtils.mk_dir(output_processed_data_dir)

    if is_train:
        # Preprocess with training data, if needed
        self.preprocess_with_train_data(data_dir, output_processed_data_dir)
    # end if

    self.process_data_impl(data_dir, output_processed_data_dir)
    return

def collect_project(self, project_name: str, project_url: str):
    Environment.require_collector()

    # 0. Download repo
    downloads_dir = self.repos_downloads_dir / project_name
    results_dir = self.repos_results_dir / project_name

    # Remove previous results, if any
    IOUtils.rm_dir(results_dir)
    IOUtils.mk_dir(results_dir)

    # Clone the repo if it does not exist
    if not downloads_dir.exists():
        with IOUtils.cd(self.repos_downloads_dir):
            with TimeUtils.time_limit(300):
                BashUtils.run(f"git clone {project_url} {project_name}", expected_return_code=0)
            # end with
        # end with
    # end if

    project_data = ProjectData.create()
    project_data.name = project_name
    project_data.url = project_url

    # 1. Get list of revisions
    with IOUtils.cd(downloads_dir):
        git_log_out = BashUtils.run("git log --pretty=format:'%H %P'", expected_return_code=0).stdout
        for line in git_log_out.splitlines()[:self.MAX_REVISIONS]:
            shas = line.split()
            project_data.revisions.append(shas[0])
            project_data.parent_revisions[shas[0]] = shas[1:]
        # end for
    # end with

    # 2. Get revisions in different years
    with IOUtils.cd(downloads_dir):
        for year in self.YEARS:
            git_log_out = BashUtils.run(f"git rev-list -1 --before=\"Jan 1 {year}\" origin", expected_return_code=0).stdout
            project_data.year_revisions[str(year) + "_Jan_1"] = git_log_out.rstrip()
        # end for
    # end with

    project_data_file = results_dir / "project.json"
    IOUtils.dump(project_data_file, IOUtils.jsonfy(project_data), IOUtils.Format.jsonPretty)

    # 3. Start the Java collector
    # Prepare config
    log_file = results_dir / "collector-log.txt"
    output_dir = results_dir / "collector"
    config = {
        "collect": True,
        "projectDir": str(downloads_dir),
        "projectDataFile": str(project_data_file),
        "logFile": str(log_file),
        "outputDir": str(output_dir),
        "year": True,  # Whether to collect all evo data or yearly data
    }
    config_file = results_dir / "collector-config.json"
    IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)

    self.logger.info(f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}")
    rr = BashUtils.run(f"java -jar {Environment.collector_jar} {config_file}", expected_return_code=0)
    if rr.stderr:
        self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
    # end if

    # 4. In some cases, save collected data to an appropriate location or database
    # TODO private info
    # On the luzhou server for user pynie, move results to a dedicated location at /home/disk2
    if BashUtils.run("hostname").stdout.strip() == "luzhou" and BashUtils.run("echo $USER").stdout.strip() == "pynie":
        alter_results_dir = Path("/home/disk2/pynie/csevo-results") / project_name
        IOUtils.rm_dir(alter_results_dir)
        IOUtils.mk_dir(alter_results_dir.parent)
        BashUtils.run(f"mv {results_dir} {alter_results_dir}")
        self.logger.info(f"Results moved to {alter_results_dir}")
    # end if

    # -1. Remove repo
    IOUtils.rm_dir(downloads_dir)
    return

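# Self-contained illustration of the parsing in step 1 above: each line of
# `git log --pretty=format:'%H %P'` is "<commit sha> <parent shas...>". The
# SHAs below are made up for the example; a root commit has no parents.
sample_log = "aaa bbb ccc\nbbb ddd\nddd"
for line in sample_log.splitlines():
    shas = line.split()
    print(shas[0], "<-", shas[1:])  # revision <- its parents ([] for the root)
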
def process_data(self, method_data_list, data_type, output_dir, traversal) -> List[int]:
    self.logger.info("Start processing")

    # Use DeepCom's required names
    data_type = {
        Macros.train: "train",
        Macros.val: "valid",
        Macros.test: "test",
        "debug": "debug",
    }[data_type]

    # Initialize vocabs and error_ids (shared between processes)
    manager = multiprocessing.Manager()
    code_vocab = manager.dict()
    nl_vocab = manager.dict()
    sbt_vocab = manager.dict()
    vocabs_lock = manager.Lock()
    error_ids = manager.list()
    error_ids_lock = manager.Lock()

    # Multi-processing: split the tasks evenly
    tasks_each_process = len(method_data_list) // Macros.multi_processing + 1
    processes = list()
    for pid in range(Macros.multi_processing):
        beg = pid * tasks_each_process
        method_data_list_p = method_data_list[beg:beg + tasks_each_process]
        output_dir_p = output_dir / str(pid)
        IOUtils.mk_dir(output_dir_p)
        process = multiprocessing.Process(
            target=self.process_data_mp,
            args=(method_data_list_p, data_type, output_dir_p, pid, beg,
                  code_vocab, nl_vocab, sbt_vocab, vocabs_lock,
                  error_ids, error_ids_lock, traversal))
        process.start()
        processes.append(process)
    # end for

    for process in processes:
        process.join()
    # end for

    # Merge per-process results
    code_file_name = data_type + ".token.code"
    nl_file_name = data_type + ".token.nl"
    sbt_file_name = data_type + ".token.sbt"
    data_type_output_dir = output_dir / data_type
    IOUtils.mk_dir(data_type_output_dir)
    for pid in range(Macros.multi_processing):
        for fname in [code_file_name, nl_file_name, sbt_file_name]:
            BashUtils.run(f"cat {output_dir}/{pid}/{fname} >> {data_type_output_dir}/{fname}")
        # end for
        IOUtils.rm_dir(output_dir / str(pid))
    # end for
    error_ids.sort()

    # Build vocab
    if data_type == "train":
        code_vocab_file = output_dir / "vocab.code"
        nl_vocab_file = output_dir / "vocab.nl"
        sbt_vocab_file = output_dir / "vocab.sbt"

        special_tokens = ['<S>', '</S>', '<UNK>', '<KEEP>', '<DEL>', '<INS>', '<SUB>', '<NONE>']

        # Filter based on frequency: keep the MAX_VOCAB most frequent tokens.
        # Sorting is needed because manager.dict() preserves insertion order,
        # not frequency order (assuming each vocab dict maps token -> count).
        code_vocabs_list = special_tokens + [t for t, _ in sorted(code_vocab.items(), key=lambda kv: -kv[1])[:self.MAX_VOCAB]]
        nl_vocabs_list = special_tokens + [t for t, _ in sorted(nl_vocab.items(), key=lambda kv: -kv[1])[:self.MAX_VOCAB]]
        sbt_vocabs_list = special_tokens + [t for t, _ in sorted(sbt_vocab.items(), key=lambda kv: -kv[1])[:self.MAX_VOCAB]]

        # Write vocabs to files
        with open(code_vocab_file, "w+") as fcv, open(nl_vocab_file, "w+") as fnv, open(sbt_vocab_file, "w+") as fsv:
            for v in code_vocabs_list:
                fcv.write(v + "\n")
            for v in nl_vocabs_list:
                fnv.write(v + "\n")
            for v in sbt_vocabs_list:
                fsv.write(v + "\n")
    # end if

    return list(error_ids)

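# A hedged sketch of how a process_data_mp worker might update the shared
# vocabs under the lock passed above; the real worker also tokenizes the code,
# NL, and SBT sequences and writes the per-process output files, which is
# omitted here. The helper name is an assumption.
def _count_tokens(tokens, vocab, lock):
    with lock:
        for tok in tokens:
            # DictProxy supports get(), so this works on manager.dict() too
            vocab[tok] = vocab.get(tok, 0) + 1
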