def load_data(self, rel_path: Union[str, List[str]], fmt: IOUtils.Format, is_batched: bool = False, clz = None, ) -> Any:
    """Load data stored under ``data_dir / rel_path`` in the given format.

    When is_batched, rel_path names a directory of ``batch-<i>.<ext>`` files
    whose contents are concatenated (in batch-number order) into one list.
    When the format is json-based and clz is given, the loaded data is
    dejsonfy-ed into clz instances.

    :raises IOError: if nothing exists at the resolved path.
    """
    if self.is_json_format(fmt) and clz is None:
        self.logger.warning(f"Load data from {rel_path} with json format, but did not specify clz (at {traceback.format_stack()})")

    abs_path = self.data_dir / self.assemble_rel_path(rel_path)
    if not abs_path.exists():
        LoggingUtils.log_and_raise(self.logger, f"Cannot find data at {abs_path}", IOError)

    # Decide once whether deserialization into clz is needed.
    needs_dejsonfy = self.is_json_format(fmt) and clz is not None

    if not is_batched:
        loaded = IOUtils.load(abs_path, fmt)
        return IOUtils.dejsonfy(loaded, clz) if needs_dejsonfy else loaded

    # Batched: every file in the directory is assumed to be named "batch-<number>.<ext>".
    numbers = sorted(int(str(f.stem).split("-")[1]) for f in abs_path.iterdir())
    collected = list()
    for number in tqdm(numbers):
        batch = IOUtils.load(abs_path / f"batch-{number}.{fmt.get_extension()}", fmt)
        if needs_dejsonfy:
            batch = IOUtils.dejsonfy(batch, clz)
        collected.extend(batch)
    return collected
def test_dejsonfy_basic(self):
    """Primitives and plain containers should pass through dejsonfy unchanged."""
    for primitive in ["aaa", 42, 1.111]:
        self.assertEqual(primitive, IOUtils.dejsonfy(primitive))
    self.assertEqual([1, 2.0, "ccc"], IOUtils.dejsonfy([1, 2.0, "ccc"]))
    self.assertEqual({"f1": 1, "f2": 2.0, "f3": "ccc"}, IOUtils.dejsonfy({"f1": 1, "f2": 2.0, "f3": "ccc"}))
    return
def process_data_impl( self, data_dir: Path, output_processed_data_dir: Path, ) -> NoReturn:
    """Tokenize lemmas/definitions and write per-input-type src files plus the tgt file.

    Delegates to the superclass implementation at the end for any shared processing.
    """
    lemmas: List[Lemma] = IOUtils.dejsonfy(
        IOUtils.load(data_dir / "lemmas.json", IOUtils.Format.json), List[Lemma])
    definitions: List[Definition] = IOUtils.dejsonfy(
        IOUtils.load(data_dir / "definitions.json", IOUtils.Format.json), List[Definition])
    docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

    def as_lines(sentences) -> str:
        # One space-joined sentence per output line.
        return "".join(" ".join(tokens) + "\n" for tokens in sentences)

    # Inputs: one src file per input type.
    for input_type, src_sentences in self.get_all_inputs(lemmas, docs_sub_tokenizers).items():
        IOUtils.dump(
            output_processed_data_dir / f"src.{input_type}.txt",
            as_lines(src_sentences),
            IOUtils.Format.txt)

    # Outputs: one tgt line per lemma.
    IOUtils.dump(
        output_processed_data_dir / "tgt.txt",
        as_lines(self.get_output(lemma, docs_sub_tokenizers) for lemma in lemmas),
        IOUtils.Format.txt)

    super().process_data_impl(data_dir, output_processed_data_dir)
    return
def suggest_lemmas(**options):
    """Run the full suggestion pipeline: mine lemmas from a Coq project, load a
    trained model, process the mined data, evaluate, and print the suggestions.
    """
    from roosterize.data.DataMiner import DataMiner
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    project_path = Path(options["project"]).absolute()
    files = Utils.get_option_as_list(options, "files", None)
    exclude_files = Utils.get_option_as_list(options, "exclude-files", None)
    exclude_pattern = options.get("exclude-pattern", None)
    serapi_options = options.get("serapi-options", "")
    output_dir = Path(options["output"]).absolute()
    model_dir = Path(options["model-dir"]).absolute()

    # All intermediate artifacts live under the output dir.
    raw_data_dir = output_dir / "raw-data"
    processed_data_dir = output_dir / "eval-processed-data"
    result_dir = output_dir / "eval-result"

    # Extract data
    print(">>>>> Extracting lemmas ...")
    DataMiner.extract_data_project(project_path, files, exclude_files, exclude_pattern, serapi_options, raw_data_dir)

    # Get the ML model
    print(">>>>> Initializing model ...")
    model_spec = IOUtils.dejsonfy(IOUtils.load(model_dir / "spec.json", IOUtils.Format.json), ModelSpec)
    model = MLModels.get_model(model_dir, model_spec, is_eval=True)

    # Process data
    print(">>>>> Processing data ...")
    model.process_data(raw_data_dir, processed_data_dir)

    # Eval
    print(">>>>> Applying model ...")
    model.eval(processed_data_dir, result_dir)

    # Print suggestions
    print(">>>>> Suggestions:")
    print(IOUtils.load(result_dir / "suggestions.txt", IOUtils.Format.txt))
    return
def collect_data(cls, **options) -> NoReturn:
    """Dispatch one data-collection task over the corpus projects.

    Loads the project list from the corpus file (defaulting to
    ``projects-standalone-8.10.yml`` under the dataset dir), then runs the
    task named by ``options["task"]``.

    :raises ValueError: if the task is not one of the known TASK_* names.
    """
    data_mgr = FilesManager(cls.dataset_dir)
    task = options["task"]

    projects_path = Path(options.get("corpus", cls.dataset_dir / "projects-standalone-8.10.yml"))
    # Fix: use the Format enum (consistent with every other load call site),
    # not the raw string "json".
    projects: List[Project] = IOUtils.dejsonfy(IOUtils.load(projects_path, IOUtils.Format.json), Project)

    if task == cls.TASK_COQ_DOCUMENTS:
        files = Utils.get_option_as_list(options, "files", None)
        is_verifying_tokenizer = Utils.get_option_as_boolean(options, "verify-tokenizer")
        cls.collect_coq_documents_projects(data_mgr, projects, files, is_verifying_tokenizer)
    elif task == cls.TASK_DATA_INDEXES:
        cls.collect_data_indexes(data_mgr, projects)
    elif task == cls.TASK_DEFINITIONS:
        cls.collect_definitions(data_mgr)
    elif task == cls.TASK_INSTALL_COQ_PROJECTS:
        cls.install_coq_projects(projects)
    elif task == cls.TASK_LEMMA:
        files = Utils.get_option_as_list(options, "files", None)
        cls.collect_lemmas(data_mgr, projects, files)
    elif task == cls.TASK_LEMMA_BACKEND_SEXP_TRANSFORMATIONS:
        cls.collect_lemmas_backend_sexp_transformations(data_mgr)
    elif task == cls.TASK_LEMMA_FILTERED:
        cls.filter_lemmas(data_mgr)
    elif task == cls.TASK_LEMMA_FOREEND_SEXP_TRANSFORMATIONS:
        cls.collect_lemmas_foreend_sexp_transformations(data_mgr)
    else:
        # Unknown task: log and raise ValueError.
        LoggingUtils.log_and_raise(cls.logger, f"Unknown task {task}", ValueError)
    # end if
    return
def process_data_impl(self, data_dir: Path, output_processed_data_dir: Path, ) -> NoReturn:
    """Tokenize lemmas/definitions and serialize them into src.txt / tgt.txt,
    one space-joined token sequence per line.
    """
    lemmas: List[Lemma] = IOUtils.dejsonfy(IOUtils.load(data_dir / "lemmas.json", IOUtils.Format.json), List[Lemma])
    definitions: List[Definition] = IOUtils.dejsonfy(IOUtils.load(data_dir / "definitions.json", IOUtils.Format.json), List[Definition])

    docs_sub_tokenizers = SubTokenizer.get_docs_sub_tokenizers(lemmas, definitions)

    # Put data in serialized files.
    # Fix: dropped the pointless f-prefix on the placeholder-free file names (flake8 F541).
    IOUtils.dump(
        output_processed_data_dir / "src.txt",
        "".join([" ".join(self.get_input(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas]),
        IOUtils.Format.txt)
    IOUtils.dump(
        output_processed_data_dir / "tgt.txt",
        "".join([" ".join(self.get_output(lemma, docs_sub_tokenizers)) + "\n" for lemma in lemmas]),
        IOUtils.Format.txt)
    return
def __init__( self, model_dir: Path, model_spec: ModelSpec, config_clz: type, ):
    """Remember the model location/spec and materialize the model config.

    The config comes from the spec's config_dict when present, otherwise
    from the config class's defaults.
    """
    self.model_dir = model_dir
    self.spec = model_spec
    if model_spec.config_dict is None:
        # No stored config: fall back to the config class's defaults.
        self.config: TConfig = config_clz()
    else:
        self.config: TConfig = IOUtils.dejsonfy(model_spec.config_dict, config_clz)
    self.logger.info(f"{type(self).__name__} {self.spec.model} created with config {self.config}")
    return
def load_local_model(self, prj_root: Path) -> None:
    """
    Try to load the local model, if it exists; otherwise do nothing.
    """
    if self.model is not None:
        # A model is already loaded; keep it.
        return
    local_model_dir = RoosterizeDirUtils.get_local_model_dir(prj_root)
    if not local_model_dir.is_dir():
        # No local model saved for this project.
        return
    model_spec = IOUtils.dejsonfy(
        IOUtils.load(local_model_dir / "spec.json", IOUtils.Format.json),
        ModelSpec,
    )
    self.model = MLModels.get_model(local_model_dir, model_spec, is_eval=True)
def get_model(self) -> NamingModelBase:
    """Return the currently loaded model, loading the global model on demand.

    A project-local model takes effect only if load_local_model was invoked
    before this method.
    """
    if self.model is not None:
        return self.model
    # Nothing loaded yet: fall back to the global model.
    global_model_dir = RoosterizeDirUtils.get_global_model_dir()
    model_spec = IOUtils.dejsonfy(
        IOUtils.load(global_model_dir / "spec.json", IOUtils.Format.json),
        ModelSpec,
    )
    self.model = MLModels.get_model(global_model_dir, model_spec, is_eval=True)
    return self.model
def eval_model(**options):
    """Evaluate a trained model on already-mined data, writing results under the output dir."""
    from roosterize.data.ModelSpec import ModelSpec
    from roosterize.ml.MLModels import MLModels

    data_dir = Path(options["data"]).absolute()
    model_dir = Path(options["model-dir"]).absolute()
    output_dir = Path(options["output"]).absolute()

    # Reconstruct the ML model from its saved spec.
    model_spec = IOUtils.dejsonfy(IOUtils.load(model_dir / "spec.json", IOUtils.Format.json), ModelSpec)
    model = MLModels.get_model(model_dir, model_spec, is_eval=True)

    # Process data, then eval.
    processed_dir = output_dir / "eval-processed-data"
    model.process_data(data_dir, processed_dir)
    model.eval(processed_dir, output_dir / "eval-result")
    return
def test_dejsonfy_record_class(self):
    """A nested json dict should dejsonfy into the record class, coercing "66" to int
    and building the nested record class from its sub-dict.
    """
    json_input = {
        "field_str": "aaa",
        "field_int": 42,
        "field_int_2": "66",
        "field_list": [1, 2],
        "nested_rc": {"f": 225},
    }
    expected = test_IOUtils.ExampleRecordClass(
        field_str="aaa",
        field_int=42,
        field_int_2=66,
        field_list=[1, 2],
        nested_rc=test_IOUtils.ExampleSimpleRecordClass(f=225),
    )
    self.assertEqual(expected, IOUtils.dejsonfy(json_input, test_IOUtils.ExampleRecordClass))
    return
def iter_batched_data( self, rel_path: Union[str, List[str]], fmt: IOUtils.Format, clz=None, ) -> Iterator:
    """Lazily yield entries from the ``batch-<i>.<ext>`` files under
    ``data_dir / rel_path``, in batch-number order.

    When the format is json-based and clz is given, each entry is
    dejsonfy-ed into a clz instance.

    :raises IOError: if the resolved directory does not exist.
    """
    if self.is_json_format(fmt) and clz is None:
        logger.warning(f"Load data from {rel_path} with json format, but did not specify clz")

    abs_path = self.data_dir / self.assemble_rel_path(rel_path)
    if not abs_path.exists():
        raise IOError(f"Cannot find data at {abs_path}")

    # Decide once whether per-entry deserialization is needed.
    should_dejsonfy = self.is_json_format(fmt) and clz is not None

    for number in sorted(int(str(f.stem).split("-")[1]) for f in abs_path.iterdir()):
        batch_file = abs_path / f"batch-{number}.{fmt.get_extension()}"
        for entry in IOUtils.load(batch_file, fmt):
            yield IOUtils.dejsonfy(entry, clz) if should_dejsonfy else entry
def test_dejsonfy_seqs(self):
    """A json list should dejsonfy into list/tuple/set per the requested type."""
    cases = [
        ([1, 2, 3], List[int]),
        ((1, 2, 3), Tuple[int, int, int]),
        ({1, 2, 3}, Set[int]),
    ]
    for expected, target_clz in cases:
        self.assertEqual(expected, IOUtils.dejsonfy([1, 2, 3], target_clz))
    return
def test_dejsonfy_enum(self):
    """An enum member should be recoverable from its value via dejsonfy."""
    dejsonfied = IOUtils.dejsonfy(3, test_IOUtils.ExampleEnum)
    self.assertEqual(test_IOUtils.ExampleEnum.Item3, dejsonfied)
    return
def generate_configs(cls, name: str, path: Path, **options):
    """Enumerate and save every valid config combination for the named model.

    For each config attribute, the candidate values come either from the
    space-separated option string with that attribute's name, or default to
    the attribute's current (default) value. The Cartesian product of all
    per-attribute choices is explored; each combination passing repOk() is
    dumped to ``path/name/<config>.json`` and a YAML-ish index of the saved
    config files is printed at the end.
    """
    config_files: Set[str] = set()
    ml_model_clz = cls.NAMES_MODELS[name]
    # Start from the default-constructed config; its __dict__ lists all attrs.
    config = ml_model_clz.config_clz()
    type_hints = get_type_hints(ml_model_clz.config_clz)
    model_path = path/name
    model_path.mkdir(parents=True, exist_ok=True)
    cls.logger.info(f"Possible attrs and default values: {config.__dict__}")

    # Build the per-attribute choice lists, consuming recognized options.
    attrs_choices: dict = dict()
    attrs: list = list()
    for k, default_v in config.__dict__.items():
        attrs.append(k)
        if k not in options:
            # Not overridden: the only choice is the default value.
            attrs_choices[k] = [default_v]
        else:
            # Parse each space-separated candidate according to the attr's type hint.
            if type_hints[k] == bool:
                attrs_choices[k] = [v == "True" for v in str(options[k]).split()]
            elif issubclass(type_hints[k], recordclass.mutabletuple):
                # Record-class-typed attrs are given as json; "None" means no value.
                attrs_choices[k] = [IOUtils.dejsonfy(v, type_hints[k]) if v != "None" else None for v in str(options[k]).split()]
            else:
                attrs_choices[k] = [type_hints[k](v) for v in str(options[k]).split()]
            # end if
            # Deduplicate the candidates (order is irrelevant for exploration).
            attrs_choices[k] = list(set(attrs_choices[k]))
            cls.logger.debug(f"attr {k}, choices: {attrs_choices[k]}")
            options.pop(k)
        # end if
    # end for
    if len(options) > 0:
        cls.logger.warning(f"These options are not recognized: {options.keys()}")
    # end if

    # Odometer-style enumeration: candidate[i] indexes into attrs_choices[attrs[i]].
    candidate = [0] * len(attrs_choices)
    is_explore_finished = False
    while True:
        # Generate current candidate
        for i, attr in enumerate(attrs):
            config.__setattr__(attr, attrs_choices[attr][candidate[i]])
        # end for
        if config.repOk():
            # Adjust batch size (only if the config class provides the hook)
            adjust_batch_size_func = getattr(config, "adjust_batch_size", None)
            if callable(adjust_batch_size_func):
                adjust_batch_size_func()
            # end if
            config_file = model_path / (str(config)+".json")
            cls.logger.info(f"Saving candidate to {config_file}: {config}")
            config_files.add(name + "/" + str(config) + ".json")
            IOUtils.dump(config_file, IOUtils.jsonfy(config), IOUtils.Format.jsonPretty)
        else:
            cls.logger.info(f"Skipping invalid candidate: {config}")
        # end if
        # To next candidate: increment with carry, least-significant digit first.
        for i, attr in enumerate(attrs):
            candidate[i] += 1
            if candidate[i] >= len(attrs_choices[attr]):
                # Digit overflowed: reset and carry into the next one.
                candidate[i] = 0
                if i == len(attrs) - 1:
                    # Carried out of the most-significant digit: all done.
                    is_explore_finished = True
                    break
                else:
                    continue
                # end if
            else:
                break
            # end if
        # end for
        if is_explore_finished:
            break
    # end while

    # Print an index of the generated config files.
    for config_file in config_files:
        print(f"- model: {name}")
        print(f" config-file: {config_file}")
        print()
    # end for
    return