def reduce_process(self):
    """Merge per-worker NER datasets into train/test splits and copy map files.

    Reads every worker's ``dataset.jsonl`` under ``self.main_folder`` twice:
    a first pass counts the total ads so the split sizes can be computed from
    ``self.test_perc``, a second pass streams each line into the train split
    (first ``train_size`` lines) or the test split (the rest). Finally copies
    every non-dataset file (the mapping JSONs) from the first worker folder
    into the reduced output folder.
    """
    workers_folder = [os.path.join(self.main_folder, folder)
                      for folder in os.listdir(self.main_folder)]
    sc.message("Processing files")
    print(workers_folder)
    reduced_folder = sc.check_folder(
        os.path.join(self.output_folder, "ner_encoded"))
    train_folder = sc.check_folder(os.path.join(reduced_folder, "train"))
    test_folder = sc.check_folder(os.path.join(reduced_folder, "test"))

    # First pass: count lines. `with` closes each source file (the original
    # iterated a bare open() and leaked the handles).
    total_ads = 0
    for dataset_folder in workers_folder:
        with open(os.path.join(dataset_folder, "dataset.jsonl"), "r",
                  encoding="utf-8") as src:
            for _ in src:
                total_ads += 1
    test_size = int(total_ads * self.test_perc)
    train_size = total_ads - test_size

    # Second pass: stream lines into the split files. Both outputs are opened
    # once in append mode (matching the original semantics) instead of being
    # re-opened for every single line.
    total_ads = 0
    with open(os.path.join(train_folder, "dataset.jsonl"), "a",
              encoding="utf-8") as train_js, \
         open(os.path.join(test_folder, "dataset.jsonl"), "a",
              encoding="utf-8") as test_js:
        for dataset_folder in workers_folder:
            with open(os.path.join(dataset_folder, "dataset.jsonl"), "r",
                      encoding="utf-8") as src:
                for line in src:
                    target = train_js if total_ads < train_size else test_js
                    target.write(line)
                    total_ads += 1

    # Copy the mapping files from the first worker folder only — presumably
    # all workers share identical maps (TODO confirm against the map writer).
    maps_path = [os.path.join(workers_folder[0], file_name)
                 for file_name in os.listdir(workers_folder[0])
                 if "dataset" not in file_name]
    for map_path in maps_path:  # renamed: don't shadow the builtin `map`
        copy(map_path, reduced_folder)
    sc.message("DONE! Save @{}".format(reduced_folder))
def __init__(self, model_folder: str = None, debug: bool = False):
    """Build the dated, per-worker model folder and preload the maps.

    The model folder is ``<model_folder>/<today's UTC date>/<self.id>``;
    each level is created on demand via ``sc.check_folder``.
    """
    super().__init__()
    run_date = str(datetime.date(datetime.utcnow()))
    dated_folder = sc.check_folder(os.path.join(model_folder, run_date))
    self.model_folder = sc.check_folder(os.path.join(dated_folder, self.id))
    self.preload_maps()
    self.advertise_counter = 0
    self.debug = debug
def __init__(self, main_folder: str, output_folder: str, debug: bool = False):
    """Resolve today's input and output folders and store the debug flag.

    Both folders are suffixed with ``self.today_date`` and created on
    demand via ``sc.check_folder``.
    """
    def _dated(base: str) -> str:
        # One helper for both paths: append today's date, ensure it exists.
        return sc.check_folder(os.path.join(base, self.today_date))

    self.main_folder = _dated(main_folder)
    self.output_folder = _dated(output_folder)
    self.debug = debug
def __init__(self, seq_max_len: int = 50, max_len_char: int = 10,
             model_folder: str = None, maps_folder: str = None,
             update_maps: bool = False):
    """Configure sequence limits, build the dated model folder, preload maps.

    The model folder is ``<model_folder>/<today's UTC date>/<self.id>``;
    each level is created on demand via ``sc.check_folder``.
    """
    super().__init__()
    self.seq_max_len = seq_max_len
    self.max_len_char = max_len_char
    run_date = str(datetime.date(datetime.utcnow()))
    dated_folder = sc.check_folder(os.path.join(model_folder, run_date))
    self.model_folder = sc.check_folder(os.path.join(dated_folder, self.id))
    self.preload_maps(maps_folder)
    self.advertise_counter = 0
    self.processed_counter = 0
    self.update_maps = update_maps
def __init__(self, output_folder: str, properties_path: str,
             measure_exceptions: List[str], debug: bool = False):
    """Build the dated output folder and the NER/measure lookup structures.

    Loads the per-property NER dictionary from ``properties_path``, compiles
    the regex rules, and derives the measure / non-measure maps. When
    ``debug`` is on, both derived maps are pretty-printed.
    """
    super().__init__(debug=debug)
    run_date = str(datetime.date(datetime.utcnow()))
    dated_folder = sc.check_folder(os.path.join(output_folder, run_date))
    self.output_folder = sc.check_folder(os.path.join(dated_folder, self.id))
    self.ner_dict: Dict[str, Set[str]] = self.preload_maps(
        folder=properties_path)
    self.reg_rules = self.build_regex()
    self.measure_map = self.generate_measure_map(self.ner_dict,
                                                 measure_exceptions)
    self.non_measure_map = self.generate_non_measure_map()
    if self.debug:
        pprint(self.measure_map)
        pprint(self.non_measure_map)
def reduce_process(self):
    """Concatenate every worker's ``sequence_enriched.jsonl`` into one file.

    Appends (mode ``"a"``) each worker's enriched sequence lines to
    ``<output_folder>/sequence/dataset.jsonl``.
    """
    workers_folder = [os.path.join(self.main_folder, folder)
                      for folder in os.listdir(self.main_folder)]
    sc.message("Processing files")
    print(workers_folder)
    reduced_folder = sc.check_folder(
        os.path.join(self.output_folder, "sequence"))
    with open(os.path.join(reduced_folder, "dataset.jsonl"), "a",
              encoding="utf-8") as js:
        for worker_folder in workers_folder:
            # `with` closes each source file — the original iterated a bare
            # open() and leaked one file handle per worker.
            with open(os.path.join(worker_folder, "sequence_enriched.jsonl"),
                      "r", encoding="utf-8") as src:
                for line in src:
                    js.write(line)
    sc.message("DONE !")
def reduce_process(self):
    """Merge per-worker char/word/tag vocabularies into global index maps.

    Takes the union of keys from every worker's ``char2idx.json``,
    ``word2idx.json`` and ``tag2idx.json``, re-assigns contiguous indices —
    reserving 0 for ``__PAD__`` and, for chars/words, 1 for ``UNK`` — and
    saves the merged maps under ``<output_folder>/ner_mapping``.
    """
    workers_folder = [os.path.join(self.main_folder, folder)
                      for folder in os.listdir(self.main_folder)]
    if self.debug:
        print("Paths being aggregated...")
        print(workers_folder)

    chars, words, tags = set(), set(), set()
    sc.message("Processing files...")
    for worker_folder in workers_folder:
        # set.update over the loaded dicts replaces the original manual
        # key-copy loops; iterating a dict yields its keys.
        chars.update(sc.load_json(os.path.join(worker_folder, "char2idx.json")))
        words.update(sc.load_json(os.path.join(worker_folder, "word2idx.json")))
        tags.update(sc.load_json(os.path.join(worker_folder, "tag2idx.json")))

    reduced_folder = sc.check_folder(
        os.path.join(self.output_folder, "ner_mapping"))

    # Strip the reserved tokens before re-indexing. `discard` (unlike the
    # original `remove`) does not raise KeyError when a worker map happened
    # not to contain the token — the original already guarded tags' "UNK"
    # this way, showing the tokens are not guaranteed present.
    for vocab in (chars, words):
        vocab.discard("__PAD__")
        vocab.discard("UNK")
    tags.discard("__PAD__")
    tags.discard("UNK")  # tags drop UNK entirely in the merged output

    # Sort before enumerating so index assignment is deterministic across
    # runs (plain-set iteration order varies between interpreter sessions).
    basec2i = {"__PAD__": 0, "UNK": 1}
    basec2i.update({c: i + 2 for i, c in enumerate(sorted(chars))})
    basew2i = {"__PAD__": 0, "UNK": 1}
    basew2i.update({w: i + 2 for i, w in enumerate(sorted(words))})
    baset2i = {"__PAD__": 0}
    baset2i.update({t: i + 1 for i, t in enumerate(sorted(tags))})

    sc.message("Saving chars")
    sc.save_dict_2json(os.path.join(reduced_folder, "char2idx.json"), basec2i)
    sc.message("Saving words")
    sc.save_dict_2json(os.path.join(reduced_folder, "word2idx.json"), basew2i)
    sc.message("Saving tags")
    sc.save_dict_2json(os.path.join(reduced_folder, "tag2idx.json"), baset2i)