Beispiel #1
0
    def reduce_process(self):
        """Merge every worker's ``dataset.jsonl`` into a train/test split.

        Every entry of ``self.main_folder`` is treated as a worker folder that
        holds a ``dataset.jsonl`` file. All lines are counted first; then, in
        listing order, the first ``total - int(total * self.test_perc)`` lines
        are appended to ``<output_folder>/ner_encoded/train/dataset.jsonl`` and
        the remaining ones to ``<output_folder>/ner_encoded/test/dataset.jsonl``.
        Finally every file of the first worker folder whose name does not
        contain "dataset" is copied into ``<output_folder>/ner_encoded``.

        Output files are opened in append mode, so existing content is kept.

        Raises:
            IndexError: if ``self.main_folder`` has no entries, because the
                maps are taken from the first worker folder.
        """
        workers_folder = [os.path.join(self.main_folder, folder) for folder in os.listdir(self.main_folder)]
        dataset_paths = [os.path.join(folder, "dataset.jsonl") for folder in workers_folder]

        sc.message("Processing files")
        print(workers_folder)

        reduced_folder = sc.check_folder(os.path.join(self.output_folder, "ner_encoded"))
        train_folder = sc.check_folder(os.path.join(reduced_folder, "train"))
        test_folder = sc.check_folder(os.path.join(reduced_folder, "test"))

        # Count lines (each input is closed as soon as it has been read)
        total_ads = 0
        for dataset_path in dataset_paths:
            with open(dataset_path, "r", encoding="utf-8") as source:
                total_ads += sum(1 for _ in source)

        test_size = int(total_ads * self.test_perc)
        train_size = total_ads - test_size

        # Save train/test set.
        # Each output file is opened lazily, once, instead of once per line;
        # a split that receives no line never gets a file created.
        writers = {}
        written = 0
        try:
            for dataset_path in dataset_paths:
                with open(dataset_path, "r", encoding="utf-8") as source:
                    for line in source:
                        save_folder = train_folder if written < train_size else test_folder
                        js = writers.get(save_folder)
                        if js is None:
                            js = open(os.path.join(save_folder, "dataset.jsonl"), "a", encoding="utf-8")
                            writers[save_folder] = js
                        js.write(line)
                        written += 1
        finally:
            for js in writers.values():
                js.close()

        # Copy maps
        first_worker = workers_folder[0]
        maps_path = [
            os.path.join(first_worker, file_name)
            for file_name in os.listdir(first_worker)
            if "dataset" not in file_name
        ]
        for map_path in maps_path:
            copy(map_path, reduced_folder)

        sc.message("DONE! Save @{}".format(reduced_folder))
Beispiel #2
0
 def __init__(self, model_folder: str = None, debug: bool = False):
     """Prepare a dated, per-instance model folder and preload the maps.

     Args:
         model_folder: Base folder. A sub-folder named after the current UTC
             date, and inside it one named after ``self.id``, are each
             passed through ``sc.check_folder``.
         debug: Stored on the instance as ``self.debug``.
     """
     super().__init__()

     # NOTE(review): ``self.id`` is not set in this method; presumably it is
     # provided by the base class - confirm.
     utc_today = datetime.utcnow().date()
     dated_folder = os.path.join(model_folder, str(utc_today))
     self.model_folder = sc.check_folder(dated_folder)
     instance_folder = os.path.join(self.model_folder, self.id)
     self.model_folder = sc.check_folder(instance_folder)

     self.preload_maps()
     self.advertise_counter = 0
     self.debug = debug
Beispiel #3
0
 def __init__(self, main_folder: str, output_folder: str, debug: bool = False):
     """Resolve the dated input and output folders used by this instance.

     Args:
         main_folder: Base input folder; ``self.today_date`` is appended.
         output_folder: Base output folder; ``self.today_date`` is appended.
         debug: Stored on the instance as ``self.debug``.
     """
     # NOTE(review): ``self.today_date`` is not defined in this method;
     # presumably a class-level attribute or property - confirm.
     dated_input = os.path.join(main_folder, self.today_date)
     self.main_folder = sc.check_folder(dated_input)

     dated_output = os.path.join(output_folder, self.today_date)
     self.output_folder = sc.check_folder(dated_output)

     self.debug = debug
 def __init__(
     self,
     seq_max_len: int = 50,
     max_len_char: int = 10,
     model_folder: str = None,
     maps_folder: str = None,
     update_maps: bool = False,
 ):
     """Store the length limits, prepare the model folder and preload maps.

     Args:
         seq_max_len: Stored as ``self.seq_max_len``.
         max_len_char: Stored as ``self.max_len_char``.
         model_folder: Base folder. A sub-folder named after the current UTC
             date, and inside it one named after ``self.id``, are each
             passed through ``sc.check_folder``.
         maps_folder: Forwarded to ``self.preload_maps``.
         update_maps: Stored as ``self.update_maps``.
     """
     super().__init__()
     self.seq_max_len = seq_max_len
     self.max_len_char = max_len_char

     # NOTE(review): ``self.id`` is not set in this method; presumably it is
     # provided by the base class - confirm.
     utc_today = datetime.utcnow().date()
     dated_folder = os.path.join(model_folder, str(utc_today))
     self.model_folder = sc.check_folder(dated_folder)
     instance_folder = os.path.join(self.model_folder, self.id)
     self.model_folder = sc.check_folder(instance_folder)

     self.preload_maps(maps_folder)

     # Both counters start at zero.
     self.advertise_counter = 0
     self.processed_counter = 0
     self.update_maps = update_maps
 def __init__(self,
              output_folder: str,
              properties_path: str,
              measure_exceptions: List[str],
              debug: bool = False):
     """Prepare the output folder and build the lookup maps and regex rules.

     Args:
         output_folder: Base folder. A sub-folder named after the current
             UTC date, and inside it one named after ``self.id``, are each
             passed through ``sc.check_folder``.
         properties_path: Forwarded to ``self.preload_maps`` as ``folder``.
         measure_exceptions: Forwarded to ``self.generate_measure_map``
             together with the preloaded dictionary.
         debug: Forwarded to the base-class initializer; when ``self.debug``
             is truthy afterwards, both generated maps are pretty-printed.
     """
     super().__init__(debug=debug)

     # NOTE(review): ``self.id`` and ``self.debug`` are not set in this
     # method; presumably the base class provides them - confirm.
     utc_today = datetime.utcnow().date()
     dated_folder = os.path.join(output_folder, str(utc_today))
     self.output_folder = sc.check_folder(dated_folder)
     instance_folder = os.path.join(self.output_folder, self.id)
     self.output_folder = sc.check_folder(instance_folder)

     self.ner_dict: Dict[str, Set[str]] = self.preload_maps(folder=properties_path)
     self.reg_rules = self.build_regex()
     self.measure_map = self.generate_measure_map(self.ner_dict, measure_exceptions)
     self.non_measure_map = self.generate_non_measure_map()

     if not self.debug:
         return
     for generated_map in (self.measure_map, self.non_measure_map):
         pprint(generated_map)
    def reduce_process(self):
        """Concatenate every worker's ``sequence_enriched.jsonl`` into one file.

        Every entry of ``self.main_folder`` is treated as a worker folder that
        holds a ``sequence_enriched.jsonl`` file. Their lines are appended, in
        listing order, to ``<output_folder>/sequence/dataset.jsonl``. The
        output is opened in append mode, so existing content is kept.
        """
        workers_folder = [
            os.path.join(self.main_folder, folder)
            for folder in os.listdir(self.main_folder)
        ]

        sc.message("Processing files")
        print(workers_folder)

        reduced_folder = sc.check_folder(
            os.path.join(self.output_folder, "sequence"))

        with open(os.path.join(reduced_folder, "dataset.jsonl"),
                  "a",
                  encoding="utf-8") as js:
            for folderzin in workers_folder:
                source_path = os.path.join(folderzin,
                                           "sequence_enriched.jsonl")
                # Close each input as soon as it has been copied instead of
                # leaving the handle to the garbage collector.
                with open(source_path, "r", encoding="utf-8") as source:
                    js.writelines(source)

        sc.message("DONE !")
Beispiel #7
0
    def reduce_process(self):
        """Merge the per-worker char/word/tag vocabularies into single maps.

        Every entry of ``self.main_folder`` is treated as a worker folder that
        holds ``char2idx.json``, ``word2idx.json`` and ``tag2idx.json``. The
        keys of all workers are unioned and re-numbered:

        * chars and words: ``"__PAD__"`` is 0, ``"UNK"`` is 1, every other key
          gets an id starting at 2;
        * tags: ``"__PAD__"`` is 0, every other key gets an id starting at 1;
          an ``"UNK"`` tag, if present, is dropped.

        The three resulting maps are saved under
        ``<output_folder>/ner_mapping``.

        Keys are numbered in sorted order so the generated ids are
        reproducible between runs (plain set iteration order is not stable
        for strings). Reserved tokens are removed with a set difference, so a
        worker set that lacks them, or an empty ``main_folder``, no longer
        raises ``KeyError``.
        """
        workers_folder = [
            os.path.join(self.main_folder, folder)
            for folder in os.listdir(self.main_folder)
        ]

        if self.debug:
            print("Paths being aggregated...")
            print(workers_folder)

        char2idx = set()
        word2idx = set()
        tag2idx = set()

        sc.message("Processing files...")

        # Only the keys of each worker map matter; the old ids are discarded.
        for folderzin in workers_folder:
            tmp_chars = sc.load_json(os.path.join(folderzin, "char2idx.json"))
            tmp_words = sc.load_json(os.path.join(folderzin, "word2idx.json"))
            tmp_tags = sc.load_json(os.path.join(folderzin, "tag2idx.json"))

            char2idx.update(tmp_chars.keys())
            word2idx.update(tmp_words.keys())
            tag2idx.update(tmp_tags.keys())

        reduced_folder = sc.check_folder(
            os.path.join(self.output_folder, "ner_mapping"))
        basec2i = {"__PAD__": 0, "UNK": 1}
        basew2i = {"__PAD__": 0, "UNK": 1}
        baset2i = {"__PAD__": 0}

        # Reserved tokens already have fixed ids in the base maps; strip them
        # from the merged sets (tolerating their absence).
        reserved = {"__PAD__", "UNK"}
        char2idx -= reserved
        word2idx -= reserved
        tag2idx -= reserved

        # NOTE(review): sorting assumes all keys are strings, as JSON object
        # keys are - confirm against sc.load_json.
        for i, c in enumerate(sorted(char2idx), start=2):
            basec2i[c] = i

        for i, w in enumerate(sorted(word2idx), start=2):
            basew2i[w] = i

        for i, t in enumerate(sorted(tag2idx), start=1):
            baset2i[t] = i

        sc.message("Saving chars")
        sc.save_dict_2json(os.path.join(reduced_folder, "char2idx.json"),
                           basec2i)
        sc.message("Saving words")
        sc.save_dict_2json(os.path.join(reduced_folder, "word2idx.json"),
                           basew2i)
        sc.message("Saving tags")
        sc.save_dict_2json(os.path.join(reduced_folder, "tag2idx.json"),
                           baset2i)