def update(self, word: Text, repl_word: Text):
    word = ''.join(flat_hangeul(word))
    repl_word = ''.join(flat_hangeul(repl_word))
    value = self.uni_dict.get(word)
    if value:
        # Move the existing count from the old spelling to the replacement word.
        self.uni_dict[repl_word] = self.uni_dict.pop(word)
    else:
        raise ValueError("Specified word does not exist in the default unigram dictionary")
def search(self, word: Text):
    word = ''.join(flat_hangeul(word))
    return bool(self.uni_dict.get(word))
def infer(self, word: Text, **kwargs):
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    suggestions = self.symspell.lookup(''.join(flat_hangeul(word)),
                                       suggestion_verbosity,
                                       self.max_edit_dist)
    if suggestions:
        word = list(suggestions[0].term)
        return merge_flatted_hangeul(word)
    return word
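# Usage sketch for infer (hypothetical instance name `corrector`; assumes the
# enclosing class has been constructed with its symspell dictionaries loaded):
#   corrected = corrector.infer('맞춤뻡')
# Returns the closest dictionary word, or the input unchanged when no suggestion is found.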
def train(self, corpus_path: str, save_path: str, unigram_dict_prefix: str,
          bigram_dict_prefix: str = None, **kwargs):
    self.symspell.create_dictionary(corpus_path)

    # 1) Unigram dict
    worddict = ''
    for key, count in self.symspell.words.items():
        worddict += '{} {}\n'.format(''.join(flat_hangeul(key)), count)
    unigram_save_path = os.path.join(save_path, unigram_dict_prefix + '.txt')
    with open(unigram_save_path, 'w', encoding='utf-8') as file:
        file.write(worddict)
    print("Total {} unigrams are saved!".format(len(self.symspell.words)))

    if bigram_dict_prefix:
        # 2) Bigram dict
        with open(corpus_path, 'r', encoding='utf-8') as file:
            corpus = [s.strip() for s in file.readlines()]
        bi_count = self.count_bigrams(corpus, min_count=5)
        bi_dict = ''
        for key, count in bi_count.items():
            s1, s2 = key.split(' ')
            bi_dict += '{} {} {}\n'.format(''.join(flat_hangeul(s1)),
                                           ''.join(flat_hangeul(s2)), count)
        bigram_save_path = os.path.join(save_path, bigram_dict_prefix + '.txt')
        with open(bigram_save_path, 'w', encoding='utf-8') as bi_file:
            bi_file.write(bi_dict)
        print("Total {} bigrams are saved!".format(len(bi_count)))
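# Training sketch (paths and the `corrector` instance name are placeholders):
#   corrector.train(corpus_path='corpus.txt', save_path='./dicts',
#                   unigram_dict_prefix='unigram', bigram_dict_prefix='bigram')
# This writes 'unigram.txt' (and optionally 'bigram.txt') containing jamo-flattened
# "token count" lines that can later be loaded as the symspell dictionaries.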
def save_dict(self, local_path: str = None):
    worddict = ''
    for key, count in self.uni_dict.items():
        worddict += '{} {}\n'.format(''.join(flat_hangeul(key)), count)
    save_path = local_path if local_path else self.path
    with open(save_path, 'w', encoding='utf-8') as file:
        file.write(worddict)
def noise_maker(word: str, threshold: float = 0.3, noise_char_num: int = 2, method: str = 'g'):
    """
    :param word: single word
    :param threshold: probability of generating noise
    :param noise_char_num: maximum number of noise characters
    :param method: generation method ("g": general, "ct": close typo,
                   "d": delete character, "s": switch characters)
    """
    noise_dict_key = {
        'g': noise_dict,
        'ct': typo_pattern,
        'd': null_dict,
    }
    if method not in ['g', 'ct', 'd', 's']:
        raise ValueError("Specified generation method is not supported")
    assert 0 <= threshold <= 1

    text_tokenized = flat_hangeul(word)
    rand = random.random()
    if rand >= threshold:
        # No noise this time: return the original tokens and word unchanged.
        return text_tokenized, word

    sample_num_noise = random.choice(range(1, noise_char_num + 1))
    rand_idx = random.sample(range(len(text_tokenized)),
                             min(len(text_tokenized), sample_num_noise))
    for idx in rand_idx:
        if method in ['g', 'ct', 'd']:
            n_dict = noise_dict_key[method]
            if text_tokenized[idx] in n_dict:
                text_tokenized[idx] = random.choice(n_dict[text_tokenized[idx]])
        elif method == 's':
            text_tokenized = switch_char(text_tokenized, idx)
    return text_tokenized, merge_flatted_hangeul(text_tokenized)
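# Example usage of noise_maker (output is random; threshold=1.0 forces noise injection):
#   tokens, noisy = noise_maker('안녕하세요', threshold=1.0, noise_char_num=1, method='g')
#   # `tokens` is the jamo-level token list, `noisy` the re-merged (possibly corrupted) word.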
def text_to_token(text: str) -> list:
    return flat_hangeul(text)
def create(self, word: Text, count: int = 1000):
    word = ''.join(flat_hangeul(word))
    value = self.uni_dict.get(word)
    if not value:
        self.uni_dict[word] = count