Example 1
    def update(self, word: Text, repl_word: Text):
        # Flatten both the existing word and its replacement to jamo level.
        word = ''.join(flat_hangeul(word))
        repl_word = ''.join(flat_hangeul(repl_word))

        value = self.uni_dict.get(word)
        if value:
            # Move the stored count from the old spelling to the replacement.
            self.uni_dict[repl_word] = self.uni_dict.pop(word)
        else:
            raise ValueError("Specified word does not exist in the default unigram dictionary")
Example 2
    def search(self, word: Text):
        # Return True if the flattened (jamo-level) form of the word
        # exists in the unigram dictionary.
        word = ''.join(flat_hangeul(word))
        value = self.uni_dict.get(word)
        return bool(value)
Example 3
    def infer(self, word: Text, **kwargs):
        suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
        # Look up the flattened (jamo-level) word with SymSpell.
        suggestions = self.symspell.lookup(''.join(flat_hangeul(word)),
                                           suggestion_verbosity,
                                           self.max_edit_dist)
        if suggestions:
            # Re-assemble the closest suggestion back into Hangeul syllables.
            word = list(suggestions[0].term)
            return merge_flatted_hangeul(word)
        # No suggestion found: return the input word unchanged.
        return word
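
A minimal usage sketch for infer(): the instance name spell_checker is hypothetical (it stands for an object of the class above with its SymSpell dictionaries already loaded), and the misspelled input word is illustrative only.

# Hypothetical instance; infer() returns the closest in-dictionary spelling,
# or the input word itself when SymSpell finds no suggestion.
corrected = spell_checker.infer('아녕하세요')
print(corrected)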
Example 4
    def train(self,
              corpus_path: str,
              save_path: str,
              unigram_dict_prefix: str,
              bigram_dict_prefix: str = None,
              **kwargs):
        self.symspell.create_dictionary(corpus_path)
        # 1) Unigram dict
        worddict = ''
        for key, count in self.symspell.words.items():
            worddict += '{} {}\n'.format(''.join(flat_hangeul(key)), count)

        unigram_save_path = os.path.join(save_path,
                                         unigram_dict_prefix + '.txt')
        with open(unigram_save_path, 'w', encoding='utf-8') as file:
            # worddict is a single string; write it in one call instead of
            # iterating over it character by character.
            file.write(worddict)
        print("Total {} unigrams are saved!".format(len(self.symspell.words)))

        if bigram_dict_prefix:
            # 2) Bigram dict
            with open(corpus_path, 'r', encoding='utf-8') as file:
                corpus = file.readlines()
            corpus = [s.strip() for s in corpus]

            bi_count = self.count_bigrams(corpus, min_count=5)

            bi_dict = ''
            for key, count in bi_count.items():
                s1, s2 = key.split(' ')
                bi_dict += '{} {} {}\n'.format(''.join(flat_hangeul(s1)),
                                               ''.join(flat_hangeul(s2)),
                                               count)

            bigram_save_path = os.path.join(save_path,
                                            bigram_dict_prefix + '.txt')
            with open(bigram_save_path, 'w', encoding='utf-8') as bi_file:
                # bi_dict is a single string; write it in one call.
                bi_file.write(bi_dict)
            print("Total {} bigrams are saved!".format(len(bi_count)))
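
A hedged sketch of a train() call; the instance name spell_checker and the corpus/output paths below are placeholders, not paths taken from the source.

# Placeholder paths: a newline-delimited Korean corpus and an existing output directory.
spell_checker.train(corpus_path='corpus.txt',
                    save_path='./dicts',
                    unigram_dict_prefix='ko_unigram',
                    bigram_dict_prefix='ko_bigram')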
Example 5
    def save_dict(self, local_path: str = None):
        worddict = ''
        for key, count in self.uni_dict.items():
            worddict += '{} {}\n'.format(''.join(flat_hangeul(key)), count)

        if local_path:
            save_path = local_path
        else:
            save_path = self.path
        with open(save_path, 'w', encoding='utf-8') as file:
            # worddict is a single string; write it in one call.
            file.write(worddict)
Example 6
def noise_maker(word: str,
                threshold: float = 0.3,
                noise_char_num: int = 2,
                method: str = 'g'):
    """
    :param word: single word
    :param threshold: probability of generating noise
    :param noise_char_num: maximum number of noise character
    :param method: generation method, ("g": general, "ct": close typo, "d": delete character, "s": switch character)
    """
    noise_dict_key = {
        'g': noise_dict,
        'ct': typo_pattern,
        'd': null_dict,
    }
    if method not in ['g', 'ct', 'd', 's']:
        raise ValueError("specified generation method is not supported")

    assert 0.0 <= threshold <= 1.0

    text_tokenized = flat_hangeul(word)

    rand = random.random()
    if rand >= threshold:
        return text_tokenized, word
    else:
        sample_num_noise = random.choice(range(1, noise_char_num + 1))
        rand_idx = random.sample(range(len(text_tokenized)),
                                 min(len(text_tokenized), sample_num_noise))
        for idx in rand_idx:
            if method in ['g', 'ct', 'd']:
                n_dict = noise_dict_key[method]
                if text_tokenized[idx] in list(n_dict.keys()):
                    text_tokenized[idx] = random.choice(
                        n_dict[text_tokenized[idx]])
            elif method == 's':
                text_tokenized = switch_char(text_tokenized, idx)
        return text_tokenized, merge_flatted_hangeul(text_tokenized)
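
An illustrative call to noise_maker(); the input word is arbitrary, and threshold=1.0 simply forces the noise branch so the perturbation is visible.

# With threshold=1.0, random.random() is always below the threshold, so the noise
# branch runs: one randomly chosen jamo (noise_char_num=1) may be replaced using the
# general noise dictionary (characters missing from the dictionary are left as-is).
tokens, noisy_word = noise_maker('안녕하세요', threshold=1.0, noise_char_num=1, method='g')
print(noisy_word)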
Example 7
    def text_to_token(text: str) -> list:
        # Flatten a Hangeul string into its jamo-level character list.
        return flat_hangeul(text)
Example 8
    def create(self, word: Text, count: int = 1000):
        # Register a new word (flattened to jamo) with a default count,
        # only if it is not already in the unigram dictionary.
        word = ''.join(flat_hangeul(word))
        value = self.uni_dict.get(word)
        if not value:
            self.uni_dict[word] = count
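
A hedged end-to-end sketch tying together the dictionary-editing methods shown above (create, search, update, save_dict); the instance name, the words, and the save path are all illustrative.

# Hypothetical instance; the words and path are placeholders.
spell_checker.create('맞춤법', count=1000)          # add a new unigram entry
print(spell_checker.search('맞춤법'))               # True once the entry exists
spell_checker.update('맞춤법', '맞춤법검사')         # move the count to a replacement spelling
spell_checker.save_dict('./dicts/ko_unigram.txt')   # write the edited dictionary to disk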