Example 1
from symspellpy import SymSpell, Verbosity


class Autocorrect:
    def __init__(self, words=None, max_edit_distance=2):
        self._symspell = SymSpell()
        self._max_edit_distance = max_edit_distance
        if words is not None:
            self.add_words(words)

    def add_word(self, word):
        if word is not None:
            self._symspell.create_dictionary_entry(word, 1)

    def add_words(self, words):
        if words is not None:
            # create_dictionary expects a corpus file path or stream, so add
            # each word from the iterable individually instead
            for word in words:
                self.add_word(word)

    def delete_word(self, word):
        if word is not None:
            self._symspell.delete_dictionary_entry(word)

    def correct(self, bad_word):
        return self._symspell.lookup(bad_word,
                                     Verbosity.TOP,
                                     max_edit_distance=self._max_edit_distance,
                                     include_unknown=True)[0].term

    def predictions(self, bad_word):
        return self._symspell.lookup(bad_word,
                                     Verbosity.CLOSEST,
                                     max_edit_distance=self._max_edit_distance,
                                     include_unknown=True)
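
A minimal usage sketch (the word list is hypothetical; SuggestItem's term, distance, and count properties are part of symspellpy):

ac = Autocorrect(words=["hello", "world", "spelling"])
print(ac.correct("helo"))  # expected: "hello"
for suggestion in ac.predictions("wrld"):
    print(suggestion.term, suggestion.distance, suggestion.count)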
Example 2
    @classmethod
    def load(cls, language: str) -> "SpellCorrectGenerator":
        # maximum edit distance per dictionary pre-calculation
        max_edit_distance_dictionary = 2
        prefix_length = 7
        # create object
        sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        if language == "en":
            dict_path = (pathlib.Path(__file__).parent / "resources" /
                         "frequency_dictionary_en_82_765.txt")
            # note: this treats the frequency file as a raw corpus;
            # load_dictionary(str(dict_path), 0, 1) would consume its
            # term/count columns directly
            sym_spell.create_dictionary(str(dict_path))
            spacy_model = spacy.load("en", disable=["parser", "ner"])
        else:
            raise RuntimeError(
                f"The language {language} is currently not supported.")
        return cls(sym_spell, spacy_model)
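
A hedged call-pattern sketch; the SpellCorrectGenerator class itself is not shown in this excerpt:

generator = SpellCorrectGenerator.load("en")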
Example 3
def symspell_dict(max_edit_dist, prefix_len):
    dictfile = DICT_DIR / "big.txt"  # downloaded from Peter Norvig's site
    sym_spell = SymSpell(max_edit_dist, prefix_len)

    # create the SymSpell dictionary from the corpus file
    if not sym_spell.create_dictionary(str(dictfile)):
        print("corpus file not found")
    return sym_spell
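
A usage sketch, assuming DICT_DIR points at a directory containing big.txt and that Verbosity is imported from symspellpy:

sym = symspell_dict(max_edit_dist=2, prefix_len=7)
best = sym.lookup("speling", Verbosity.TOP, max_edit_distance=2)
if best:
    print(best[0].term)  # e.g. "spelling"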
Example 4
    def _symspell(self, sentences):
        """
        SymSpell spelling correction via the Symmetric Delete algorithm.

        Reference:
            Author: Wolf Garbe <*****@*****.**>
            Description: https://medium.com/@wolfgarbe/1000x-faster-spelling-correction-algorithm-2012-8701fcd87a5f
            URL: https://github.com/wolfgarbe/symspell

            Python module: symspellpy (https://github.com/mammothb/symspellpy)
        """

        symspell = SymSpell(max_dictionary_edit_distance=self.N)
        symspell.create_dictionary(self.corpus_path)

        # persist the derived frequencies in SymSpell's two-column format,
        # then reload them as a proper term/count dictionary
        with open(self.dictionary_path, "w") as f:
            for key, count in symspell.words.items():
                f.write(f"{key} {count}\n")

        symspell.load_dictionary(self.dictionary_path,
                                 term_index=0,
                                 count_index=1)
        predicts = []

        if not isinstance(sentences, list):
            sentences = [sentences]

        for sentence in sentences:
            split = []

            for x in sentence.split():
                # Verbosity.TOP (the enum, not the raw int 0) returns only
                # the single best suggestion
                sugg = symspell.lookup(
                    x.lower(),
                    verbosity=Verbosity.TOP,
                    max_edit_distance=self.N,
                    transfer_casing=True
                ) if x not in string.punctuation else None
                split.append(sugg[0].term if sugg else x)

            predicts.append(" ".join(split))

        return predicts
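
A hypothetical invocation; obj stands in for an instance whose N, corpus_path, and dictionary_path attributes have been configured:

predicts = obj._symspell("a sentnce with severl erors")
print(predicts[0])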
Example 5
def create_symspell(max_edit_distance, prefix_length, freq_file_path):

    # create object
    sym_spell = SymSpell(max_edit_distance, prefix_length)

    # create dictionary using corpus.txt
    if not sym_spell.create_dictionary(freq_file_path):
        print("Corpus file not found")
        return None

    return sym_spell
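
Usage sketch with a hypothetical corpus path; the None check mirrors the function's own error handling:

sym = create_symspell(2, 7, "corpus.txt")
if sym is not None:
    print(len(sym.words), "terms in dictionary")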
Example 6
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # create dictionary using corpus.txt
    if not sym_spell.create_dictionary('D:/ML/QNA_project/corpus.txt'):
        print("Corpus file not found")
        return

    for key, count in sym_spell.words.items():
        print("{} {}".format(key, count))
Example 7
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # create dictionary using corpus.txt
    if not sym_spell.create_dictionary(
            "C:/Users/tyc64/Desktop/PythonStuff/REHS/Spellchecker_and_NER/corpus.txt"
    ):
        print("Corpus file not found")
        return

    for key, count in sym_spell.words.items():
        print("{} {}".format(key, count))
Example 8
def main():
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    if not sym_spell.create_dictionary('training_data.txt',
                                       encoding="ISO-8859-1"):
        print("Corpus file not found")
        return
    dictlist = []
    for key, count in sym_spell.words.items():
        print("{} {}".format(key, count))
        dictlist.append("{} {}\n".format(key, count))

    # save dictionary: write the lines themselves (writelines), not the
    # Python list repr, so load_dictionary can parse the file later; "w"
    # instead of "a+" avoids duplicating entries on reruns
    with open("dict.txt", "w", encoding="ISO-8859-1") as text_file:
        text_file.writelines(dictlist)
    print('Saved dictionary')
    # load dictionary (written to the current working directory above)
    dictionary_path = "dict.txt"
    print(dictionary_path)
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # a sentence without any spaces
    data = ''
    with open('missing_spaces.txt', 'r', encoding="utf8") as myfile:
        data = myfile.read()

    splitline = data.split(',')
    for indx in range(len(splitline) - 1):
        try:
            strval = splitline[indx]
            # word_segmentation's third positional parameter is
            # max_segmentation_word_length, so passing prefix_length here
            # would cap segmented words at 7 characters; only the edit
            # distance is needed
            result = sym_spell.word_segmentation(strval,
                                                 max_edit_distance_dictionary)
            # display the corrected, re-segmented string
            print("{}".format(result.corrected_string))
        except IndexError:
            print('out of index')
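
For reference, word_segmentation returns symspellpy's Composition tuple, so more than the corrected string is available; a small sketch against the dictionary built above:

result = sym_spell.word_segmentation("thequickbrownfox")
print(result.corrected_string)  # best segmentation with corrections applied
print(result.distance_sum)      # total edit distance consumed
print(result.log_prob_sum)      # log-probability of the chosen segmentation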
Example 9
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # create dictionary using corpus.txt
    if not sym_spell.create_dictionary(CORPUS_FILE):
        print("Corpus file not found")
        return

    # write the derived unigram frequencies in SymSpell's two-column format
    with open(DICT_FILE, "w+") as f:
        for key, count in sym_spell.words.items():
            f.write("{} {}\n".format(key, count))
    print('dictionary file created')

    # create another dictionary file using corpus.txt
    sentence_list = []
    with open(CORPUS_FILE, 'r') as file:
        for line in file:
            sentence_list.append(line.rstrip('\n'))

    corpus = ' '.join(sentence_list)

    word_count = Counter(corpus.split())

    df = pd.DataFrame({
        'word': list(word_count.keys()),
        'count': list(word_count.values())
    })

    # Counter values are ints, so compare against ints, not strings
    df.loc[df['count'].isin([2, 3, 4])].sort_values(by='count').to_csv(
        WORD_COUNT_FILE, index=False)
    print('word count file created')
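
The same frequency table can be built a little more compactly straight from the Counter; this is an equivalent construction, not part of the original snippet:

df = pd.DataFrame(word_count.most_common(), columns=['word', 'count'])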
Example 10
class SpellCheck:
    def __init__(self, progress, directory, countries_dict):
        self.progress = progress
        self.logger = logging.getLogger(__name__)
        self.spelling_update = Counter()
        self.directory = directory
        self.spell_path = os.path.join(self.directory, 'spelling.pkl')
        self.countries_dict = countries_dict
        self.sym_spell = SymSpell()

    def insert(self, name, iso):
        if 'gothland cemetery' not in name and name not in noise_words:
            name_tokens = name.split(' ')
            for word in name_tokens:
                key = f'{word}'
                if len(key) > 2:
                    self.spelling_update[key] += 1

    def write(self):
        # Create a seed spelling dictionary with a single entry
        path = os.path.join(self.directory, 'spelling.tmp')
        with open(path, 'w') as fl:
            fl.write('the,1\n')
        success = self.sym_spell.create_dictionary(corpus=path)
        if not success:
            self.logger.error("error creating spelling dictionary")

        self.logger.info('Building Spelling Dictionary')

        # Add all words from geonames into spelling dictionary
        for key in self.spelling_update:
            self.sym_spell.create_dictionary_entry(
                key=key, count=self.spelling_update[key])

        self.logger.info('Writing Spelling Dictionary')
        self.sym_spell.save_pickle(self.spell_path)

    def read(self):
        success = False
        if os.path.exists(self.spell_path):
            self.logger.info(
                f'Loading Spelling Dictionary from {self.spell_path}')
            success = self.sym_spell.load_pickle(self.spell_path)
        else:
            self.logger.error(
                f"spelling dictionary not found: {self.spell_path}")

        if not success:
            self.logger.error(
                f"error loading spelling dictionary from {self.spell_path}")
        else:
            self.sym_spell.delete_dictionary_entry(key='gothland')

        size = len(self.sym_spell.words)
        self.logger.info(f"Spelling Dictionary contains {size} words")

    def lookup(self, input_term):
        if '*' in input_term:
            return input_term
        res = ''
        if len(input_term) > 1:
            suggestions = self.sym_spell.lookup(input_term,
                                                Verbosity.CLOSEST,
                                                max_edit_distance=2,
                                                include_unknown=True)
            # keep at most the first four suggestions
            for idx, item in enumerate(suggestions):
                if idx > 3:
                    break
                if item.term[0] == input_term[0]:
                    # Only accept results where the first letter matches
                    res += item.term + ' '
            return res
        else:
            return input_term

    def lookup_compound(self, phrase):
        suggestions = self.sym_spell.lookup_compound(phrase=phrase,
                                                     max_edit_distance=2,
                                                     ignore_non_words=False)
        for item in suggestions:
            self.logger.debug(f'{item.term}')
        return suggestions[0].term

    def fix_spelling(self, text):
        new_text = text
        if re.search(r'\d', text):
            # Has digits, just return text, no spellcheck
            pass
        elif 'st ' in text:
            # Spellcheck not handling St properly
            pass
        else:
            if len(text) > 0:
                new_text = self.lookup(text)
                self.logger.debug(f'Spell {text} -> {new_text}')

        return new_text.strip(' ')
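
A heavily hedged usage sketch: the constructor arguments below are placeholders, since progress and countries_dict come from the surrounding application, and read() expects a previously written spelling.pkl:

sc = SpellCheck(progress=None, directory="cache", countries_dict={})
sc.read()
print(sc.fix_spelling("chigago"))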
Example 11
class Fuzzy:
    ''' 
    This class defines the fuzzy joining tools and parameters for 
    string approximation. The primary toolkit for operations is
    the Symspell module and its associated Python port
    https://github.com/mammothb/symspellpy
    
    The input corpus file must be formatted as a record per row;
    all words on a single line are assumed to be part of a single space-
    separated string.
    
    Args:
        input_corpus: path to a text corpus containing the
            data against which query strings will be matched.
            Default is None, in which case a corpus can be
            loaded later
        preprocesser: an instance of the fuzzypanda.preprocess.PreProcessor
            class containing the 'preprocess' method used to pre-process the
            input strings. If set to None, will instantiate the default
            pre-processor. This option can be used to create a custom 
            pre-processor to pass to the get_fuzzy_columns function.
        max_edit_distance_dictionary: maximum edit distance to consider in
            SymSpell dictionary searches.
        prefix_length: length of the SymSpell dictionary prefix
    
    Attributes:
        corpus: path to the text corpus. If preprocessed, will point
            to the preprocessed file and unprocessed_corpus will
            point to the unprocessed file.
        preprocess_flag: Flag for indicating that preprocessing should
            be completed
        unprocessed_corpus: if pre-processing is requested, will point to
            the file containing the unprocessed input file
        sym_spell: the SymSpell object
        max_edit_distance_dictionary: maximum edit distance to consider in
            SymSpell dictionary searches.
        prefix_length: length of the SymSpell dictionary prefix
        
    '''
    def __init__(self,
                 input_corpus: str = None,
                 preprocesser=None,
                 max_edit_distance_dictionary: int = 2,
                 prefix_length: int = 7):
        # Set flags and initial variables
        self._preprocess_flag = False
        self.unprocessed_corpus = None
        self.sym_spell = None
        self.index_dictionary = None
        # Set inputs to attributes
        self.corpus = input_corpus
        self.max_edit_distance_dictionary = max_edit_distance_dictionary
        self.prefix_length = prefix_length
        # Setup the pre-processor object
        if preprocesser is None:
            self.preprocesser = preprocess.PreProcessor()
        else:
            self.preprocesser = preprocesser
        # Check the corpus and bootstrap preprocessing and SymSpell
        self.check_corpus()
        # the original tested `if preprocess:`, but that names the imported
        # module (always truthy); preprocessing only applies once a corpus
        # has been supplied
        if self.corpus is not None:
            self.preprocess_corpus()
        if self._preprocess_flag:
            self.create_symspell_dict()
            self.create_index()

    def is_preprocessed(self):
        '''
        Returns True if the corpus has been preprocessed, False otherwise
        '''
        return self._preprocess_flag

    def check_corpus(self):
        '''
        Verifies that the corpus file exists
        '''
        logger.debug('Checking corpus file %s', self.corpus)
        if self.corpus is None:
            logger.warning('Corpus file not defined')
            return
        elif not os.path.exists(self.corpus):
            logger.error('Corpus file %s not found', self.corpus)
            raise FileNotFoundError(f'Corpus file {self.corpus} not found')
        else:
            logger.debug('Corpus file found')
            return

    def preprocess_corpus(self):
        '''
        Preprocesses the given corpus file in self.corpus. Will copy the
        processed results to 'process_[self.corpus]' file and change the
        self.corpus file to point to it.
        '''
        # Status checking
        logger.debug('Preprocessing corpus file %s', self.corpus)
        if self._preprocess_flag:
            logger.warning('Corpus already preprocessed! Skipping')
            return
        if self.corpus is None:
            logger.error('Attempted to pre-process undefined corpus file')
            raise FileNotFoundError('self.corpus must be specified before'
                                    ' pre-processing')
        self.check_corpus()
        # Creating filenames
        corpus_directory = os.path.dirname(self.corpus)
        corpus_name = os.path.basename(self.corpus)
        processed_corpus = os.path.join(corpus_directory,
                                        'preprocessed_' + corpus_name)
        # Pre-processing the input corpus strings
        with open(self.corpus, 'r') as cf:
            with open(processed_corpus, 'w') as pcf:
                for line in cf:
                    pcf.write(self.preprocesser.preprocess(line) + '\n')
        self.unprocessed_corpus = self.corpus
        self.corpus = processed_corpus
        self._preprocess_flag = True
        logger.debug('Corpus processed to %s', self.corpus)

    def create_symspell_dict(self):
        '''
        Creates the SymSpell dictionary object for later lookup. Required
        to lookup strings
        '''
        logger.debug('Creating SymSpell dictionary')
        self.check_corpus()
        # create SymSpell object
        try:
            self.sym_spell = SymSpell(self.max_edit_distance_dictionary,
                                      self.prefix_length)
        except Exception as ex:
            # in case an error occurs in SymSpell
            logger.exception('Failure to create SymSpell object!')
            raise ex
        # Create the dictionary for SymSpell
        self.sym_spell.create_dictionary(self.corpus)

    def create_index(self):
        '''
        The SymSpell dictionary will lookup strings closest to the preprocessed
        version of the query string. To convert back to the original string,
        an index dictionary is created to map back to the original string match.
        This function will create the in-memory dictionary used to lookup the
        original string from the pre-processed string
        The resulting index_dictionary will return strings such that
        index_dictionary[processed string] = unprocessed string
        '''
        logger.debug('Creating corpus index')
        # Status checking
        self.check_corpus()
        if self.index_dictionary is not None:
            logger.warning('index_dictionary already created. Overwriting.')
        if not self._preprocess_flag:
            logger.error(
                'Corpus %s not processed. Cannot create'
                ' index dictionary', self.corpus)
            raise FileNotFoundError(
                'Corpus not processed. '
                'Cannot create index dictionary', self.corpus)
        if not os.path.exists(self.unprocessed_corpus):
            logger.error('Unprocessed corpus file %s '
                         'not found', self.unprocessed_corpus)
            raise FileNotFoundError('Unprocessed corpus file '
                                    f'{self.unprocessed_corpus} not found')
        # Create pre-process index as dictionary
        self.index_dictionary = {}
        with open(self.unprocessed_corpus, 'r') as ucf:
            for line in ucf:
                # Create the index entries
                original_string = line.strip()
                processed_string = self.preprocesser.preprocess(
                    original_string)
                # Warn if the same string conflicts with an existing entry
                if processed_string in self.index_dictionary:
                    conflict_string = self.index_dictionary[processed_string]
                    # Don't flag if they are the same to begin with
                    if conflict_string != original_string:
                        logger.warning(
                            'index_dictionary conflict: '
                            '%s conflicts with %s for key '
                            '%s. Keeping index_dictionary[%s] = %s',
                            original_string, conflict_string, processed_string,
                            processed_string, conflict_string)
                        continue
                # if no conflict, add to index
                self.index_dictionary.update(
                    {processed_string: original_string})

        logger.debug('Corpus index created')

    def query(self, qstring: str):
        '''
        Queries an input string to the corpus, and retrieves 
        the closest value in the corpus by edit distance. 
        
        Args:
            qstring: string to query from the corpus
            
        Returns:
            (term, found_flag): Tuple of the suggested term and a flag
                of True if found in the corpus, or False if not. If not found
                within the corpus, returns the original qstring
        '''
        # Status checks
        # Check qstring
        if not isinstance(qstring, str):
            msg = f'{qstring} is type {type(qstring)} not str'
            logger.error(msg)
            raise ValueError(msg)
        # Check index_dictionary
        if self.index_dictionary is None:
            msg = 'index_dictionary not created'
            logger.error(msg)
            raise ValueError(msg)
        # Check sym_spell
        if self.sym_spell is None:
            msg = 'sym_spell SymSpell object not created'
            logger.error(msg)
            raise ValueError(msg)
        # pre-process the string
        processed_string = self.preprocesser.preprocess(qstring)
        logger.debug('Querying string: \'%s\', preprocessed to \'%s\' ',
                     qstring, processed_string)
        # Look up string using Symspell
        suggest = self.sym_spell.lookup(processed_string,
                                        Verbosity.TOP,
                                        include_unknown=True)
        found_term = suggest[0].term
        found_edit_distance = suggest[0].distance
        # Determine if string is a hit or miss and return result
        if found_edit_distance > self.max_edit_distance_dictionary:
            # indicates a failed lookup
            logger.debug('String \'%s\' not found!', qstring)
            return (qstring, False)
        else:
            # Found a term; backsolve and return found string
            backprocessed_string = self.index_dictionary[found_term]
            logger.debug('String \'%s\' found! Backprocessed \'%s\' to \'%s\'',
                         qstring, found_term, backprocessed_string)
            return (backprocessed_string, True)

    def get_fuzzy_column(self, dataframe, col_name, null_return=None):
        '''
        Given a Pandas dataframe and the name of a column, returns a new column
        with values taken from a fuzzy search of the underlying dictionary of
        names. 
        
        Args:
            dataframe (pandas.DataFrame): Input dataframe from which column
                values will be taken
            col_name (str): string of the column name used for searching values
            null_return (str): string to return if value is not found in the
                underlying dictionary. If None, will return the input string
                from the old column in the new column. Default is None.

        Returns:
            fuzzy_col (pandas.Series): Output pandas series of query results
            
        Raises:
            LookupError: if col_name is not in dataframe
            ValueError: if null_return is not a string type

        '''
        logger.debug('Creating fuzzy column for %s', col_name)
        # Input checking
        # Check col_name
        if col_name not in dataframe.columns:
            msg = [f'{col_name} not in dataframe columns:']
            for col in dataframe.columns:
                msg.append(col)
            logger.error(' '.join(msg))
            raise LookupError(' '.join(msg))
        # Check null_return
        if null_return is not None and not isinstance(null_return, str):
            msg = f'null_return is type {type(null_return)} not str'
            logger.error(msg)
            raise ValueError(msg)

        # Define a simple lookup function for serial df.apply
        def apply_query(value):
            (out, found) = self.query(value)
            if found:
                return out
            else:
                if null_return is None:
                    return out
                else:
                    return null_return

        fuzzy_col = dataframe[col_name].apply(apply_query)
        return fuzzy_col
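
A usage sketch under stated assumptions: names.txt is a hypothetical one-record-per-line corpus, and df is a pandas DataFrame with a "name" column:

fz = Fuzzy(input_corpus="names.txt")
df["matched_name"] = fz.get_fuzzy_column(df, "name")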
Example 12
import re

import numpy as np
import pandas as pd
from symspellpy import SymSpell

# symspell_dictionary and external_files are defined elsewhere in the script
output_dictionary_name = "response_validator_spelling_dictionary.txt"

# 1) Load up the original symspell dictionary and convert to pandas dataframe
sym_spell = SymSpell(3, 7)
sym_spell.load_dictionary(symspell_dictionary, 0, 1)
df_original = pd.DataFrame.from_dict(sym_spell.words,
                                     orient="index").reset_index()
df_original.columns = ["term", "count"]

# 2) Create a dictionary for each of the external datafiles. Append together
# and get total counts for each term
df_external = pd.DataFrame()
regexp = re.compile(r"[.!?\-\\+\[\]\#\$\%\^\&\*\(\)\@\d\']+")
for file in external_files:
    sym_spell = SymSpell(3, 7)
    sym_spell.create_dictionary(file)
    df_temp = pd.DataFrame.from_dict(sym_spell.words,
                                     orient="index").reset_index()
    df_temp.columns = ["term", "count"]
    df_temp = df_temp[df_temp["term"].apply(lambda x: not regexp.search(x))]
    # DataFrame.append was removed in pandas 2.0; concat does the same job
    df_external = pd.concat([df_external, df_temp])
df_external = df_external.groupby("term")["count"].sum().reset_index()

# 3) Adjust the counts in the original dataframe to be comparable to those in
# the external dataframe This avoids mangling the prior when doing Bayesian
# spelling correction
N_external = df_external["count"].sum()
N_original = df_original["count"].sum()
df_original["count"] = df_original["count"].apply(
    lambda x: int(np.ceil(x / (N_original / N_external))))
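
A possible continuation, not part of the original snippet: merge the rescaled original terms with the external terms and write the result in SymSpell's two-column format under the output_dictionary_name declared above:

df_merged = (pd.concat([df_original, df_external])
             .groupby("term")["count"].sum().reset_index())
df_merged.to_csv(output_dictionary_name, sep=" ", header=False, index=False)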
Example 13
class SymDeletingTypoCorrecter(Module):
    def __init__(self, max_edit_dist: int = 2, prefix_length: int = 10):
        self.symspell = SymSpell(max_dictionary_edit_distance=max_edit_dist,
                                 prefix_length=prefix_length)
        self.max_edit_dist = max_edit_dist

    def train(self,
              corpus_path: str,
              save_path: str,
              unigram_dict_prefix: str,
              bigram_dict_prefix: str = None,
              **kwargs):
        self.symspell.create_dictionary(corpus_path)
        # 1) Unigram dict
        worddict = ''
        for key, count in self.symspell.words.items():
            worddict += '{} {}\n'.format(''.join(flat_hangeul(key)), count)

        unigram_save_path = os.path.join(save_path,
                                         unigram_dict_prefix + '.txt')
        # write the whole string at once: iterating over a string yields
        # single characters, and the with-block already closes the file
        with open(unigram_save_path, 'w', encoding='utf-8') as file:
            file.write(worddict)
        print("Total {} unigrams are saved!".format(len(self.symspell.words)))

        if bigram_dict_prefix:
            # 2) Bigram dict
            with open(corpus_path, 'r', encoding='utf-8') as file:
                corpus = file.readlines()
            corpus = [s.strip() for s in corpus]

            bi_count = self.count_bigrams(corpus, min_count=5)

            bi_dict = ''
            for key, count in bi_count.items():
                s1, s2 = key.split(' ')
                bi_dict += '{} {} {}\n'.format(''.join(flat_hangeul(s1)),
                                               ''.join(flat_hangeul(s2)),
                                               count)

            bigram_save_path = os.path.join(save_path,
                                            bigram_dict_prefix + '.txt')
            with open(bigram_save_path, 'w', encoding='utf-8') as biFile:
                biFile.write(bi_dict)
            print("Total {} bigrams are saved!".format(len(bi_count)))

    def load_model(self,
                   unigram_dict_path: str,
                   bigram_dict_path: str = None,
                   **kwargs):
        try:
            here = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
            default_path = os.path.join(here, "resources",
                                        'default_uni_dict.txt')

            self.symspell.load_dictionary(default_path,
                                          term_index=0,
                                          count_index=1)
            self.symspell.load_dictionary(unigram_dict_path,
                                          term_index=0,
                                          count_index=1)
        except ValueError:
            raise ValueError(
                "Specified unigram dictionary path does not exist")

        if bigram_dict_path:
            try:
                # load the bigram dictionary (not the unigram path again);
                # the file written by train() has three columns
                # (term1 term2 count), so the count lives at index 2
                self.symspell.load_bigram_dictionary(bigram_dict_path,
                                                     term_index=0,
                                                     count_index=2)
            except ValueError:
                raise ValueError(
                    "Specified bigram dictionary path does not exist")

    def infer(self, word: Text, **kwargs):
        suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
        suggestions = self.symspell.lookup(''.join(flat_hangeul(word)),
                                           suggestion_verbosity,
                                           self.max_edit_dist)
        if suggestions:
            word = list(suggestions[0].term)
            return merge_flatted_hangeul(word)
        return word

    @staticmethod
    def count_bigrams(corpus: list, min_count: int):
        bigrams = []
        for t in tqdm(corpus):
            if not isinstance(t, str):
                continue
            text = t.split(' ')
            _bigrams = zip(*[text[i:] for i in range(2)])
            bigrams += [' '.join(s) for s in _bigrams]

        count = Counter(bigrams)
        new_dict = {}
        for key, value in count.items():
            if value >= min_count:
                new_dict[key] = value
        return new_dict
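
A usage sketch; the dictionary path and the misspelled input are hypothetical, and flat_hangeul/merge_flatted_hangeul must come from the surrounding package:

corrector = SymDeletingTypoCorrecter(max_edit_dist=2)
corrector.load_model(unigram_dict_path="my_uni_dict.txt")
print(corrector.infer("맛잇는"))  # hypothetical typo; output depends on the trained dictionary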