Example #1
0
    def __init__(
            self,
            # List of documents or sentences, with preprocessing already done
            # (e.g. using nwae.lang.preprocessing.TxtPreprocessor)
            # Words however are not split into array yet, just separated by word separator specified
            # by language in nwae.lang.preprocessing.BasicPreprocessor
            docs,
            # List of labels, one per document
            labels,
            # List of language codes, one per document. If None we assume
            # English for every document (space as word splitter).
            langs = None
    ):
        """
        Store docs/labels/langs and validate that all three have equal length.

        Raises
        ------
        Exception
            If len(docs), len(labels) and len(langs) are not all equal.
        """
        self.docs = docs
        self.labels = labels
        self.langs = langs
        if self.langs is None:
            # Assume all English
            self.langs = [LangFeatures.LANG_EN] * len(self.docs)
        if (len(self.docs) != len(self.labels)) or (len(self.docs) != len(self.langs)):
            raise Exception(
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Length of docs ' + str(len(self.docs))
                + ' must equal labels shape ' + str(len(self.labels))
                # Must use self.langs here (never None at this point).
                # The original used the raw "langs" argument, so a caller who
                # passed langs=None with mismatched docs/labels got a
                # TypeError from len(None) instead of this exception.
                + ' and langs shape ' + str(len(self.langs))
            )
        self.lang_features = LangFeatures()

        # We need to split the docs/sentences into a list of words;
        # populated lazily later, not in the constructor.
        self.docs_split = None
        return
Example #2
0
    def __init__(
            self
    ):
        """
        Initialize language-detection resources: per-alphabet character sets,
        common-word lists for supported languages, and (where the language
        has verb conjugation) a stemmer/lemmatizer per language.
        """
        self.lang_features = LangFeatures()

        # Map alphabet name to unicode character set array
        self.alphabet_dict = {}
        for alp in self.TESTS_BY_ORDER:
            self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet = alp
            )
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabets used: ' + str(self.alphabet_dict.keys())
        )

        # Languages written without a word separator (e.g. no spaces between words)
        self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
        Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

        # Load common words per language, keyed by ISO language code
        self.common_words = {}
        self.common_words[LangFeatures.LANG_EN] = English()
        self.common_words[LangFeatures.LANG_ES] = Spanish()
        self.common_words[LangFeatures.LANG_FR] = French()
        self.common_words[LangFeatures.LANG_ID] = Indonesian()
        self.common_words[LangFeatures.LANG_VI] = Vietnamese()

        # Load stemmers. Entries stay None for languages without verb
        # conjugation or whose lemmatizer fails to initialize (best-effort).
        self.word_stemmer = {}
        for lang in self.SUPPORTED_LANGS:
            lang_have_verb_conj = self.lang_features.have_verb_conjugation(
                lang = lang
            )
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
            )
            self.word_stemmer[lang] = None
            if lang_have_verb_conj:
                try:
                    self.word_stemmer[lang] = Lemmatizer(
                        lang = lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    # Fixed unbalanced quote around the lang code (was: 'Lang "xx stemmer...'),
                    # now consistent with the success log message above.
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(lang) + '" stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.warning(errmsg)

        # Profiler for alphabet-detection timing
        self.profiler_detect_alp = ProfilingHelper(profiler_name = str(self.__class__))

        return
Example #3
0
    def __init__(
            self,
            lang,
            # This words list can be a full dictionary (for languages with natural space
            # as word separator) or just a common words list in our usage application context
            # for languages without a natural space as word separator.
            # This is because for languages without space, the word splitting itself might
            # be wrong, and the spelling correction algorithm might need to look at previous
            # or subsequent words.
            words_list,
            # Directory and identifier string for looking up EIDF files
            dir_path_model=None,
            identifier_string=None,
            # Option to pass in EIDF DataFrame instead of using directory and identifier string
            eidf_dataframe=None,
            do_profiling=False):
        """Set up spelling correction for one language, backed by SpellCheckWord."""
        # Normalize whatever code the caller gave us to ISO 639-1
        self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
        self.words_list = words_list
        self.dir_path_model = dir_path_model
        self.identifier_string = identifier_string
        self.eidf_dataframe = eidf_dataframe
        self.do_profiling = do_profiling

        # NOTE(review): separator type is looked up with the raw "lang" argument,
        # not the normalized self.lang -- confirm this is intentional.
        self.sep_type = LangFeatures().get_word_separator_type(lang=lang)

        # Word-level spell checker shares all of our configuration
        self.spell_check_word = SpellCheckWord(
            lang              = self.lang,
            words_list        = self.words_list,
            dir_path_model    = self.dir_path_model,
            identifier_string = self.identifier_string,
            eidf_dataframe    = self.eidf_dataframe,
            do_profiling      = self.do_profiling,
        )

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Initialize Spelling Correction for "' + str(lang)
            + '", separator type "' + str(self.sep_type) + '"'
        )
        return
Example #4
0
    def __init__(
            self,
            lang
    ):
        """
        Set up word stemming/lemmatization for one language.

        self.word_stemmer stays None when the language has no verb
        conjugation or when the Lemmatizer fails to initialize
        (initialization is best-effort: failure is logged, not raised).
        """
        # Normalize the caller's language code to ISO 639-1
        self.lang = LangFeatures.map_to_lang_code_iso639_1(
            lang_code = lang
        )
        # Word lists; populated later, not in the constructor
        self.raw_words = None
        self.common_words = None

        lfobj = LangFeatures()
        self.lang_have_verb_conj = lfobj.have_verb_conjugation(
            lang = self.lang
        )
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '" verb conjugation = ' + str(self.lang_have_verb_conj) + '.'
        )
        self.word_stemmer = None
        if self.lang_have_verb_conj:
            try:
                self.word_stemmer = Lemmatizer(
                    lang = self.lang
                )
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer initialized successfully.'
                )
            except Exception as ex_stemmer:
                # Fixed unbalanced quote around the lang code (was: 'Lang "xx stemmer...'),
                # now consistent with the success log message above.
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer failed to initialize: ' \
                         + str(ex_stemmer) + '.'
                Log.warning(errmsg)
                self.word_stemmer = None

        return