Example #1
0
    def __init__(
            self,
            # List of documents or sentences, with preprocessing already done
            # (e.g. using nwae.lang.preprocessing.TxtPreprocessor)
            # Words however are not split into array yet, just separated by word separator specified
            # by language in nwae.lang.preprocessing.BasicPreprocessor
            docs,
            # List of labels, one per document
            labels,
            # List of language codes, one per document. If None we assume
            # English for every document (space as word splitter).
            langs = None
    ):
        """
        Store docs/labels/langs and validate that all three have equal length.

        Raises
        ------
        Exception
            If len(docs), len(labels) and len(langs) are not all equal.
        """
        self.docs = docs
        self.labels = labels
        self.langs = langs
        if self.langs is None:
            # Assume all English
            self.langs = [LangFeatures.LANG_EN] * len(self.docs)
        if (len(self.docs) != len(self.labels)) or (len(self.docs) != len(self.langs)):
            raise Exception(
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Length of docs ' + str(len(self.docs))
                + ' must equal labels shape ' + str(len(self.labels))
                # Must use self.langs here (never None at this point).
                # The original used the raw "langs" argument, so a caller who
                # passed langs=None with mismatched docs/labels got a
                # TypeError from len(None) instead of this exception.
                + ' and langs shape ' + str(len(self.langs))
            )
        self.lang_features = LangFeatures()

        # We need to split the docs/sentences into a list of words;
        # populated lazily later, not in the constructor.
        self.docs_split = None
        return
Example #2
0
    def __init__(
            self
    ):
        """
        Initialize language-detection resources: per-alphabet character sets,
        common-word lists for supported languages, and (where the language
        has verb conjugation) a stemmer/lemmatizer per language.
        """
        self.lang_features = LangFeatures()

        # Map alphabet name to unicode character set array
        self.alphabet_dict = {}
        for alp in self.TESTS_BY_ORDER:
            self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet = alp
            )
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabets used: ' + str(self.alphabet_dict.keys())
        )

        # Languages written without a word separator (e.g. no spaces between words)
        self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
        Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

        # Load common words per language, keyed by ISO language code
        self.common_words = {}
        self.common_words[LangFeatures.LANG_EN] = English()
        self.common_words[LangFeatures.LANG_ES] = Spanish()
        self.common_words[LangFeatures.LANG_FR] = French()
        self.common_words[LangFeatures.LANG_ID] = Indonesian()
        self.common_words[LangFeatures.LANG_VI] = Vietnamese()

        # Load stemmers. Entries stay None for languages without verb
        # conjugation or whose lemmatizer fails to initialize (best-effort).
        self.word_stemmer = {}
        for lang in self.SUPPORTED_LANGS:
            lang_have_verb_conj = self.lang_features.have_verb_conjugation(
                lang = lang
            )
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
            )
            self.word_stemmer[lang] = None
            if lang_have_verb_conj:
                try:
                    self.word_stemmer[lang] = Lemmatizer(
                        lang = lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    # Fixed unbalanced quote around the lang code (was: 'Lang "xx stemmer...'),
                    # now consistent with the success log message above.
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(lang) + '" stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.warning(errmsg)

        # Profiler for alphabet-detection timing
        self.profiler_detect_alp = ProfilingHelper(profiler_name = str(self.__class__))

        return
Example #3
0
    def __init__(
            self,
            lang,
            # This words list can be a full dictionary (for languages with natural space
            # as word separator) or just a common words list in our usage application context
            # for languages without a natural space as word separator.
            # This is because for languages without space, the word splitting itself might
            # be wrong, and the spelling correction algorithm might need to look at previous
            # or subsequent words.
            words_list,
            # Directory and identifier string for looking up EIDF files
            dir_path_model=None,
            identifier_string=None,
            # Option to pass in EIDF DataFrame instead of using directory and identifier string
            eidf_dataframe=None,
            do_profiling=False):
        """Set up spelling correction for one language, backed by SpellCheckWord."""
        # Normalize whatever code the caller gave us to ISO 639-1
        self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
        self.words_list = words_list
        self.dir_path_model = dir_path_model
        self.identifier_string = identifier_string
        self.eidf_dataframe = eidf_dataframe
        self.do_profiling = do_profiling

        # NOTE(review): separator type is looked up with the raw "lang" argument,
        # not the normalized self.lang -- confirm this is intentional.
        self.sep_type = LangFeatures().get_word_separator_type(lang=lang)

        # Word-level spell checker shares all of our configuration
        self.spell_check_word = SpellCheckWord(
            lang              = self.lang,
            words_list        = self.words_list,
            dir_path_model    = self.dir_path_model,
            identifier_string = self.identifier_string,
            eidf_dataframe    = self.eidf_dataframe,
            do_profiling      = self.do_profiling,
        )

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Initialize Spelling Correction for "' + str(lang)
            + '", separator type "' + str(self.sep_type) + '"'
        )
        return
Example #4
0
    def __init__(
            self,
            lang
    ):
        """
        Set up word stemming/lemmatization for one language.

        self.word_stemmer stays None when the language has no verb
        conjugation or when the Lemmatizer fails to initialize
        (initialization is best-effort: failure is logged, not raised).
        """
        # Normalize the caller's language code to ISO 639-1
        self.lang = LangFeatures.map_to_lang_code_iso639_1(
            lang_code = lang
        )
        # Word lists; populated later, not in the constructor
        self.raw_words = None
        self.common_words = None

        lfobj = LangFeatures()
        self.lang_have_verb_conj = lfobj.have_verb_conjugation(
            lang = self.lang
        )
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '" verb conjugation = ' + str(self.lang_have_verb_conj) + '.'
        )
        self.word_stemmer = None
        if self.lang_have_verb_conj:
            try:
                self.word_stemmer = Lemmatizer(
                    lang = self.lang
                )
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer initialized successfully.'
                )
            except Exception as ex_stemmer:
                # Fixed unbalanced quote around the lang code (was: 'Lang "xx stemmer...'),
                # now consistent with the success log message above.
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer failed to initialize: ' \
                         + str(ex_stemmer) + '.'
                Log.warning(errmsg)
                self.word_stemmer = None

        return