コード例 #1
0
ファイル: LangFeatures.py プロジェクト: nwae/nwae
    def __init__(
            self,
            write_lang_features_to_csv = False
    ):
        """
        Build the in-memory table of language properties.

        For every supported language a property dict is created (id, running number, name,
        alphabet flag, character type, syllable separator, word separator, verb conjugation).
        Each dict is then extended with ISO 639 fields looked up through pycountry, and
        finally copied under its 3-letter code, so that a language can be found by either
        its 2-letter or its 3-letter code.

        Attributes set on the instance:
          self.PYCLANG       the pycountry language database, or None if it cannot be loaded
          self.langs         dict mapping language code -> property dict
          self.langfeatures  pandas DataFrame with one row per entry of self.langs

        :param write_lang_features_to_csv: if True, sort self.langfeatures by language name
               and write it to 'lang_features.csv' (relative path, i.e. current directory)
        """
        try:
            self.PYCLANG = pycountry.languages
        except Exception as ex:
            # Deliberately broad: pycountry is optional, any failure here just means the
            # ISO 639 fields below are left empty.
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Cannot load pycountry languages: ' + str(ex)
            )
            self.PYCLANG = None

        #
        # Language followed by flag for alphabet boundary, syllable boundary (either as one
        # character as in Chinese or space as in Korean), then word boundary (space).
        # The most NLP-inconvenient languages are those without word boundary, obviously.
        # We need to define our own properties as even the ISO 15924 specification does not
        # contain them.
        #
        # Columns:
        #   id, number, name, have alphabet, character type,
        #   have syllable separator, syllable separator type,
        #   have word separator, word separator type, have verb conjugation
        #
        # Rows are listed in the order they are registered in self.langs. The number column
        # is the language number; note that Chinese is 1 and Japanese is 2 although Japanese
        # is registered first.
        #
        lang_table = (
            # Hangul/CJK
            # TODO Not really right to say the Hangul syllable separator is a char, it is
            #      rather a "syllable_character"
            (self.LANG_KO, 0, 'Hangul', True, self.ALPHABET_HANGUL,
             True, self.T_CHAR, True, self.T_SPACE, True),
            # Japanese Hiragana/Katakana
            (self.LANG_JA, 2, 'Japanese', False, self.ALPHABET_JAPANESE,
             True, self.T_CHAR, False, self.T_NONE, True),
            # CJK
            (self.LANG_ZH, 1, 'Chinese', False, self.ALPHABET_CJK,
             True, self.T_CHAR, False, self.T_NONE, False),
            # Cyrillic
            (self.LANG_RU, 3, 'Russian', True, self.ALPHABET_CYRILLIC,
             False, self.T_NONE, True, self.T_SPACE, True),
            # Thai
            (self.LANG_TH, 4, 'Thai', True, self.ALPHABET_THAI,
             False, self.T_NONE, False, self.T_NONE, False),
            # Latin
            (self.LANG_EN, 5, 'English', True, self.ALPHABET_LATIN_AZ,
             False, self.T_NONE, True, self.T_SPACE, True),
            (self.LANG_ES, 6, 'Spanish', True, self.ALPHABET_LATIN,
             False, self.T_NONE, True, self.T_SPACE, True),
            (self.LANG_FR, 7, 'French', True, self.ALPHABET_LATIN,
             False, self.T_NONE, True, self.T_SPACE, True),
            (self.LANG_DE, 8, 'German', True, self.ALPHABET_LATIN,
             False, self.T_NONE, True, self.T_SPACE, True),
            (self.LANG_IT, 9, 'Italian', True, self.ALPHABET_LATIN,
             False, self.T_NONE, True, self.T_SPACE, True),
            (self.LANG_NL, 10, 'Dutch', True, self.ALPHABET_LATIN,
             False, self.T_NONE, True, self.T_SPACE, True),
            (self.LANG_VI, 11, 'Vietnamese', True, self.ALPHABET_LATIN_VI_AZ,
             True, self.T_SPACE, False, self.T_NONE, False),
            (self.LANG_ID, 12, 'Indonesian', True, self.ALPHABET_LATIN_AZ,
             False, self.T_NONE, True, self.T_SPACE, True),
        )

        self.langs = {}
        for (
                lang_id, lang_number, lang_name, have_alphabet, char_type,
                have_syl_sep, syl_sep_type, have_word_sep, word_sep_type, have_verb_conj
        ) in lang_table:
            # Key insertion order defines the column order of self.langfeatures
            self.langs[lang_id] = {
                self.C_LANG_ID:        lang_id,
                self.C_LANG_NUMBER:    lang_number,
                self.C_LANG_NAME:      lang_name,
                self.C_HAVE_ALPHABET:  have_alphabet,
                self.C_CHAR_TYPE:      char_type,
                self.C_HAVE_SYL_SEP:   have_syl_sep,
                self.C_SYL_SEP_TYPE:   syl_sep_type,
                self.C_HAVE_WORD_SEP:  have_word_sep,
                self.C_WORD_SEP_TYPE:  word_sep_type,
                self.C_HAVE_VERB_CONJ: have_verb_conj
            }
        # Sanity checks on the table above: no duplicated language id, and the language
        # numbers are exactly 0..n-1
        assert len(self.langs) == len(lang_table)
        assert sorted(row[1] for row in lang_table) == list(range(len(lang_table)))

        # Add ISO 639 definitions. Every field falls back to '' when pycountry is not
        # available, has no record for the language, or the record lacks the attribute.
        iso_fields = (
            (LangFeatures.C_LANG_639_2_ALPHA_3, 'alpha_3'),
            (LangFeatures.C_LANG_639_2_NAME,    'name'),
            (LangFeatures.C_LANG_639_2_SCOPE,   'scope'),
            (LangFeatures.C_LANG_639_2_TYPE,    'type'),
            (LangFeatures.C_LANG_639_2_ALPHA_2, 'alpha_2'),
            (LangFeatures.C_LANG_639_2_BIBLIO,  'bibliographic'),
        )
        for lang, lang_props in self.langs.items():
            lang_639 = None
            if self.PYCLANG is not None:
                # NOTE(review): depending on the pycountry version an unknown code is
                # presumably reported either as None or as a KeyError/LookupError - both
                # are handled here, confirm against the installed version.
                try:
                    lang_639 = self.PYCLANG.get(alpha_2=lang)
                except LookupError:
                    lang_639 = None
                if lang_639 is None:
                    Log.warning(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': No pycountry language record for "' + str(lang) + '"'
                    )
            for feature_key, attr_name in iso_fields:
                lang_props[feature_key] = getattr(lang_639, attr_name, '')

        # Copy 2-letter keys (ISO 639-1) to also 3-letter keys
        # Means we can access the language structure using either the 2-letter or the
        # 3-letter code.
        # If engineering standard ISO had been more far-sighted (after all 26*26=676 only)
        # we would not have to do this
        new_items = {}
        for key, lang_props in self.langs.items():
            lang_iso_639_3 = lang_props[LangFeatures.C_LANG_639_2_ALPHA_3]
            # Without a 3-letter code there is nothing to alias. Skipping it avoids
            # registering a bogus entry under the empty string key.
            if not lang_iso_639_3 or lang_iso_639_3 == key:
                continue
            lang_dict = lang_props.copy()
            # Change lang id to the 3-letter code
            lang_dict[self.C_LANG_ID] = lang_iso_639_3
            new_items[lang_iso_639_3] = lang_dict
        self.langs.update(new_items)

        # One row per entry of self.langs, 2-letter entries first
        self.langfeatures = pd.DataFrame(list(self.langs.values()))
        # Of course it would be more convenient to keep this data in a csv file..
        # but problems with the file path etc. would be very unpleasant for the user
        if write_lang_features_to_csv:
            self.langfeatures = self.langfeatures.sort_values(by=[self.C_LANG_NAME], ascending=True)
            self.langfeatures.to_csv('lang_features.csv', sep=',', index=False)
コード例 #2
0
ファイル: NwaePartsUnitTest.py プロジェクト: nwae/nwae
                     ', TOTAL FAIL = ' + str(res_final.count_fail))
        return res_final


if __name__ == '__main__':
    # Script entry point: load the configuration, build the unit test parameters from it,
    # run all unit tests and report the number of failures through the process exit status.

    # Local import, only needed when this module is run as a script
    import sys

    config = cf.NwaeConfig.get_cmdline_params_and_init_config_singleton(
        Derived_Class=cf.NwaeConfig,
        default_config_file=
        '/usr/local/git/nwae/nwae/app.data/config/default.cf')

    ut_params = uthelper.UnitTestParams(
        dirpath_wordlist=config.get_config(
            param=cf.NwaeConfig.PARAM_NLP_DIR_WORDLIST),
        postfix_wordlist=config.get_config(
            param=cf.NwaeConfig.PARAM_NLP_POSTFIX_WORDLIST),
        dirpath_app_wordlist=config.get_config(
            param=cf.NwaeConfig.PARAM_NLP_DIR_APP_WORDLIST),
        postfix_app_wordlist=config.get_config(
            param=cf.NwaeConfig.PARAM_NLP_POSTFIX_APP_WORDLIST),
        dirpath_synonymlist=config.get_config(
            param=cf.NwaeConfig.PARAM_NLP_DIR_SYNONYMLIST),
        postfix_synonymlist=config.get_config(
            param=cf.NwaeConfig.PARAM_NLP_POSTFIX_SYNONYMLIST),
        dirpath_model=config.get_config(param=cf.NwaeConfig.PARAM_MODEL_DIR))
    Log.important('Unit Test Params: ' + str(ut_params.to_string()))

    # Only errors are logged while the tests run
    Log.LOGLEVEL = Log.LOG_LEVEL_ERROR

    res = NwaePartsUnitTest(ut_params=ut_params).run_unit_tests()
    # sys.exit() instead of the interactive helper exit(), which the site module may not
    # have installed. The status is capped at 255: POSIX keeps only the low 8 bits, so an
    # uncapped count of e.g. 256 failures would be reported as success (0).
    sys.exit(min(res.count_fail, 255))
コード例 #3
0
ファイル: LangFeatures.py プロジェクト: nwae/nwae
# So we define our own list of properties.
# However, we also include properties from the above standards using open PyPI packages such as pycountry.
#

import pandas as pd
from nwae.utils.Log import Log
from inspect import getframeinfo, currentframe
# pip install iso-639
# https://www.iso.org/iso-639-language-codes.html
# from iso639 import languages
import nwae.utils.UnitTest as ut
# pycountry is treated as optional: when it cannot be imported a warning is logged and
# module loading continues. The name "pycountry" is then left undefined.
try:
    import pycountry
except Exception as import_error:
    Log.warning(
        '{} {}: Cannot import pycountry: {}'.format(
            __name__, getframeinfo(currentframe()).lineno, import_error
        )
    )


#
# Class LangFeatures
#
#   Helper class to define language properties, such as containing word/syllable separators,
#   alphabet type, etc.
#
#   This most fundamental class for languages tells us:
#
#     1. Alphabet Type
#        What alphabet type a language is written in, either Latin, Cyrillic, etc.
#        This is used for example in LangDetect class.