Example #1
    def __init__(
            self,
            # We support a single language, or a list of languages with one entry per text
            lang_str_or_list,
            # A list of sentences as strings, with words separated either by our default
            # word delimiter DEFAULT_WORD_SPLITTER, a space, or any other separator.
            # Alternatively, a list of sentences already split into word lists.
            text_segmented_list
    ):
        self.lang = lang_str_or_list
        self.text_segmented_list = text_segmented_list

        self.lang_list = None
        if isinstance(self.lang, (list, tuple)):
            self.lang_list = [LangFeatures.map_to_lang_code_iso639_1(lang_code=l) for l in self.lang]
            if len(self.lang_list) != len(self.text_segmented_list):
                raise Exception(
                    str(TextProcessor.__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Language list & text segmented list must have same length! '
                )
        else:
            self.lang = LangFeatures.map_to_lang_code_iso639_1(
                lang_code = self.lang
            )
            self.lang_list = [self.lang] * len(self.text_segmented_list)

        lg.Log.debugdebug(
            str(TextProcessor.__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Text segmented list: ' + str(self.text_segmented_list)
        )
        return
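
A minimal usage sketch for this constructor (the class name TextProcessor is taken from the log/error messages above; the sample texts and language codes are illustrative only):

# Usage sketch only; TextProcessor is the class name referenced in the messages above
texts_segmented = [
    ['how', 'are', 'you'],        # already split into words
    ['xin', 'chào', 'bạn'],       # one language entry per text below
]
tp = TextProcessor(
    lang_str_or_list    = ['en', 'vi'],
    text_segmented_list = texts_segmented,
)
print(tp.lang_list)   # e.g. ['en', 'vi'] after ISO 639-1 normalization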
Example #2
    def __init__(
            self
    ):
        self.lang_features = LangFeatures()

        # Map alphabet name to unicode character set array
        self.alphabet_dict = {}
        for alp in self.TESTS_BY_ORDER:
            self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet = alp
            )
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabets used: ' + str(self.alphabet_dict.keys())
        )

        self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
        Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

        # Load common words
        self.common_words = {}
        self.common_words[LangFeatures.LANG_EN] = English()
        self.common_words[LangFeatures.LANG_ES] = Spanish()
        self.common_words[LangFeatures.LANG_FR] = French()
        self.common_words[LangFeatures.LANG_ID] = Indonesian()
        self.common_words[LangFeatures.LANG_VI] = Vietnamese()

        # Load stemmers
        self.word_stemmer = {}
        for lang in self.SUPPORTED_LANGS:
            lang_have_verb_conj = self.lang_features.have_verb_conjugation(
                lang = lang
            )
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
            )
            self.word_stemmer[lang] = None
            if lang_have_verb_conj:
                try:
                    self.word_stemmer[lang] = Lemmatizer(
                        lang = lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(lang) + '" stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.warning(errmsg)

        self.profiler_detect_alp = ProfilingHelper(profiler_name = str(self.__class__))

        return
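
A small sketch of how one might instantiate this class (it is the LangDetect constructor, shown in full in Example #10) and inspect which supported languages received a lemmatizer; the printed output is illustrative:

# Illustrative inspection of the initialized state
ld = LangDetect()
langs_with_stemmer = [lang for lang, stemmer in ld.word_stemmer.items() if stemmer is not None]
print('Languages with a lemmatizer:', langs_with_stemmer)
print('Alphabets loaded:', list(ld.alphabet_dict.keys()))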
Example #3
    def __init__(
            self,
            # List of documents or sentences, with preprocessing already done
            # (e.g. using nwae.lang.preprocessing.TxtPreprocessor)
            # Words, however, are not yet split into arrays; they are only separated by the
            # word separator specified per language in nwae.lang.preprocessing.BasicPreprocessor
            docs,
            # List of labels
            labels,
            # If None, we assume all docs are English (with space as word splitter)
            langs = None
    ):
        self.docs = docs
        self.labels = labels
        self.langs = langs
        if self.langs is None:
            # Assume all English
            self.langs = [LangFeatures.LANG_EN] * len(self.docs)
        if (len(self.docs) != len(self.labels)) or (len(self.docs) != len(self.langs)):
            raise Exception(
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Length of docs ' + str(len(self.docs))
                + ' must equal length of labels ' + str(len(self.labels))
                + ' and length of langs ' + str(len(self.langs))
            )
        self.lang_features = LangFeatures()

        # We need to split the docs/sentences into a list of words
        self.docs_split = None
        return
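
A minimal instantiation sketch; the class name DocLabelData is hypothetical (the snippet does not name its class), and only the constructor arguments come from the code above:

# Hypothetical class name, for illustration only
data = DocLabelData(
    docs   = ['i want to check my balance', 'how do i top up'],
    labels = ['check_balance', 'top_up'],
    langs  = None,   # None means all docs are assumed to be English
)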
Example #4
    def __init__(
            self,
            lang,
            # This words list can be a full dictionary (for languages with a natural space
            # as word separator) or just a list of common words from our application context
            # for languages without a natural space as word separator.
            # This is because for languages without spaces, the word splitting itself might
            # be wrong, and the spelling correction algorithm might need to look at previous
            # or subsequent words.
            words_list,
            # Directory and identifier string for looking up EIDF files
            dir_path_model=None,
            identifier_string=None,
            # Option to pass in EIDF DataFrame instead of using directory and identifier string
            eidf_dataframe=None,
            do_profiling=False):
        self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
        self.words_list = words_list
        self.dir_path_model = dir_path_model
        self.identifier_string = identifier_string
        self.eidf_dataframe = eidf_dataframe
        self.do_profiling = do_profiling

        self.sep_type = LangFeatures().get_word_separator_type(lang=lang)
        self.spell_check_word = SpellCheckWord(
            lang=self.lang,
            words_list=self.words_list,
            dir_path_model=self.dir_path_model,
            identifier_string=self.identifier_string,
            eidf_dataframe=self.eidf_dataframe,
            do_profiling=self.do_profiling)
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Initialize Spelling Correction for "' + str(lang) +
            '", separator type "' + str(self.sep_type) + '"')
        return
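
A usage sketch, assuming this constructor belongs to a sentence-level spelling-correction wrapper; the class name SpellCheckSentence is hypothetical, and the paths are placeholders:

# Hypothetical class name and placeholder paths, for illustration only
corrector = SpellCheckSentence(
    lang              = 'en',
    words_list        = ['account', 'balance', 'transfer', 'password'],
    dir_path_model    = '/path/to/models',      # placeholder
    identifier_string = 'demo_model',           # placeholder
    do_profiling      = False,
)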
Example #5
    def __init__(
            self,
            lang
    ):
        self.lang = LangFeatures.map_to_lang_code_iso639_1(
            lang_code = lang
        )
        self.raw_words = None
        self.common_words = None

        lfobj = LangFeatures()
        self.lang_have_verb_conj = lfobj.have_verb_conjugation(
            lang = self.lang
        )
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '" verb conjugation = ' + str(self.lang_have_verb_conj) + '.'
        )
        self.word_stemmer = None
        if self.lang_have_verb_conj:
            try:
                self.word_stemmer = Lemmatizer(
                    lang = self.lang
                )
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer initialized successfully.'
                )
            except Exception as ex_stemmer:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer failed to initialize: ' \
                         + str(ex_stemmer) + '.'
                Log.warning(errmsg)
                self.word_stemmer = None

        return
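
A small sketch of how this constructor might be used; the class name CommonWordsBase is hypothetical, and only the lang argument comes from the snippet:

# Hypothetical class name, for illustration only
cw = CommonWordsBase(lang='en')
# If the language is flagged as having verb conjugation, a Lemmatizer is attempted in __init__
print(cw.lang_have_verb_conj, cw.word_stemmer is not None)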
Example #6
    def __init__(
            self,
            lang,
            dirpath_synonymlist,
            postfix_synonymlist,
            add_latin_equiv_words = False
    ):
        self.lang = LangFeatures.map_to_lang_code_iso639_1(
            lang_code = lang
        )

        self.dirpath_synonymlist = dirpath_synonymlist
        self.postfix_synonymlist = postfix_synonymlist
        self.add_latin_equiv_words = add_latin_equiv_words

        self.map_word_to_rootword = {}
        return
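
An instantiation sketch; the class name SynonymListData is hypothetical, and the directory and postfix values are placeholders:

# Hypothetical class name and placeholder file settings
syn = SynonymListData(
    lang                  = 'th',
    dirpath_synonymlist   = '/path/to/synonymlists',   # placeholder
    postfix_synonymlist   = '.synonymlist.txt',        # placeholder
    add_latin_equiv_words = False,
)
print(syn.map_word_to_rootword)   # {} until the synonym list is actually loaded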
Example #7
    def __init__(self, lang=LangFeatures.LANG_EN):
        self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
        Ssl.disable_ssl_check()
        try:
            if nltk.download(Corpora.NLTK_COMTRANS):
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': NLTK download of "' + Corpora.NLTK_COMTRANS + '" OK.')
            else:
                raise Exception('Download "' + str(Corpora.NLTK_COMTRANS) +
                                '" returned False')
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': NLTK download of "' + str(Corpora.NLTK_COMTRANS) + '" exception: ' \
                     + str(ex) + '.'
            Log.error(errmsg)
            raise Exception(errmsg)
        return
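
A usage sketch; the class name ComtransCorpusLoader is hypothetical, and the behavior shown (re-raising on download failure) follows the constructor above:

# Hypothetical class name, for illustration only
try:
    loader = ComtransCorpusLoader(lang=LangFeatures.LANG_EN)
except Exception as ex:
    # The constructor re-raises if the NLTK comtrans download fails (e.g. no network)
    print('Corpus initialization failed:', ex)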
Example #8
    def __init__(
            self,
            lang,
            # A list of words, from a dictionary or any word list
            words_list,
            # Directory and identifier string for looking up EIDF files
            dir_path_model=None,
            identifier_string=None,
            # Option to pass in EIDF DataFrame instead of using directory and identifier string
            eidf_dataframe=None,
            use_word_weighting=True,
            do_profiling=False):
        self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
        self.words_list = words_list
        self.dir_path_model = dir_path_model
        self.identifier_string = identifier_string
        self.use_word_weighting = use_word_weighting
        self.eidf_dataframe = eidf_dataframe
        self.do_profiling = do_profiling

        self.trie = TrieNode.build_trie_node(words=self.words_list)
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Read ' +
            str(TrieNode.WORD_COUNT) + ' words, ' + str(TrieNode.NODE_COUNT) +
            ' trie nodes from wordlist ' + str(self.words_list[0:50]) +
            ' (first 50 of ' + str(len(self.words_list)) + ')')

        if not self.use_word_weighting:
            self.eidf_words = None
            self.eidf_value = None
        else:
            try:
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Initializing EIDF object.. try to read from file..')
                # Try to read from file
                df_eidf_file = eidf.Eidf.read_eidf_from_storage(
                    data_pd_dataframe=self.eidf_dataframe,
                    dir_path_model=self.dir_path_model,
                    identifier_string=self.identifier_string,
                    # No need to reorder the words in EIDF file
                    x_name=None)
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Successfully read EIDF from file in directory "' +
                    str(self.dir_path_model) + '" for model "' +
                    str(self.identifier_string) + '".')
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': EIDF initialized as:' + str(df_eidf_file))
                self.eidf_words = np.array(
                    df_eidf_file[eidf.Eidf.STORAGE_COL_X_NAME], dtype=str)
                self.eidf_value = np.array(
                    df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF], dtype=float)
            except Exception as ex_eidf:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                         + ': No EIDF from file available. Exception ' + str(ex_eidf) + '.'
                Log.error(errmsg)
                raise Exception(errmsg)
        return
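
A usage sketch passing an in-memory EIDF DataFrame instead of reading it from files. The class is presumably SpellCheckWord (Example #4 constructs a SpellCheckWord with the same arguments), the EIDF column names reuse the constants referenced above, and the weights are made up:

import pandas as pd

words = ['account', 'balance', 'transfer']
# Illustrative EIDF weights; column names reuse the constants referenced above
df_eidf = pd.DataFrame({
    eidf.Eidf.STORAGE_COL_X_NAME: words,
    eidf.Eidf.STORAGE_COL_EIDF:   [1.2, 0.8, 1.5],
})
checker = SpellCheckWord(          # class name assumed from Example #4
    lang               = 'en',
    words_list         = words,
    eidf_dataframe     = df_eidf,
    use_word_weighting = True,
)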
Example #9
    def __init__(self,
                 model_name,
                 identifier_string,
                 dir_path_model,
                 lang,
                 dirpath_synonymlist,
                 postfix_synonymlist,
                 dir_wordlist,
                 postfix_wordlist,
                 dir_wordlist_app,
                 postfix_wordlist_app,
                 word_freq_model=FeatureVector.COL_FREQUENCY,
                 confidence_level_scores=None,
                 do_spelling_correction=False,
                 do_word_stemming=True,
                 do_profiling=False,
                 lang_additional=()):
        super(PredictClass, self).__init__()

        self.model_name = model_name
        self.identifier_string = identifier_string
        self.dir_path_model = dir_path_model

        self.lang_main = lang
        self.dirpath_synonymlist = dirpath_synonymlist
        self.postfix_synonymlist = postfix_synonymlist
        self.dir_wordlist = dir_wordlist
        self.postfix_wordlist = postfix_wordlist
        self.dir_wordlist_app = dir_wordlist_app
        self.postfix_wordlist_app = postfix_wordlist_app
        self.word_freq_model = word_freq_model
        self.do_spelling_correction = do_spelling_correction
        self.do_word_stemming = do_word_stemming
        self.do_profiling = do_profiling

        if lang_additional is None:
            lang_additional = ()
        self.lang_additional = [
            LangFeatures.map_to_lang_code_iso639_1(lang_code=l)
            for l in lang_additional
        ]
        try:
            self.lang_additional.remove(self.lang_main)
        except ValueError:
            pass
        self.lang_additional = list(set(self.lang_additional))

        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Predictor class initialization using model "' +
            str(self.identifier_string) + '", word freq model "' +
            str(self.word_freq_model) + '", main language "' +
            str(self.lang_main) + '", additional languages: ' +
            str(self.lang_additional))

        self.model = ModelHelper.get_model(
            model_name=self.model_name,
            # Model params will be loaded from the trained model files
            model_params=None,
            identifier_string=self.identifier_string,
            dir_path_model=self.dir_path_model,
            training_data=None,
            confidence_level_scores=confidence_level_scores,
            do_profiling=self.do_profiling)
        self.model.start()
        # Keep track of whether the model was reloaded. This counter is updated manually by this class.
        self.model_last_reloaded_counter = 0
        self.load_text_processor_mutex = threading.Lock()

        # After loading model, we still need to load word lists, etc.
        self.is_all_initializations_done = False

        #
        # We initialize the word segmenter and synonym list only after the model is ready,
        # because they require the model features: the root words of the synonym lists
        # must come from the model features.
        #
        self.predict_class_txt_processor = None
        self.lang_detect = None

        self.count_predict_calls = 0

        # Wait for model to be ready to load synonym & word lists
        self.start()
        return
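
An instantiation sketch for PredictClass (the class name is confirmed by the super() call above); all paths, postfixes and the model name are placeholders:

# Placeholder values throughout; only the parameter names come from the constructor above
predictor = PredictClass(
    model_name           = 'some_model',               # placeholder
    identifier_string    = 'demo_bot',                 # placeholder
    dir_path_model       = '/path/to/models',          # placeholder
    lang                 = 'en',
    dirpath_synonymlist  = '/path/to/synonymlists',    # placeholder
    postfix_synonymlist  = '.synonymlist.txt',         # placeholder
    dir_wordlist         = '/path/to/wordlists',       # placeholder
    postfix_wordlist     = '-wordlist.txt',            # placeholder
    dir_wordlist_app     = '/path/to/wordlists/app',   # placeholder
    postfix_wordlist_app = '.wordlist.app.txt',        # placeholder
    lang_additional      = ('vi', 'th'),
)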
Example #10
class LangDetect:

    SUPPORTED_LANGS = (
        LangFeatures.LANG_KO,
        LangFeatures.LANG_JA,
        LangFeatures.LANG_RU,
        LangFeatures.LANG_ZH,
        LangFeatures.LANG_TH,
        LangFeatures.LANG_EN,
        LangFeatures.LANG_ES,
        LangFeatures.LANG_FR,
        LangFeatures.LANG_VI,
        LangFeatures.LANG_ID,
    )

    THRESHOLD_PCT_WORDS_IN_MOST_COMMON = 0.15

    # We break the text into blocks of this length
    TEXT_BLOCK_LEN = 10
    # Default covers 30% of blocks (e.g. if there are 10 blocks, we will randomly pick 3)
    DEFAULT_TEST_COVERAGE_PCT = 0.3
    # We test at most 5 blocks to keep detection fast
    DEFAULT_TEST_MAX_RANGE_BLOCKS = 5

    TEST_LATIN_BY_ORDER = [
        LangFeatures.ALPHABET_LATIN_AZ,
        # We also detect these special Vietnamese characters, to increase accuracy for Vietnamese
        LangFeatures.ALPHABET_LATIN_VI,
        # This general Latin set covers everything, so it must be tested last
        LangFeatures.ALPHABET_LATIN
    ]
    TEST_CYRILLIC_BY_ORDER = [
        LangFeatures.ALPHABET_CYRILLIC
    ]
    TEST_HANGUL_BY_ORDER = [
        LangFeatures.ALPHABET_HANGUL
    ]
    TEST_JAPANESE_BY_ORDER = [
        # No need to include CJK here, it is tested separately
        LangFeatures.ALPHABET_HIRAGANA_KATAKANA,
    ]
    TEST_CJK_BY_ORDER = [
        LangFeatures.ALPHABET_CJK
    ]
    TEST_THAI_BY_ORDER = [
        LangFeatures.ALPHABET_THAI
    ]

    """
    Notes:
      - CJK must be tested before Japanese, since Japanese text also contains CJK characters
    """
    TESTS_BY_ORDER = TEST_LATIN_BY_ORDER \
            + TEST_CYRILLIC_BY_ORDER \
            + TEST_HANGUL_BY_ORDER \
            + TEST_CJK_BY_ORDER \
            + TEST_JAPANESE_BY_ORDER \
            + TEST_THAI_BY_ORDER

    def __init__(
            self
    ):
        self.lang_features = LangFeatures()

        # Map alphabet name to unicode character set array
        self.alphabet_dict = {}
        for alp in self.TESTS_BY_ORDER:
            self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet = alp
            )
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabets used: ' + str(self.alphabet_dict.keys())
        )

        self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
        Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

        # Load common words
        self.common_words = {}
        self.common_words[LangFeatures.LANG_EN] = English()
        self.common_words[LangFeatures.LANG_ES] = Spanish()
        self.common_words[LangFeatures.LANG_FR] = French()
        self.common_words[LangFeatures.LANG_ID] = Indonesian()
        self.common_words[LangFeatures.LANG_VI] = Vietnamese()

        # Load stemmers
        self.word_stemmer = {}
        for lang in self.SUPPORTED_LANGS:
            lang_have_verb_conj = self.lang_features.have_verb_conjugation(
                lang = lang
            )
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
            )
            self.word_stemmer[lang] = None
            if lang_have_verb_conj:
                try:
                    self.word_stemmer[lang] = Lemmatizer(
                        lang = lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(lang) + '" stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.warning(errmsg)

        self.profiler_detect_alp = ProfilingHelper(profiler_name = str(self.__class__))

        return

    #
    # Only for languages with space as word separator
    # Or in the case of Vietnamese, it will split by syllables
    #
    def __segment_words(
            self,
            text
    ):
        sent = StringUtils.trim(text)
        sent = sent.lower()
        sent = sent.split(' ')
        # Split out punctuations
        sent = BasicPreprocessor.clean_punctuations(
            sentence = sent
        )
        return sent

    #
    # Algorithm Description
    #   1. Alphabet detection
    #      i)  If the text belongs to a language without a space as the word/syllable
    #          separator, this immediately determines the language.
    #      ii) Then for Latin-alphabet languages, compare the language's common words
    #          against the given text
    #
    def detect(
            self,
            text,
            test_coverage_pct = DEFAULT_TEST_COVERAGE_PCT,
            max_test_coverage_len = DEFAULT_TEST_MAX_RANGE_BLOCKS * TEXT_BLOCK_LEN,
            detailed = False
    ):
        det_start_time = Profiling.start()
        text = str(text)

        if len(text) == 0:
            return None

        #
        # First step
        #
        alps = self.__detect_alphabet_type(
            text   = text,
            test_coverage_pct = test_coverage_pct,
            max_test_coverage_len = max_test_coverage_len
        )

        # Either None type or empty dict
        if not alps:
            return None

        # Return value is a list of alphabet names, e.g. ['hiragana_katakana', 'cjk']
        detected_top_alps = list(alps.keys())
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Top alphabets = ' + str(detected_top_alps)
        )

        """
        First, a special algorithm that handles exceptions to the general rules entirely "by hand"
        """
        pos_alphabets_manual = self.detect_via_manual_rules(detected_top_alphabet_names=detected_top_alps)
        if pos_alphabets_manual is not None:
            self.profiler_detect_alp.profile_time(
                start_time = det_start_time,
                additional_info = 'Manual detect lang "' + str(pos_alphabets_manual) + '" for "' + str(text) + '"'
            )
            return pos_alphabets_manual

        """
        Second, the general algorithm: loop over the most frequent alphabet types
        """
        # Loop by the top detected alphabet types
        loop_top_x = 2
        loop_counter = 0
        while loop_counter < loop_top_x:
            if len(detected_top_alps) > loop_counter:
                loop_alp = detected_top_alps[loop_counter]
            else:
                break
            loop_counter += 1
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Loop ' + str(loop_counter) + ' alphabet "' + str(loop_alp) + '"'
            )

            # Get possible languages for this alphabet
            possible_langs_for_alphabet = self.lang_features.get_languages_for_alphabet_type(
                alphabet = loop_alp
            )
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Possible languages for alphabet "' + str(loop_alp)
                + '": ' + str(possible_langs_for_alphabet)
            )

            # No dispute when only 1 possible language for given alphabet
            if len(possible_langs_for_alphabet) == 1:
                Log.debugdebug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Only 1 possible language for alphabet: ' + str(possible_langs_for_alphabet)
                )
                self.profiler_detect_alp.profile_time(
                    start_time = det_start_time,
                    additional_info = 'Detect lang "' + str(possible_langs_for_alphabet) + '" for "' + str(text) + '"'
                )
                return possible_langs_for_alphabet

            det_langs = []
            #
            # From alphabets detected, try to determine language
            #
            if loop_alp in self.TEST_HANGUL_BY_ORDER:
                det_langs = self.detect_lang_from_hangul(text=text, detailed=detailed)
            # Check Japanese before CJK, because Japanese text also contains CJK characters
            elif loop_alp in self.TEST_JAPANESE_BY_ORDER:
                det_langs = self.detect_lang_from_japanese(text=text, detailed=detailed)
            elif loop_alp in self.TEST_CYRILLIC_BY_ORDER:
                det_langs = self.detect_lang_from_cyrillic(text=text, detailed=detailed)
            elif loop_alp in self.TEST_THAI_BY_ORDER:
                det_langs = self.detect_lang_from_thai_alphabet(text=text, detailed=detailed)
            #
            # Alphabet belongs to the Latin family
            #
            elif loop_alp in self.TEST_LATIN_BY_ORDER:
                # Almost all Latin-family languages will have Latin A-Z come out on top first
                if loop_alp == LangFeatures.ALPHABET_LATIN_AZ:
                    det_langs = self.detect_lang_from_latin_az(
                        text = text,
                        detected_alphabets_present = detected_top_alps
                    )

                if not det_langs:
                    # We extend the search to the full Latin alphabet if nothing was found
                    det_langs = self.detect_lang_from_latin(
                        text = text
                    )
            elif loop_alp == LangFeatures.ALPHABET_CJK:
                det_langs = self.detect_lang_from_cjk(text=text)

            # If we have a result, return it and quit the loop
            if det_langs:
                self.profiler_detect_alp.profile_time(
                    start_time = det_start_time,
                    additional_info='Detect lang "' + str(det_langs) + '" for "' + str(text) + '"'
                )
                return det_langs

        self.profiler_detect_alp.profile_time(
            start_time = det_start_time,
            additional_info = 'Detect lang "' + str([]) + '" for "' + str(text) + '"'
        )
        return []

    def detect_via_manual_rules(
            self,
            detected_top_alphabet_names,
    ):
        # Don't change the original order of the detected alphabets
        list_copy = detected_top_alphabet_names.copy()
        list_copy.sort()

        list_jap = [LangFeatures.ALPHABET_CJK, LangFeatures.ALPHABET_HIRAGANA_KATAKANA]
        list_jap.sort()
        if list_copy == list_jap:
            return [LangFeatures.LANG_JA]
        return None

    def detect_lang_from_hangul(
            self,
            text,
            detailed = False,
    ):
        if not detailed:
            return [LangFeatures.LANG_KO]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not yet implemented'
            )

    def detect_lang_from_japanese(
            self,
            text,
            detailed = False,
    ):
        # TODO Provide more detailed detection
        if not detailed:
            return [LangFeatures.LANG_JA]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not yet implemented'
            )

    def detect_lang_from_cyrillic(
            self,
            text,
            detailed = False,
    ):
        # TODO Handle the whole cyrillic family
        if not detailed:
            return [LangFeatures.LANG_RU]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not yet implemented'
            )

    def detect_lang_from_cjk(
            self,
            text,
            detailed = False,
    ):
        # TODO Differentiate Chinese (simplified, traditional, etc.), Japanese, ..
        if not detailed:
            return [LangFeatures.LANG_ZH]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not yet implemented'
            )

    def detect_lang_from_thai_alphabet(
            self,
            text,
            detailed = False,
    ):
        # TODO Handle the different dialects
        if not detailed:
            return [LangFeatures.LANG_TH]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not yet implemented'
            )

    def detect_lang_from_latin_az(
            self,
            text,
            detected_alphabets_present
    ):
        sent = self.__segment_words(text=text)

        lang_codes = []
        lang_pct = []

        for lang in (
                LangFeatures.LANG_EN, LangFeatures.LANG_ES, LangFeatures.LANG_FR,
                LangFeatures.LANG_VI, LangFeatures.LANG_ID,
        ):
            lang_codes.append(lang)
            max_word_n_tuple = 1
            if lang == LangFeatures.LANG_VI:
                max_word_n_tuple = 2
            lang_pct.append(self.common_words[lang].get_pct_intersection_with_common_words(
                word_list = sent,
                max_word_n_tuple = max_word_n_tuple
            ))

        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': For sentence ' + str(sent)
            + ' lang codes/pct: ' + str(pd.DataFrame({'code': lang_codes, 'pct': lang_pct}).values)
        )

        if lang_codes:
            idx_max = np.argmax(lang_pct)
            idx_max = int(idx_max)

            if lang_pct[idx_max] > self.THRESHOLD_PCT_WORDS_IN_MOST_COMMON:
                return [lang_codes[idx_max]]
            else:
                # Check word stems
                for lang in lang_codes:
                    if self.word_stemmer[lang] is None:
                        continue
                    sent_stems = []
                    for w in sent:
                        w_stem = self.word_stemmer[lang].stem(word=w)
                        sent_stems.append(w_stem)
                    if sent_stems == sent:
                        continue
                    Log.debug(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': For lang "' + str(lang)
                        + '", trying stemmed words: ' + str(sent_stems)
                    )
                    pct_int = self.common_words[lang].get_pct_intersection_with_common_words(
                        word_list = sent_stems
                    )
                    if pct_int > self.THRESHOLD_PCT_WORDS_IN_MOST_COMMON:
                        return [lang]

                # Although French and Spanish can also contain these characters, we favor Vietnamese
                if LangFeatures.ALPHABET_LATIN_VI in detected_alphabets_present:
                    return [LangFeatures.LANG_VI]

        return []

    def detect_lang_from_latin(
            self,
            text
    ):
        # TODO This logic doesn't do anything
        sent = self.__segment_words(text=text)

        lang_codes = []
        lang_pct = []

        if lang_codes:
            idx_max = np.argmax(lang_pct)
            idx_max = int(idx_max)

            if lang_pct[idx_max] > self.THRESHOLD_PCT_WORDS_IN_MOST_COMMON:
                return [lang_codes[idx_max]]

        return []

    #
    # Returns a list of (start, end) tuples, with end exclusive
    # E.g. [(0,10), (10,20), ..]
    #
    def __get_text_range_blocks(
            self,
            text
    ):
        # Break into ranges
        range_blocks = []
        i = 0
        len_text = len(text)
        while i < len_text:
            end_range = min(len_text, i+self.TEXT_BLOCK_LEN)
            # range_blocks.append(range(i, end_range, 1))
            range_blocks.append((i,end_range))
            i = i + self.TEXT_BLOCK_LEN
        return range_blocks

    def __detect_alphabet_type(
            self,
            text,
            # fraction of text blocks to test
            test_coverage_pct,
            max_test_coverage_len
    ):
        alp_chars = []

        # Return the range blocks of the text
        range_blocks = self.__get_text_range_blocks(text = text)
        n_range = len(range_blocks)
        how_many_range_to_check = max(1, min(
            math.ceil(test_coverage_pct * n_range),
            math.ceil(max_test_coverage_len / self.TEXT_BLOCK_LEN)
        ))
        Log.debugdebug('Range blocks: ' + str(range_blocks) + ' how many to check ' + str(how_many_range_to_check))

        # Randomly pick the ranges
        random_ranges_index = random.sample(range(n_range), how_many_range_to_check)
        random_ranges_index = sorted(random_ranges_index)
        total_len = 0
        for rg in random_ranges_index:
            start, end = range_blocks[rg]
            total_len += (end - start + 1)

        # A total length below one block means we only picked the truncated last block
        if total_len < self.TEXT_BLOCK_LEN:
            if 0 not in random_ranges_index:
                random_ranges_index = [0] + random_ranges_index

        text_excerps = []
        for rg in random_ranges_index:
            start, end = range_blocks[rg]
            text_excerps.append(text[start:end])

        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Random ranges index: ' + str(random_ranges_index) + ' or: ' + str(text_excerps)
        )

        # TODO
        #   Avoid checking character by character in a loop, which is slow
        #   One way is to build a reverse dictionary mapping each character to its alphabet name/type
        for rge_idx in random_ranges_index:
            #for i in range_blocks[rge_idx]:
            start, end = range_blocks[rge_idx]
            for i in range(start, end, 1):
                c = text[i]
                for alp in self.TESTS_BY_ORDER:
                    if c in self.alphabet_dict[alp]:
                        alp_chars.append(alp)
                        # Go to next character when found alphabet type
                        break

        if len(alp_chars) == 0:
            return None

        ser = pd.Series(alp_chars)
        vals, counts = np.unique(ser, return_counts=True)
        # We map the count to the key, so that when we sort the paired items later,
        # Python sorts by the first element, which is the count
        results = dict(zip(counts, vals))

        # Sort descending by count
        results_list = sorted(results.items(), reverse=True)
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabet detection: ' + str(results_list) + ' details: ' + str(results)
        )

        # Reverse back the mapping
        results_rev = {kv[1]:kv[0] for kv in results_list}
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabet detection results: ' + str(results_rev)
        )
        return results_rev
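
Finally, a usage sketch for LangDetect.detect(); it returns a list of detected ISO 639-1 language codes (possibly empty, or None for empty input), so the expected outputs in the comments are indicative only:

# Usage sketch; expected outputs are indicative only
ld = LangDetect()
for txt in [
    'สวัสดีครับ ผมอยากเช็คยอดเงิน',      # Thai alphabet -> ['th']
    'Привет, как дела?',                # Cyrillic -> ['ru']
    'Xin chào, bạn khỏe không?',        # Latin with Vietnamese diacritics -> ['vi']
]:
    print(txt, '->', ld.detect(text=txt))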