Example #1
    def wait_for_all_initializations_to_be_done(self, wait_max_time=10):
        if self.is_all_initializations_done:
            return

        count = 1
        sleep_time_wait_initializations = 0.1
        while not self.is_all_initializations_done:
            Log.warning(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Model not yet fully initialized, total wait time ' +
                str(count * sleep_time_wait_initializations) + ' secs so far..')
            if count * sleep_time_wait_initializations > wait_max_time:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Waited too long ' + str(count * sleep_time_wait_initializations)\
                         + ' secs. Raising exception..'
                raise Exception(errmsg)
            time.sleep(sleep_time_wait_initializations)
            count = count + 1
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Initializations all done for model "' +
            str(self.identifier_string) + '" READY.')
        return
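
The polling-with-timeout pattern above can be factored into a small generic helper. This is a minimal sketch with hypothetical names, not the project's own API:

import time

def wait_until(predicate, wait_max_time=10, poll_interval=0.1):
    # Poll until predicate() returns True, or raise once the cumulative
    # wait exceeds wait_max_time (mirrors the count * sleep_time check above)
    waited = 0.0
    while not predicate():
        if waited > wait_max_time:
            raise Exception('Waited too long: ' + str(waited) + ' secs')
        time.sleep(poll_interval)
        waited += poll_interval

# Usage (hypothetical): wait_until(lambda: model.is_model_ready())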
Example #2
 def send(self, user, password, recipients_list, message):
     try:
         if password not in [None, '']:
             self.server.login(user=user, password=password)
             Log.important(
                 str(self.__class__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) +
                 ': Login for user "' + str(user) + '" successful.')
         else:
             # If no password passed in, no need to do login
             Log.warning(
                 str(self.__class__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) +
                 ': Not doing login for user "' + str(user) +
                 '", no password given "' + str(password) + '"')
         self.server.sendmail(from_addr=user,
                              to_addrs=recipients_list,
                              msg=message)
         Log.important(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) + ': Message from ' +
             str(user) + ' to ' + str(recipients_list) +
             ' sent successfully. Closing server..')
         self.server.close()
         Log.info(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) + ': Mail server "' +
             str(self.mail_server_url) + '" closed')
     except Exception as ex:
         errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                  + ': Exception sending mail from ' + str(user) + ' to ' + str(recipients_list)\
                  + '. Got exception ' + str(ex) + '.'
         Log.error(errmsg)
         raise Exception(errmsg)
Example #3
    def __init__(self, lang):
        self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
        self.raw_words = None
        self.common_words = None

        lfobj = LangFeatures()
        self.lang_have_verb_conj = lfobj.have_verb_conjugation(lang=self.lang)
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Lang "' +
            str(self.lang) + '" verb conjugation = ' +
            str(self.lang_have_verb_conj) + '.')
        self.word_stemmer = None
        if self.lang_have_verb_conj:
            try:
                self.word_stemmer = Lemmatizer(lang=self.lang)
                Log.important(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': Lang "' +
                    str(self.lang) +
                    '" stemmer/lemmatizer initialized successfully.')
            except Exception as ex_stemmer:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Lang "' + str(self.lang) + ' stemmer/lemmatizer failed to initialize: ' \
                         + str(ex_stemmer) + '.'
                Log.warning(errmsg)
                self.word_stemmer = None

        return
Example #4
    def add_intent_name_to_training_data(self):
        #
        # We need to add intent name into the training data also
        #
        df_intent_id_name = pd.DataFrame({
            DaehuaTrainDataModel.COL_TDATA_INTENT_ID:
            self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_ID],
            DaehuaTrainDataModel.COL_TDATA_INTENT_NAME:
            self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_NAME]
        })
        # Make unique by dropping duplicate intent IDs
        df_intent_id_name.drop_duplicates(inplace=True)

        for idx in df_intent_id_name.index:
            intId = df_intent_id_name[
                DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[idx]
            try:
                int_name = str(df_intent_id_name[
                    DaehuaTrainDataModel.COL_TDATA_INTENT_NAME].loc[idx])

                # Arguments must be in list form, otherwise we will not be able to create this DataFrame
                row_to_append = pd.DataFrame(
                    data=self.__get_row_to_append_to_training_data(
                        intent_id=[intId],
                        intent_name=[int_name],
                        text=[int_name],
                        text_id=[TrDataPreprocessor.TRDATA_ID_INTENT_NAME],
                        # Make sure to write back this value with processed text
                        processed_text=[None],
                        lang_detected=[None],
                        internal_counter=[self.df_training_data.shape[0]]))

                #
                # We are appending to a dataframe that might have a different column ordering,
                # so we make sure the columns are in the same order. This avoids the sort=False/True
                # warning messages from pandas, which are due to the join() operation it would otherwise need.
                # If the columns are already in the same order, the join() is avoided.
                #
                self.df_training_data = self.df_training_data.append(
                    row_to_append, sort=True)
                Log.important(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Appended intent name "' + str(int_name) +
                    '" with intent ID ' + str(intId) +
                    ' to list of training data. Row appended = ' +
                    str(row_to_append))
            except Exception as ex:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                    + ': Could not append to dataframe or could not get intent name for intent ID ' \
                    + str(intId) + '. Exception ' + str(ex)
                Log.warning(errmsg)
                raise Exception(errmsg)

        self.__process_training_data_index()

        return self.df_training_data
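
The column-ordering comments in Example #4 also matter because DataFrame.append was deprecated and later removed in pandas 2.0. Below is a minimal sketch of the same append done with pd.concat plus explicit column alignment; the column names are made up for illustration and are not the project's schema:

import pandas as pd

# Existing training data and a new row whose columns may be in a different order
df_training = pd.DataFrame({'intent_id': [1], 'intent_name': ['greet'], 'text': ['hi']})
new_row = pd.DataFrame({'text': ['hello'], 'intent_id': [1], 'intent_name': ['greet']})

# Align the new row to the existing column order, then concatenate.
# This avoids the sort=True/False warnings that DataFrame.append used to emit.
new_row = new_row.reindex(columns=df_training.columns)
df_training = pd.concat([df_training, new_row], ignore_index=True)
print(df_training)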
Example #5
    def __init__(
            self
    ):
        self.lang_features = LangFeatures()

        # Map alphabet name to unicode character set array
        self.alphabet_dict = {}
        for alp in self.TESTS_BY_ORDER:
            self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet = alp
            )
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabets used: ' + str(self.alphabet_dict.keys())
        )

        self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
        Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

        # Load common words
        self.common_words = {}
        self.common_words[LangFeatures.LANG_EN] = English()
        self.common_words[LangFeatures.LANG_ES] = Spanish()
        self.common_words[LangFeatures.LANG_FR] = French()
        self.common_words[LangFeatures.LANG_ID] = Indonesian()
        self.common_words[LangFeatures.LANG_VI] = Vietnamese()

        # Load stemmers
        self.word_stemmer = {}
        for lang in self.SUPPORTED_LANGS:
            lang_have_verb_conj = self.lang_features.have_verb_conjugation(
                lang = lang
            )
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
            )
            self.word_stemmer[lang] = None
            if lang_have_verb_conj:
                try:
                    self.word_stemmer[lang] = Lemmatizer(
                        lang = lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(lang) + ' stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.warning(errmsg)

        self.profiler_detect_alp = ProfilingHelper(profiler_name = str(self.__class__))

        return
Example #6
    def transform_input_for_model(
            self,
            # For the model to interpret and transform into x usable for model input
            # (e.g. map using one-hot dictionaries)
            x_input,
            word_freq_model = None,
    ):
        try:
            Log.debugdebug('***** x input: ' + str(x_input))
            # We expect x_input to be an np array of words
            if type(x_input) is np.ndarray:
                x_input = x_input.tolist()
            if type(x_input) not in (list, tuple):
                raise Exception(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Model "' + str(self.identifier_string)
                    + '". Expect list/tuple type, got type "' + str(type(x_input))
                    + '" for x input: ' + str(x_input)
                )
            if self.x_one_hot_dict_inverse is None:
                raise Exception(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Model "' + str(self.identifier_string) + '" x one hot not yet initialized!'
                )
            x = []

            for i in range(len(x_input)):
                word = x_input[i]
                if word in self.x_one_hot_dict_inverse.keys():
                    x.append(self.x_one_hot_dict_inverse[word])
                else:
                    Log.warning(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Model "' + str(self.identifier_string) + '", could not map input value "' + str(word)
                        + '" to code x. Not in x one hot dictionary.'
                    )

            # TODO Pad with 0's to satisfy neural network input length
            input_shape = self.network.layers[0].input_shape
            input_len = input_shape[1]
            Log.debugdebug('***** INPUT SHAPE ' + str(input_shape) + ', len ' + str(input_len) + ', x = ' + str(x))
            while len(x) < input_len:
                x = [0] + x
            Log.debugdebug('  ***** padded x: ' + str(x))

            x = np.array(x)
            x_transformed = NumpyUtil.convert_dimension(arr=x, to_dim=2)
            Log.debugdebug('  ***** transformed x: ' + str(x_transformed))

            return x_transformed
        except Exception as ex:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string) + '", exception tranforming ' + str(x_input)
                + '. Exception: ' + str(ex)
            )
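
The left-padding step above (prepending zeros until the encoded sequence matches the network input length) is small enough to isolate. A standalone sketch, not the project's code:

import numpy as np

def pad_left(seq, target_len, pad_value=0):
    # Prepend pad_value until the sequence reaches target_len,
    # mirroring the `while len(x) < input_len: x = [0] + x` loop above
    return np.array([pad_value] * max(0, target_len - len(seq)) + list(seq))

print(pad_left([5, 9, 2], 6))  # [0 0 0 5 9 2]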
Example #7
    def build_tree(self, dict_parent_childs):
        self.reset_tree()
        for parent_key in dict_parent_childs.keys():
            child_keys = dict_parent_childs[parent_key]
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Doing for line ' + str(parent_key) + ': ' + str(child_keys)
            )
            if parent_key not in self.tree_nodes.keys():
                Log.debug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Create new parent ' + str(parent_key)
                )
                parent = MultiTreeNode(name=parent_key, dead_node=False)
                self.tree_nodes[parent_key] = parent
            else:
                parent = self.tree_nodes[parent_key]
                Log.debug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Retrieved parent ' + str(parent.name)
                )

            for child_k in child_keys:
                if child_k == parent_key:
                    Log.warning(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Child "' + str(child_k) + '" same as parent "' + str(parent_key) + '" Ignoring...'
                    )
                    continue
                Log.debug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Doing child ' + str(child_k) + ' for parent ' + str(parent_key)
                )
                if child_k not in self.tree_nodes.keys():
                    Log.debug(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Create new child ' + str(child_k)
                    )
                    child = MultiTreeNode(name=child_k, dead_node=False)
                    self.tree_nodes[child_k] = child
                else:
                    child = self.tree_nodes[child_k]
                    Log.debug(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Retrieved child ' + str(child.name)
                    )

                Log.debug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Child ' + str(child.name) + ' adding parent ' + str(parent.name)
                )
                child.add_parent(parent=parent)

        self.build_tree_roots()
        return self.tree_nodes
Example #8
 def __init__(self, noun_case_endings=NOUN_PARTICLES, verb_case_endings=()):
     super().__init__(noun_case_endings=noun_case_endings,
                      verb_case_endings=verb_case_endings)
     try:
         # Split Hangul (한글) syllables into letters (jamo, 자모)
         # https://github.com/JDongian/python-jamo, https://python-jamo.readthedocs.io/en/latest/
         from jamo import h2j, j2hcj
     except Exception as ex:
         errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': Error importing jamo library: ' + str(ex)
         Log.warning(errmsg)
         raise Exception(errmsg)
     return
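
For reference, the jamo library imported above decomposes Hangul syllables roughly like this (a minimal sketch based on its documented h2j/j2hcj helpers; the output shown is illustrative):

from jamo import h2j, j2hcj

# h2j: Hangul syllables -> conjoining jamo; j2hcj: conjoining jamo -> compatibility jamo
decomposed = ''.join(j2hcj(h2j('한글')))
print(decomposed)  # expected roughly 'ㅎㅏㄴㄱㅡㄹ'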
Example #9
 def slice_str(x, maxlen):
     len_x = len(str(x))
     l = min(len_x, maxlen)
     if l < len_x:
         x_slice = str(x)[0:l]
         Log.warning(
             str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
             + ': Cut from length ' + str(len_x) + ' to ' + str(maxlen)
             + ' characters. From "' + str(x) + '" to "' + str(x_slice) + '"'
         )
         return x_slice
     else:
         return x
Example #10
    def __send_email(self, text_subject, text_msg, files, ignore_limit):
        email_msg = SendMail.prepare_message(
            from_addr=self.from_addr,
            to_addrs_list=self.alert_recipients,
            subject=text_subject,
            text=text_msg,
            files=files)
        try:
            # Check how many already sent this hour
            if datetime.now().hour != self.current_hour:
                self.current_hour = datetime.now().hour
                self.emails_sent_this_hour = 0

            if not ignore_limit:
                if self.emails_sent_this_hour >= self.limit_per_hour:
                    Log.warning(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) +
                        ': Send email alert limit ' +
                        str(self.limit_per_hour) +
                        ' per hour hit. Not sending subject: "' +
                        str(text_subject) + '", message: ' + str(text_msg))
                    return
            else:
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Ignoring send limit of ' + str(self.limit_per_hour) +
                    ' per hour.')

            if self.fake_send:
                print('Fake send email from "' + str(self.from_addr) +
                      '" to: ' + str(self.alert_recipients) + ' Message:\n\r' +
                      str(email_msg))
            else:
                SendMail(mode=self.mail_mode,
                         mail_server_url=self.mail_server_url,
                         mail_server_port=self.mail_server_port).send(
                             user=self.from_addr,
                             password=self.password,
                             recipients_list=self.alert_recipients,
                             message=email_msg)
            self.emails_sent_this_hour += 1
        except Exception as ex_mail:
            Log.error(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Error sending email: ' + str(ex_mail) +
                '. Could not send message: ' + str(email_msg))
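
The per-hour throttling in Example #10 (reset the counter when the wall-clock hour changes, then cap sends per hour) can be isolated into a small class. A minimal sketch with hypothetical names, not the project's implementation:

from datetime import datetime

class HourlyLimiter:
    def __init__(self, limit_per_hour):
        self.limit_per_hour = limit_per_hour
        self.current_hour = datetime.now().hour
        self.sent_this_hour = 0

    def allow(self):
        # Reset the counter whenever the wall-clock hour changes
        if datetime.now().hour != self.current_hour:
            self.current_hour = datetime.now().hour
            self.sent_this_hour = 0
        if self.sent_this_hour >= self.limit_per_hour:
            return False
        self.sent_this_hour += 1
        return True

# Usage (hypothetical): if limiter.allow(): send_email(...)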
Example #11
 def print_tree(self, level, tnode, max_levels=8, newline='\n\r', tabchar='\t'):
     if level > max_levels:
         Log.warning(
             str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
              + ': Level exceeded max ' + str(max_levels) + ' for "' + str(tnode.name) + '"'
         )
         return ''
     tabstr = ''
     for i in range(level):
         tabstr += tabchar
     string_to_print = str(tabstr) + 'Level ' + str(level) + ': ' + str(tnode.name) + str(newline)
     for child in tnode.children:
         string_to_print += self.print_tree(
             level=level+1, tnode=child, max_levels=max_levels, newline=newline, tabchar=tabchar
         )
     return string_to_print
Example #12
 def wait_for_model(self):
     count = 1
     sleep_time_wait_rfv = 0.1
     wait_max_time = 10
     while not self.is_model_ready():
         Log.warning(
             str(__name__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Waiting for model with identifier "' +
              str(self.identifier_string) + '", total wait time ' +
              str(count * sleep_time_wait_rfv) + ' secs so far..')
         if count * sleep_time_wait_rfv > wait_max_time:
             errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                      + ': Waited too long for model "' + str(self.identifier_string) \
                      + '" total wait time ' + str(count * sleep_time_wait_rfv) + ' secs. Raising exception..'
             raise Exception(errmsg)
         time.sleep(sleep_time_wait_rfv)
         count = count + 1
Example #13
 def is_higher_level(self, node, supposed_child_node):
     Log.debug(
         '***** check if "' + str(supposed_child_node.name) + '" is higher level than "'
         + str(node.name) + '", parents: ' + str(node.parent_names)
     )
     if supposed_child_node.name in node.parent_names:
         Log.warning(
             str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
             + ': Node "' + str(self.name) + '" cannot add "' + str(supposed_child_node.name)
             + '" as child. Node "' + str(supposed_child_node.name)
             + '" is already a higher level parent node to "' + str(self.name) + '"'
         )
         return True
     for par in node.parents:
         if self.is_higher_level(node=par, supposed_child_node=supposed_child_node):
             return True
         else:
             continue
     return False
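
The recursion in Example #13 is an ancestor check used to reject edges that would create a cycle. Below is a minimal standalone sketch of the same idea over a plain parent map; the names are made up:

def is_ancestor(node, candidate, parents):
    # parents: dict mapping a node name to the set of its parent names
    if candidate in parents.get(node, set()):
        return True
    return any(is_ancestor(p, candidate, parents) for p in parents.get(node, set()))

parents = {'c': {'b'}, 'b': {'a'}}
print(is_ancestor('c', 'a', parents))  # True: 'a' is an ancestor of 'c', so it must not become a child of 'c'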
Example #14
 def __sanity_check(
     self,
     sentences_list,
 ):
     for sent in sentences_list:
         if type(sent) not in (list, tuple):
             errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                       + ': Sentence ' + str(sent) + ' is not of list/tuple type but type "'\
                      + str(type(sent)) + '": ' + str(sent)
             Log.warning(errmsg)
             raise Exception(errmsg)
         for j in range(len(sent)):
             w = sent[j]
             if type(w) is not str:
                 errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                           + ': Sentence ' + str(sent) + ' has a non-string word of type "' \
                          + str(type(w)) + '": ' + str(w)
                 Log.warning(errmsg)
                 raise Exception(errmsg)
     return
Example #15
    def wait_for_model_to_be_ready(self, wait_max_time=10):
        #
        # Model reloaded without us knowing, e.g. user trained it, etc.
        #
        if self.model_last_reloaded_counter != self.model.get_model_reloaded_counter(
        ):
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                + 'Model "' + str(self.identifier_string) + '" last counter '
                + str(self.model_last_reloaded_counter) + ' not equal to model counter '
                + str(self.model.get_model_reloaded_counter())
                + '. Model updated, thus we must update our text processor.'
            )
            #
            # Must load again because the TxtPreprocessor class needs data from the model
            #
            self.load_text_processor()

        if self.model.is_model_ready():
            return

        count = 1
        sleep_time_wait_model = 0.1
        while not self.model.is_model_ready():
            Log.warning(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Model "' +
                str(self.identifier_string) + '" not yet ready, sleep for ' +
                str(count * sleep_time_wait_model) + ' secs now..')
            if count * sleep_time_wait_model > wait_max_time:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Waited for model "' + str(self.identifier_string)\
                         + '" too long ' + str(count * sleep_time_wait_model) + ' secs. Raising exception..'
                raise Exception(errmsg)
            time.sleep(sleep_time_wait_model)
            count = count + 1
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Model "' +
            str(self.identifier_string) + '" READY.')
        return
Example #16
    def __attach_file_check_validity_and_size(
            files_attachment_list,
            max_total_files_size=MAX_TOTAL_FILES_SIZE_MB_EMAIL_ATTCH):
        if files_attachment_list is None:
            return []

        files_attachment_list_allowed = []

        cum_size_mb = 0.0
        for filepath in files_attachment_list:
            if os.path.isfile(filepath):
                Log.info('File <' + str(__name__) + '> line ' +
                         str(getframeinfo(currentframe()).lineno) +
                         ': Attachment file path "' + str(filepath) + '" OK')
            else:
                Log.error('File <' + str(__name__) + '> line ' +
                          str(getframeinfo(currentframe()).lineno) +
                          ': Invalid attachment file "' + str(filepath) +
                          '", not attaching to email')
                continue

            fsize_bytes = os.path.getsize(filepath)
            fsize_mb = round(fsize_bytes / (1024 * 1024), 2)

            if fsize_mb + cum_size_mb < max_total_files_size:
                files_attachment_list_allowed.append(filepath)
                cum_size_mb += fsize_mb
                Log.info('File <' + str(__name__) + '> line ' +
                         str(getframeinfo(currentframe()).lineno) +
                         ': Appended file "' + str(filepath) +
                         '" as email attachment size ' + str(fsize_mb) +
                         'MB, total cumulative ' + str(cum_size_mb) + 'MB')
            else:
                Log.warning('File <' + str(__name__) + '> line ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': File "' + str(filepath) + '" too big ' +
                            str(fsize_mb) + 'MB. Cumulative = ' +
                            str(fsize_mb + cum_size_mb) + 'MB.' +
                            ' Not attaching to email')
        return files_attachment_list_allowed
Example #17
    def confirm_form(self, answer):
        answer = StringUtils.trim(answer)
        if answer.lower() in self.text_list_confirm_words:
            self.set_state_form_completed_and_confirmed()
            self.reset_continuous_error_count()
            return True
        else:
            # Try to update all fields strictly, maybe user wants to change something
            result = self.set_all_field_value_from_answer(answer=answer)
            if result.is_updated:
                self.reset_continuous_error_count()
            else:
                self.increment_continuous_error_count()

            if self.is_error_threshold_hit():
                Log.warning(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Reset form after ' +
                    str(self.fill_form_continuous_err_count) +
                    ' error counts.')
                self.reset()
            # No form confirmation
            return False
Example #18
    def preprocess_list_all_langs(
        self,
        sentences_list,
        # The output required may differ for different downstream processing steps.
        # Some may require POS tagging, some lemmatization, some stop word removal, etc.
        algorithm=None,
    ):
        langs_list = self.detect_lang(sentences_list=sentences_list,
                                      method='nwae')
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Done detecting ' +
            str(len(sentences_list)) + ' sentence languages: ' +
            str(langs_list))
        # Get default lang as most common language detected
        langs_counter = collections.Counter(langs_list).most_common()
        self.lang_default = None
        for lang_count in langs_counter:
            if lang_count[0] != '':
                self.lang_default = lang_count[0]
                break
        # If still no default language (nothing detected at all from sentences passed in)
        if self.lang_default is None:
            self.lang_default = LangFeatures.LANG_EN
            langs_list = langs_list + [self.lang_default]
            Log.warning(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Unable to determine default language from langs ' +
                str(langs_counter) + ' Using default lang "' +
                str(self.lang_default) + '"')
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Most common language detected "' + str(self.lang_default) +
            '" from ' + str(langs_counter))
        unique_langs = [l for l in list(set(langs_list)) if l]
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Unique langs found: ' + str(unique_langs))
        for lang in unique_langs:
            Log.important(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Creating lang "' + str(lang) + '" word segmenter.')
            try:
                obj_tmp = TxtPreprocessor(
                    identifier_string=lang,
                    dir_path_model=None,
                    model_features_list=None,
                    lang=lang,
                    dir_wordlist=self.dir_wordlist,
                    postfix_wordlist=self.postfix_wordlist,
                    dir_wordlist_app=self.dir_app_wordlist,
                    postfix_wordlist_app=self.postfix_app_wordlist,
                    dirpath_synonymlist=self.dir_synlist,
                    postfix_synonymlist=self.postfix_synlist,
                    stopwords_list=self.stopwords_list,
                )
                self.txt_preprcsr_by_lang[lang] = obj_tmp
            except Exception as ex_load_txtprcsr:
                Log.error(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Failed to load text processor all lang for lang "' +
                    str(lang) + '": ' + str(ex_load_txtprcsr))
        sentences_list_processed = []
        for i in range(len(sentences_list)):
            sent = sentences_list[i]
            lang = langs_list[i]
            if lang not in self.txt_preprcsr_by_lang.keys():
                Log.debug(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': Lang "' +
                    str(lang) + '" not in keys ' +
                    str(self.txt_preprcsr_by_lang.keys()))
                lang = self.lang_default
            sent_processed = self.txt_preprcsr_by_lang[lang].process_text(
                inputtext=sent, )
            Log.debug(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Preprocessed sentence "' + str(sent) + '" to "' +
                str(sent_processed) + '"')
            sentences_list_processed.append(sent_processed)

        return sentences_list_processed
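
The default-language selection above (most common non-empty detection result, else English) reduces to collections.Counter. A minimal sketch with made-up language codes:

import collections

langs_list = ['en', 'en', '', 'vi', 'en', '']
lang_default = None
# most_common() yields (value, count) pairs sorted by descending count
for lang, count in collections.Counter(langs_list).most_common():
    if lang != '':
        lang_default = lang
        break
if lang_default is None:
    lang_default = 'en'  # fall back to a default language
print(lang_default)  # 'en'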
Example #19
    def train(
        self,
        write_model_to_storage=True,
        write_training_data_to_storage=False,
        # Option to train a single y ID/label
        y_id=None,
        # To keep training logs here for caller's reference
        log_list_to_populate=None):
        prf_start = prf.Profiling.start()

        if self.training_data is None:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Cannot train without training data for identifier "' +
                self.identifier_string + '"')

        self.mutex_training.acquire()
        try:
            if type(log_list_to_populate) is list:
                self.logs_training = log_list_to_populate
            else:
                self.logs_training = []

            Log.important(str(self.__class__) + ' ' +
                          str(getframeinfo(currentframe()).lineno) +
                          ': Training for identifier=' +
                          self.identifier_string + ', y_id ' + str(y_id) +
                          '. Using key features remove quartile = ' +
                          str(self.key_features_remove_quartile) +
                          ', stop features = [' + str(self.stop_features) +
                          ']' + ', weigh by EIDF = ' + str(self.weigh_idf),
                          log_list=self.logs_training)

            #
            # Here training data must be prepared in the correct format already
            # Meaning the set of features has already been combined into one (unified features)
            #
            # Log.debugdebug(
            #     str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            #     + '\n\r\tTraining data:\n\r' + str(self.training_data.get_x().tolist())
            #     + '\n\r\tx names: ' + str(self.training_data.get_x_name())
            #     + '\n\r\ty labels: ' + str(self.training_data.get_y())
            # )

            #
            # Get IDF first
            # The function of these weights is nothing more than dimension reduction
            # TODO: IDF may not be the ideal weights, design an optimal one.
            #
            if self.weigh_idf:
                if MetricSpaceModel.USE_OPIMIZED_IDF:
                    try:
                        Log.info(
                            str(self.__class__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': Initializing EIDF object.. try to read from file first',
                            log_list=self.logs_training)
                        # Try to read from file
                        df_eidf_file = eidf.Eidf.read_eidf_from_storage(
                            dir_path_model=self.dir_path_model,
                            identifier_string=self.identifier_string,
                            x_name=self.training_data.get_x_name())
                        Log.info(str(self.__class__) + ' ' +
                                 str(getframeinfo(currentframe()).lineno) +
                                 ': Successfully Read EIDF from file.',
                                 log_list=self.logs_training)
                        self.model_data.idf = np.array(
                            df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF])
                    except Exception as ex_eidf:
                        Log.critical(
                            str(self.__class__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': No EIDF from file available. Recalculating EIDF..',
                            log_list=self.logs_training)
                        idf_opt_obj = eidf.Eidf(
                            x=self.training_data.get_x(),
                            y=self.training_data.get_y(),
                            x_name=self.training_data.get_x_name())
                        idf_opt_obj.optimize(initial_w_as_standard_idf=True)
                        self.model_data.idf = idf_opt_obj.get_w()
                else:
                    # Sum x by class
                    self.model_data.idf = eidf.Eidf.get_feature_weight_idf_default(
                        x=self.training_data.get_x(),
                        y=self.training_data.get_y(),
                        x_name=self.training_data.get_x_name())
            else:
                self.model_data.idf = np.array(
                    [1.0] * self.training_data.get_x_name().shape[0],
                    dtype=float)

            # Standardize to at least 2-dimensional, easier when weighting x
            self.model_data.idf = npUtil.NumpyUtil.convert_dimension(
                arr=self.model_data.idf, to_dim=2)

            Log.debugdebug(str(self.__class__) + ' ' +
                           str(getframeinfo(currentframe()).lineno) +
                           '\n\r\tEIDF values:\n\r' + str(self.model_data.idf),
                           log_list=self.logs_training)

            #
            # Re-weigh again. This will change the x in self.training data
            #
            self.training_data.weigh_x(w=self.model_data.idf[0])

            #
            # Initialize model data
            #
            # Refetch again after weighing
            x = self.training_data.get_x()
            y = self.training_data.get_y()
            self.model_data.x_name = self.training_data.get_x_name()

            # Unique y or classes
            # We do this again because after weighing, it will remove bad rows, which might cause some y
            # to disappear
            self.model_data.y_unique = np.array(list(set(y)))

            Log.debugdebug(str(self.__class__) + ' ' +
                           str(getframeinfo(currentframe()).lineno) +
                           '\n\r\tx weighted by idf and renormalized:\n\r' +
                           str(x.tolist()) + '\n\r\ty\n\r' + str(y) +
                           '\n\r\tx_name\n\r' + str(self.model_data.x_name),
                           log_list=self.logs_training)

            #
            # Get RFV for every command/intent, representative feature vectors by command type
            #

            # 1. Cluster training data of the same intent.
            #    Instead of a single RFV to represent a single intent, we should have multiple.
            xy_clstr = MetricSpaceModel.get_clusters(
                x=x,
                y=y,
                x_name=self.model_data.x_name,
                log_training=self.logs_training)
            self.model_data.x_clustered = xy_clstr.x_cluster
            self.model_data.y_clustered = xy_clstr.y_cluster
            self.model_data.y_clustered_radius = xy_clstr.y_cluster_radius

            #
            # RFV Derivation
            #
            m = np.zeros(
                (len(self.model_data.y_unique), len(self.model_data.x_name)))
            # Temporary only this data frame
            df_x_ref = pd.DataFrame(m,
                                    columns=self.model_data.x_name,
                                    index=list(self.model_data.y_unique))
            #print('***** y unique type: ' + str(type(self.model_data.y_unique)) + ', df_x_ref: '  + str(df_x_ref))
            self.model_data.df_y_ref_radius = pd.DataFrame(
                {
                    MetricSpaceModel.TERM_CLASS:
                    list(self.model_data.y_unique),
                    MetricSpaceModel.TERM_RADIUS:
                    [MetricSpaceModel.HPS_MAX_EUCL_DIST] *
                    len(self.model_data.y_unique),
                },
                index=list(self.model_data.y_unique))
            #print('***** df_x_ref: '  + str(self.model_data.df_y_ref_radius))

            #
            # Derive x_ref and y_ref
            #
            for cs in self.model_data.y_unique:
                Log.debug(str(self.__class__) + ' ' +
                          str(getframeinfo(currentframe()).lineno) +
                          ': Doing class [' + str(cs) + ']',
                          log_list=self.logs_training)
                # Extract class points
                class_points = x[y == cs]
                #
                # Reference feature vector for the command is the average of all feature vectors
                #
                rfv = np.sum(class_points, axis=0) / class_points.shape[0]
                # Renormalize it again
                # At this point we don't have to check if it is a 0 vector, etc. as it was already done in TrainingDataModel
                # after weighing process
                normalize_factor = np.sum(np.multiply(rfv, rfv))**0.5
                if normalize_factor < const.Constants.SMALL_VALUE:
                    raise Exception(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) +
                        ': Normalize factor for rfv in class "' + str(cs) +
                        '" is 0.')
                rfv = rfv / normalize_factor
                # A single array will be created as a column dataframe, thus we have to name the index and not columns
                df_x_ref.at[cs] = rfv

                check_normalized = np.sum(np.multiply(rfv, rfv))**0.5
                if abs(check_normalized - 1) > const.Constants.SMALL_VALUE:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                             + ': Warning! RFV for class [' + str(cs) + '] not 1, but [' + str(check_normalized) + '].'
                    Log.warning(errmsg, log_list=self.logs_training)
                    raise Exception(errmsg)
                else:
                    Log.debug(str(self.__class__) + ' ' +
                              str(getframeinfo(currentframe()).lineno) +
                              ': Check RFV class "' + str(cs) +
                              '" normalized ok [' + str(check_normalized) +
                              '].',
                              log_list=self.logs_training)

                #
                # Get the furthest class point from the rfv
                # This radius will be used to accept or reject a point assigned to this class
                # once the nearest class is found (if rejected, no class is returned).
                #
                # Minimum value of threshold, don't allow 0's
                radius_max = -1
                for i in range(0, class_points.shape[0], 1):
                    p = class_points[i]
                    dist_vec = rfv - p
                    dist = np.sum(np.multiply(dist_vec, dist_vec))**0.5
                    Log.debugdebug(str(self.__class__) + ' ' +
                                   str(getframeinfo(currentframe()).lineno) +
                                   '   Class ' + str(cs) + ' check point ' +
                                   str(i) + ', distance= ' + str(dist) +
                                   '. Point ' + str(class_points[i]) +
                                   ' with RFV ' + str(rfv),
                                   log_list=self.logs_training)
                    if dist > radius_max:
                        radius_max = dist
                        self.model_data.df_y_ref_radius[
                            MetricSpaceModel.TERM_RADIUS].at[cs] = dist

                Log.debug(str(self.__class__) + ' ' +
                          str(getframeinfo(currentframe()).lineno) +
                          ': Class "' + str(cs) + '". Max Radius = ' +
                          str(self.model_data.df_y_ref_radius[
                              MetricSpaceModel.TERM_RADIUS].loc[cs]),
                          log_list=self.logs_training)
            df_x_ref.sort_index(inplace=True)
            self.model_data.y_ref = np.array(df_x_ref.index)
            self.model_data.x_ref = np.array(df_x_ref.values)
            Log.debug('**************** ' + str(self.model_data.y_ref))

            if self.do_profiling:
                Log.important(str(self.__class__) + ' ' +
                              str(getframeinfo(currentframe()).lineno) +
                              ' PROFILING train(): ' +
                              prf.Profiling.get_time_dif_str(
                                  prf_start, prf.Profiling.stop()),
                              log_list=self.logs_training)

            if write_model_to_storage:
                self.persist_model_to_storage()
            if write_training_data_to_storage or (self.is_partial_training):
                self.persist_training_data_to_storage(td=self.training_data)
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Training exception for identifier "' + str(self.identifier_string) + '".'\
                     + ' Exception message ' + str(ex) + '.'
            Log.error(errmsg)
            raise ex
        finally:
            self.mutex_training.release()

        return self.logs_training
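
The RFV derivation in the comments of Example #19 (class mean re-normalized to unit length, with the furthest class member defining the class radius) can be summarized in a few numpy lines. A toy sketch only, not the full MetricSpaceModel logic:

import numpy as np

# Toy feature vectors belonging to one class (rows already weighted/normalized)
class_points = np.array([[0.6, 0.8, 0.0],
                         [0.8, 0.6, 0.0],
                         [0.7, 0.7, 0.1]])

# RFV = mean of the class points, re-normalized to unit length
rfv = class_points.sum(axis=0) / class_points.shape[0]
rfv = rfv / np.sqrt(np.sum(rfv * rfv))

# Class radius = distance of the furthest class point from the RFV
radius = max(np.sqrt(np.sum((p - rfv) ** 2)) for p in class_points)
print(rfv, radius)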
Example #20
# -*- coding: utf-8 -*-

from nwae.utils.Log import Log
from inspect import getframeinfo, currentframe
import nwae.lang.LangFeatures as lf
import nwae.utils.UnitTest as ut
try:
    import hanzidentifier as hz
except Exception as ex:
    Log.warning(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
        ': Cannot import hanzidentifier: ' + str(ex))
    pass

#
# Class LangCharacters:
#   This class lays the fundamentals for dealing with characters & strings of multiple languages.
#   We define Unicode blocks for the relevant language characters, including punctuations, etc.
#   Every alphabet or character has a Unicode code point (at most 0x10FFFF, which fits in 4 bytes)
#
#   But when required to store as a string variable, it has to undergo a transformation into say
#   UTF-8. This is purely for compression so we don't store each character as 4 bytes.
#   chr() converts a Unicode value to a Unicode string, e.g. the Unicode value 0x9a6c or 39532
#   is converted to '马' (either stored as UTF-8 or some encoding).
#
#   Another difference with R is that in Python, we always need to convert strings to Unicode form
#   for the above functions to work. In R this is handled transparently.
#
#   The Python function ord() does the opposite, converts '马' back to its integer Unicode value.
#
# Supports:
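
A short illustration of the chr()/ord() round trip described in the header comment above:

# Unicode code point 0x9A6C (39532) is the character '马'
print(chr(0x9A6C))           # '马'
print(ord('马'))             # 39532
print('马'.encode('utf-8'))  # b'\xe9\xa9\xac' - three bytes in UTF-8, not four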
Example #21
    def add_latin_form_to_training_data(self):
        #
        # We only support this complication if the main language has a Latin Equivalent Form
        # We ignore if it is only an additional language, to reduce complexity
        #
        if not latinEqForm.LatinEquivalentForm.have_latin_equivalent_form(
                lang=self.language_main):
            Log.important(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': For "' +
                str(self.model_identifier) + '", language "' +
                str(self.language_main) +
                '", nothing to do for latin equivalent form.')
            return

        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': For "' +
            str(self.model_identifier) + '", language "' +
            str(self.language_main) +
            '", adding to training data, the latin equivalent form.')
        for idx in self.df_training_data.index:
            text = str(self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_TEXT].loc[idx])
            text_processed = str(self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED].loc[idx])
            internal_counter = self.df_training_data[
                TrDataPreprocessor.TD_INTERNAL_COUNTER].loc[idx]
            #
            # Process the sentence, word by word
            #
            word_sep = BasicPreprocessor.get_word_separator(
                lang=self.language_main)
            latin_form_sentence_arr = []
            for word in text_processed.split(sep=word_sep):
                word_latin = latinEqForm.LatinEquivalentForm.get_latin_equivalent_form(
                    lang=self.language_main, word=word)
                latin_form_sentence_arr.append(word_latin)
            latin_form_sentence_txt = word_sep.join(latin_form_sentence_arr)
            if latin_form_sentence_txt == text_processed:
                continue

            Log.important(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Processing latin equivalent form "' +
                str(latin_form_sentence_txt) + '" for sentence "' +
                str(text_processed) + '".')
            int_id = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[idx]
            int_name = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_INTENT_NAME].loc[idx]
            row_to_append = None
            try:
                # Arguments must be in list form, otherwise we will not be able to create this DataFrame
                row_to_append = pd.DataFrame(
                    data=self.__get_row_to_append_to_training_data(
                        intent_id=[int_id],
                        intent_name=[int_name],
                        text=[text],
                        text_id=[TrDataPreprocessor.TRDATA_ID_LATIN_FORM],
                        processed_text=[latin_form_sentence_txt],
                        lang_detected=[self.language_main],
                        internal_counter=[internal_counter]))
                #
                # We are appending to a dataframe that might have a different column ordering,
                # so we make sure the columns are in the same order. This avoids the sort=False/True
                # warning messages from pandas, which are due to the join() operation it would otherwise need.
                # If the columns are already in the same order, the join() is avoided.
                #
                self.df_training_data = self.df_training_data.append(
                    row_to_append, sort=True)
                Log.important(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Appended latin equivalent form "' +
                    str(latin_form_sentence_txt) + '" with intent ID ' +
                    str(int_id) +
                    ' to list of training data. Row appended = ' +
                    str(row_to_append))
            except Exception as ex:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                    + ': Could not append row ' + str(row_to_append) + ' to dataframe for intent ID ' \
                    + str(int_id) + '. Exception ' + str(ex)
                Log.warning(errmsg)
                raise Exception(errmsg)
        self.__process_training_data_index()
        return
Example #22
    def process_text_training_data(self, ):
        # The algorithm to segment words works as follows:
        #   If segmented text returned from DB is None or shorter than text, we will process the text.
        #   However if the flag self.reprocess_all_text == True, we segment no matter what.

        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': START SEGMENT & STEM DB TRAINING DATA, FORCE RESEGMENT ALL = ' +
            str(self.reprocess_all_text))

        td_total_rows = self.df_training_data.shape[0]
        count = 0

        for idx_row in self.df_training_data.index:
            count = count + 1
            text_from_db = str(self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_TEXT].loc[idx_row])
            text_processed_from_db = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED].loc[idx_row]
            intent_td_id = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_TRAINING_DATA_ID].loc[idx_row]
            intent_id = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[idx_row]
            intent_name = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_INTENT_NAME].loc[idx_row]
            # Internal Counter
            internal_counter = self.df_training_data[
                TrDataPreprocessor.TD_INTERNAL_COUNTER].loc[idx_row]

            Log.debugdebug('Processing index row "' + str(idx_row) + '" ' +
                           str(self.df_training_data.loc[idx_row]) + '"')

            if type(text_from_db) is not str:
                Log.warning(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Text from DB "' + str(text_from_db) +
                    '" not string type.')
                text_from_db = str(text_from_db)
            # When a text is updated in DB/storage, this field should be cleared in DB to NULL
            if text_processed_from_db is None:
                text_processed_from_db = ''

            possible_langs = self.lang_detect.detect(text=text_from_db)
            # Empty list
            if not possible_langs:
                lang_detected = self.language_main
            else:
                lang_detected = possible_langs[0]

            # If detected language not supported
            if lang_detected not in [self.language_main
                                     ] + self.languages_additional:
                Log.warning(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': For "' +
                    str(self.model_identifier) + '", detected lang "' +
                    str(lang_detected) + '" not in languages supported')
                lang_detected = self.language_main
            # Update data frame with language detected
            self.df_training_data[DaehuaTrainDataModel.COL_TDATA_TEXT_LANG].at[idx_row] = \
                lang_detected

            #if lang_detected != self.language_main:
            Log.info(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Lang "' +
                str(lang_detected) + '" main lang "' +
                str(self.language_main) + '" for text "' + str(text_from_db) +
                '".')

            #
            # Sanity check only. Should not happen since after every training data update,
            # NULL would be written back to the TextSegmented column.
            # Because reprocessing all text takes time, we first guess whether it changed
            #
            is_likely_processed_text_changed = len(
                text_processed_from_db) < len(text_from_db)
            # If a language has verb conjugation, we cannot just compare length as the original text could be longer
            if self.lang_have_verb_conj[lang_detected]:
                # So we just hardcode a minimum length threshold
                is_likely_processed_text_changed = len(
                    text_processed_from_db) <= 8

            if is_likely_processed_text_changed:
                if (intent_td_id is not None) and (intent_td_id > 0):
                    # Warn only if it is not our own inserted data
                    Log.warning(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) + ': Text "' +
                        str(text_from_db) +
                        '" likely has incorrect segmentation "' +
                        str(text_processed_from_db) + '".')

            #
            # We only reprocess the text if there is some likelihood of change
            #
            if self.reprocess_all_text or is_likely_processed_text_changed:
                processed_text_str = self.txt_preprocessor[
                    lang_detected].process_text(inputtext=text_from_db,
                                                return_as_string=True)
                Log.debug(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': Text "' +
                    str(text_from_db) + '" processed text "' +
                    str(processed_text_str) + '".')

                is_text_processed_changed = not (text_processed_from_db
                                                 == processed_text_str)
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': No ' +
                    str(count) + ' of ' + str(td_total_rows) +
                    ': Tr Data ID "' + str(intent_td_id) +
                    '". Force segment = ' + str(self.reprocess_all_text) +
                    '\n\r   Text "' + str(text_from_db) + '". Processed to "' +
                    str(processed_text_str) + '"' + ', changed = ' +
                    str(is_text_processed_changed))

                # Training data ID 0 rows are those we inserted ourselves, so no need to update anything
                if is_text_processed_changed:
                    # Update the column
                    self.df_training_data[DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED].at[idx_row] = \
                        processed_text_str

                    # For intent name we inserted, no need to warn
                    if (intent_td_id is not None) and (intent_td_id > 0):
                        Log.warning(
                            str(self.__class__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': Processed text different. Text "' +
                            str(text_from_db) + '\n\r   new processed text "' +
                            str(processed_text_str) + '"' +
                            '\n\r   old processed text "' +
                            str(text_processed_from_db) + '"')

                        row_changed = self.__get_row_to_append_to_training_data(
                            intent_id=intent_id,
                            intent_name=intent_name,
                            text=text_from_db,
                            text_id=intent_td_id,
                            processed_text=processed_text_str,
                            lang_detected=lang_detected,
                            internal_counter=internal_counter)
                        self.list_of_rows_with_changed_processed_text.append(
                            row_changed)
                        Log.important(
                            str(self.__class__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': Appended changed row: ' + str(row_changed))
                    else:
                        Log.important(
                            str(self.__class__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': Processed text ' + str(count) + ' ok "' +
                            str(processed_text_str) + '" from "' +
                            str(text_from_db) + '"')
            else:
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Training data ID ' + str(intent_td_id) + ': No ' +
                    str(count) + ' of ' + str(td_total_rows) +
                    ': Nothing to do, OK segmented/processed from DB "' +
                    str(text_processed_from_db) + '"')
        return
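The change-detection heuristic used at the top of this example can be isolated into a small standalone sketch; the 8-character threshold mirrors the code above, while the sample values below are purely illustrative.

def is_likely_processed_text_changed(text_from_db, text_processed_from_db, lang_has_verb_conj):
    if lang_has_verb_conj:
        # Lemmatization may shorten or lengthen the text, so a length comparison is unreliable;
        # fall back to the hardcoded "suspiciously short" threshold used above
        return len(text_processed_from_db) <= 8
    # Without verb conjugation, segmentation only inserts separators,
    # so a shorter processed text suggests it was never (re)processed
    return len(text_processed_from_db) < len(text_from_db)

print(is_likely_processed_text_changed('how are you', '', lang_has_verb_conj=True))                    # True
print(is_likely_processed_text_changed('สบายดีไหมครับ', 'สบายดี ไหม ครับ', lang_has_verb_conj=False))  # False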
Example #23
0
 def warn_korean(self):
     Log.warning(
         str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
         + ': Korean splitting currently uses kkma which is super slow and unusable for production purposes'
     )
Example #24
0
    def preprocess_training_data_text(self):
        # Add intent names into the training data (these need no text processing)
        self.add_intent_name_to_training_data()
        self.process_text_training_data()
        self.add_latin_form_to_training_data()

        try:
            from nwae.ml.text.TxtTransform import TxtTransform
            # Conversion to padded docs
            res = TxtTransform(docs=list(self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED]),
                               labels=list(self.df_training_data[
                                   DaehuaTrainDataModel.COL_TDATA_INTENT_ID]),
                               langs=list(self.df_training_data[
                                   DaehuaTrainDataModel.COL_TDATA_TEXT_LANG])
                               ).create_padded_docs()
            Log.debug(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Padded Docs: ' +
                str(res.padded_encoded_docs) + ', Labels: ' +
                str(res.encoded_labels))
            Log.debug(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Labels Categorical: ' + str(res.encoded_labels_categorical))

            self.embedding_params = EmbeddingParams(
                x=res.padded_encoded_docs,
                x_original=res.original_docs,
                y=np.array(res.encoded_labels),
                y_original=res.y_original,
                x_one_hot_dict=res.x_one_hot_dict,
                y_one_hot_dict=res.y_one_hot_dict,
                max_sent_len=res.max_x_length,
                max_label_val=max(res.encoded_labels),
                vocab_size=res.vocabulary_dimension)
            Log.info(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Converted ' +
                str(len(self.embedding_params.x)) +
                ' rows padded docs. Max sentence length = ' +
                str(self.embedding_params.max_sent_len) +
                ', max label value = ' +
                str(self.embedding_params.max_label_val) +
                ', vocabulary size = ' +
                str(self.embedding_params.vocab_size) + ', x one hot dict: ' +
                str(self.embedding_params.x_one_hot_dict))
            Log.debugdebug(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Original docs:\n\r' +
                str(self.embedding_params.x_original) +
                '\n\rEncoded padded docs\n\r:' + str(self.embedding_params.x) +
                '\n\rOriginal labels\n\r' +
                str(self.embedding_params.y_original) +
                '\n\rEncoded labels\n\r' + str(self.embedding_params.y))
        except Exception as ex_embed:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Error converting to training text to embed params: ' + str(ex_embed)
            Log.warning(errmsg)
            # Don't raise error
            # raise Exception(errmsg)

        return (self.df_training_data, self.embedding_params)
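For readers unfamiliar with the padded-docs representation built above, here is a plain-Python sketch (not the nwae TxtTransform API) of what "padded encoded docs", "encoded labels" and "vocabulary size" mean; the sample sentences and labels are made up.

docs   = ['deposit money', 'withdraw money please', 'hello']
labels = ['account', 'account', 'greeting']

# Word index; 0 is reserved for padding
vocab = {w: i + 1 for i, w in enumerate(sorted({w for d in docs for w in d.split()}))}
max_sent_len = max(len(d.split()) for d in docs)

# Encode each doc as word indices, right-padded with 0 to a common length
padded_encoded_docs = [
    [vocab[w] for w in d.split()] + [0] * (max_sent_len - len(d.split()))
    for d in docs
]
# Encode string labels as integers
label_index = {lb: i for i, lb in enumerate(sorted(set(labels)))}
encoded_labels = [label_index[lb] for lb in labels]

print(padded_encoded_docs)   # [[1, 3, 0], [5, 3, 4], [2, 0, 0]]
print(encoded_labels)        # [0, 0, 1]
print(len(vocab) + 1)        # vocabulary size including the padding index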
Example #25
0
class LangCharacters(object):

    encoding = 'utf-8'

    def __init__(self, encoding='utf-8'):
        self.encoding = encoding
        return

    #
    # Latin
    #

    # Latin Unicode Block as 'int' list
    UNICODE_BLOCK_ORDINAL_LATIN_BASIC = tuple( range(0x0041, 0x005A+1, 1) ) +\
                                        tuple( range(0x0061, 0x007A+1, 1) )
    # Convert to Python Unicode Type list
    UNICODE_BLOCK_LATIN_BASIC = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_LATIN_BASIC])
    # Can be used interchangeably
    UNICODE_BLOCK_LATIN_AZ = UNICODE_BLOCK_LATIN_BASIC

    # Latin Extended
    UNICODE_BLOCK_ORDINAL_LATIN_EXTENDED = tuple( range(0x00C0, 0x00F6+1, 1) ) +\
                                           tuple( range(0x00F8, 0x01BF+1, 1) ) +\
                                           tuple( range(0x01C4, 0x024F+1, 1) )
    UNICODE_BLOCK_LATIN_EXTENDED = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_LATIN_EXTENDED])

    # All Latin
    UNICODE_BLOCK_ORDINAL_LATIN_ALL = UNICODE_BLOCK_ORDINAL_LATIN_BASIC + UNICODE_BLOCK_ORDINAL_LATIN_EXTENDED
    UNICODE_BLOCK_LATIN_ALL = UNICODE_BLOCK_LATIN_BASIC + UNICODE_BLOCK_LATIN_EXTENDED

    # Just Latin specific to Vietnamese (actually, also French, Spanish, etc.)
    # It is actually a subset of the Latin Extended
    UNICODE_BLOCK_LATIN_VIETNAMESE =\
        tuple('ăâàằầảẳẩãẵẫáắấạặậêèềẻểẽễéếẹệìỉĩíịôơòồờỏổởõỗỡóốớọộợưùừủửũữúứụựđýỳỷỹỵ')
    # Can be used interchangeably
    UNICODE_BLOCK_LATIN_VI = UNICODE_BLOCK_LATIN_VIETNAMESE
    UNICODE_BLOCK_LATIN_VI_AZ = UNICODE_BLOCK_LATIN_VI + UNICODE_BLOCK_LATIN_AZ

    #
    # CJK
    #
    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS = tuple(
        range(0x4E00, 0x9FFF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS] )

    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_A = tuple(
        range(0x3400, 0x4DBF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_A =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_A] )

    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_B = tuple(
        range(0x20000, 0x2A6DF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_B =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_B] )

    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_C = tuple(
        range(0x2A700, 0x2B73F + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_C =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_C] )

    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D = tuple(
        range(0x2B740, 0x2B81F + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_D =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D] )

    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_E = tuple(
        range(0x2B820, 0x2CEAF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_E = \
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_E] )

    UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS = tuple(
        range(0xF900, 0xFAFF + 1, 1))
    UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS = \
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS] )

    UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP = tuple(
        range(0x2F800, 0x2FA1F + 1, 1))
    UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP = \
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP] )

    UNICODE_BLOCK_ORDINAL_CJK = UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS + UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_A +\
                        UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS +\
                        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_B + UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_C +\
                        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D + UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D +\
                        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_E +\
                        UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP
    UNICODE_BLOCK_CJK = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK])
    # This UNICODE_BLOCK_CJK is not a unique set, there are character repeats
    # import collections
    # c = collections.Counter(UNICODE_BLOCK_CJK)
    # char_repeats = [x for x in c.keys() if c[x]>1]
    # char_repeats_unicode = [hex(ord(x)) for x in char_repeats]
    # print(char_repeats)
    # print(char_repeats_unicode)

    # TODO
    #    Some interesting notes below
    #     Case 1: Simplified Chinese takes precedence (all characters in simplified Chinese are surely "simplified")
    #       Characters that were historically never simplified, and are still used in "traditional" Hanja/Kanji/Chinese:
    #          hanzidentifier.is_simplified('入') = True
    #          hanzidentifier.is_simplified('口') = True
    #       meaning there is no traditional version of these characters at all.
    #       For example, in Japan you will see the Kanji '入口' (entrance) everywhere, which is the same in
    #       simplified Chinese (China/Malaysia/Singapore), traditional Chinese (Taiwan/HK) and Hanja (Hangul 입구),
    #       with exactly the same meaning.
    #       This means we cannot use these Unicode blocks alone to decide the language, as Japanese Kanji, Hanja,
    #       and simplified/traditional Chinese will all point to "simplified"
    #     Case 2: Combination vs Individual Characters
    #       Take the traditional word '辭退' (citui), and the simplified version '辞退'. If this is fed into code,
    #          hanzidentifier.is_simplified('辭退') = False
    #          hanzidentifier.is_simplified('辞退') = True
    #       But the interesting thing is the 2nd character '退' is the same in both traditional and simplified
    #          hanzidentifier.is_simplified('退') = True
    #       So this means that without first "tokenizing" the sentence, there is no way to tell.
    #       But this is chicken & egg, how to tokenize without first knowing the language?
    #       However if every character in a text is labeled "simplified", then it should be simplified Chinese
    #       & nothing else. But even this is not applicable to very short sentences like '入口'.
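    #     A hedged sketch of the heuristic hinted at above (hypothetical helper, not used in this class):
    #       every CJK character testing as "simplified" is strong, but not conclusive, evidence of simplified Chinese.
    #
    #       def looks_like_simplified_chinese(text):
    #           cjk_chars = [c for c in text if c in LangCharacters.UNICODE_BLOCK_CJK]
    #           return len(cjk_chars) > 0 and all(hz.is_simplified(c) for c in cjk_chars)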
    # Looping over 80k symbols is fast enough, no need to worry
    try:
        UNICODE_BLOCK_ORDINAL_CJK_SIMPLIFIED = tuple(
            [u for u in UNICODE_BLOCK_ORDINAL_CJK if hz.is_simplified(chr(u))])
        UNICODE_BLOCK_CJK_SIMPLIFIED = tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_SIMPLIFIED])

        UNICODE_BLOCK_ORDINAL_CJK_TRADITIONAL = tuple([
            u for u in UNICODE_BLOCK_ORDINAL_CJK
            if not hz.is_simplified(chr(u))
        ])
        UNICODE_BLOCK_CJK_TRADITIONAL = tuple([
            chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_TRADITIONAL
        ])
        # Taking set difference will result in smaller set due to repeats in CJK
        # UNICODE_BLOCK_CJK_TRADITIONAL = tuple( set(UNICODE_BLOCK_CJK) - set(UNICODE_BLOCK_CJK_SIMPLIFIED) )
    except Exception as ex:
        Log.warning(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': Cannot get CJK simplified/traditional: ' + str(ex))
        UNICODE_BLOCK_ORDINAL_CJK_SIMPLIFIED = None
        UNICODE_BLOCK_CJK_SIMPLIFIED = None

        UNICODE_BLOCK_ORDINAL_CJK_TRADITIONAL = None
        UNICODE_BLOCK_CJK_TRADITIONAL = None

    #
    # Cyrillic
    UNICODE_BLOCK_ORDINAL_CYRILLIC = tuple(range(0x0400, 0x04FF + 1, 1))
    UNICODE_BLOCK_CYRILLIC = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CYRILLIC])

    # Cyrillic Supplement
    # (Cyrillic letters for writing several minority languages,
    # including Abkhaz, Kurdish, Komi, Mordvin, Aleut, Azerbaijani,
    # and Jakovlev's Chuvash orthography)
    UNICODE_BLOCK_SUPPL_CYRILLIC = tuple(range(0x0500, 0x052F + 1, 1))
    UNICODE_BLOCK_SUPPL_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_SUPPL_CYRILLIC])

    # Cyrillic Extended-A
    # (Cyrillic letters used in Old Church Slavonic texts)
    UNICODE_BLOCK_EXT_A_CYRILLIC = tuple(range(0x2DE0, 0x2DFF + 1, 1))
    UNICODE_BLOCK_EXT_A_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_EXT_A_CYRILLIC])

    # Cyrillic Extended-B
    # (Cyrillic characters for writing Old Cyrillic and Old Abkhazian,
    # and combining numeric signs)
    UNICODE_BLOCK_EXT_B_CYRILLIC = tuple(range(0xA640, 0xA69F + 1, 1))
    UNICODE_BLOCK_EXT_B_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_EXT_B_CYRILLIC])

    # Cyrillic Extended-C
    # (Cyrillic numerals)
    UNICODE_BLOCK_EXT_C_CYRILLIC = tuple(range(0x1C80, 0x1C8F + 1, 1))
    UNICODE_BLOCK_EXT_C_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_EXT_C_CYRILLIC])

    # Cyrillic Phonetic Extensions
    UNICODE_BLOCK_PHON_CYRILLIC = tuple(range(0x1D2B, 0x1D78 + 1, 1))
    UNICODE_BLOCK_EXT_PHON_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_PHON_CYRILLIC])

    # Cyrillic Combining Half Marks
    # (Unicode block containing diacritic mark parts for spanning multiple characters)
    UNICODE_BLOCK_HALF_MARKS_CYRILLIC = tuple(range(0xFE2E, 0xFE2F + 1, 1))
    UNICODE_BLOCK_HALF_MARKS_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_HALF_MARKS_CYRILLIC])

    # UNICODE block for ALL cyrillic characters
    UNICODE_BLOCK_CYRILLIC_ALL = UNICODE_BLOCK_CYRILLIC + UNICODE_BLOCK_HALF_MARKS_CYR + \
                                 UNICODE_BLOCK_EXT_PHON_CYR + UNICODE_BLOCK_EXT_C_CYR + \
                                 UNICODE_BLOCK_EXT_B_CYR + UNICODE_BLOCK_EXT_A_CYR + \
                                 UNICODE_BLOCK_SUPPL_CYR

    #
    # Hangul
    #
    # This is the 11xx jamo code block; when a computer sees a sequence of these jamos, it combines
    # them into Hangul syllables (or just Hangul) from the block below.
    # print(chr(0x110c) + chr(0x1161) + chr(0x1106) + chr(0x1169))
    UNICODE_BLOCK_ORDINAL_HANGUL_JAMO = tuple(range(0x1100, 0x11FF + 1, 1))
    UNICODE_BLOCK_HANGUL_JAMO = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_HANGUL_JAMO])
    # This is the 31xx hangul compatibility jamo block; when a computer sees a sequence of these jamos,
    # they are printed individually, without being combined into Hangul syllables
    # print(chr(0x3148) + chr(0x314f) + chr(0x3141) + chr(0x3157))
    UNICODE_BLOCK_ORDINAL_HANGUL_COMPATIBILITY_JAMO = tuple(
        range(0x3130, 0x318F + 1, 1))
    UNICODE_BLOCK_HANGUL_COMPATIBILITY_JAMO = tuple([
        chr(ordinal)
        for ordinal in UNICODE_BLOCK_ORDINAL_HANGUL_COMPATIBILITY_JAMO
    ])
    # This block is for Hangul syllables (or just Hangul), e.g. '한', '굴', '자', '모',
    # whereas the blocks above are for single 자모 (字母, i.e. alphabet letters).
    UNICODE_BLOCK_ORDINAL_HANGUL_SYLLABLE = tuple(range(0xAC00, 0xD7AF + 1, 1))
    UNICODE_BLOCK_HANGUL_SYLLABLE = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_HANGUL_SYLLABLE])

    UNICODE_BLOCK_HANGUL_ALL_INCLUDING_SYLLABLE = \
        UNICODE_BLOCK_HANGUL_JAMO + UNICODE_BLOCK_HANGUL_COMPATIBILITY_JAMO + UNICODE_BLOCK_HANGUL_SYLLABLE

    #
    # Japanese Hiragana/Katakana
    #
    UNICODE_BLOCK_ORDINAL_HIRAGANA = tuple(range(0x3040, 0x309F + 1, 1))
    UNICODE_BLOCK_HIRAGANA = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_HIRAGANA])
    UNICODE_BLOCK_ORDINAL_KATAKANA = tuple(range(0x30A0, 0x30FF + 1, 1))
    UNICODE_BLOCK_KATAKANA = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_KATAKANA])

    UNICODE_BLOCK_HIRAGANA_KATAKANA = UNICODE_BLOCK_HIRAGANA + UNICODE_BLOCK_KATAKANA
    UNICODE_BLOCK_HIRAGANA_KATAKANA_KANJI = \
        UNICODE_BLOCK_HIRAGANA + UNICODE_BLOCK_KATAKANA + UNICODE_BLOCK_CJK

    #
    # Thai
    # From http://sites.psu.edu/symbolcodes/languages/asia/thai/thaichart/
    #
    UNICODE_BLOCK_ORDINAL_THAI_CONSONANTS = tuple(range(0x0E01, 0x0E2E + 1, 1))
    UNICODE_BLOCK_THAI_CONSONANTS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_CONSONANTS])

    # The character ' ็' or chr(0x0E47) is unique, a consonant must appear before it, and another consonant after it
    # ['ะ', 'ั', 'า', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', '็']
    UNICODE_BLOCK_ORDINAL_THAI_VOWELS_AFTER_CONSONANT = \
        tuple( range(0x0E30, 0x0E3A+1, 1) ) + tuple( range(0x0E47, 0x0E47+1, 1) )
    UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT =\
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_VOWELS_AFTER_CONSONANT] )

    # The character ' ็' or chr(0x0E47) is unique, a consonant must appear before it, and another consonant after it
    # ['เ', 'แ', 'โ', 'ใ', 'ไ', '็']
    UNICODE_BLOCK_ORDINAL_THAI_VOWELS_BEFORE_CONSONANT = \
        tuple( range(0x0E40, 0x0E44+1, 1) ) + tuple( range(0x0E47, 0x0E47+1, 1) )
    UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT = \
        tuple( [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_VOWELS_BEFORE_CONSONANT] )

    # Tone marks cannot be start of word (same with "vowels-after-consonant")
    # ['่', '้', '๊', '๋']
    UNICODE_BLOCK_ORDINAL_THAI_TONEMARKS = tuple(range(0x0E48, 0x0E4B + 1, 1))
    UNICODE_BLOCK_THAI_TONEMARKS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_TONEMARKS])

    UNICODE_BLOCK_ORDINAL_THAI_NUMBERS = tuple(range(0x0E50, 0x0E59 + 1, 1))
    UNICODE_BLOCK_THAI_NUMBERS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_NUMBERS])

    UNICODE_BLOCK_ORDINAL_THAI_SIGNS_PUNCTUATIONS = tuple( range(0x0E2F, 0x0E2F+1, 1) ) +\
                                                    tuple( range(0x0E45, 0x0E46+1, 1) ) +\
                                                    tuple( range(0x0E4C, 0x0E4F+1, 1) ) +\
                                                    tuple( range(0x0E5A, 0x0E5B+1, 1) )
    UNICODE_BLOCK_THAI_SIGNS_PUNCTUATIONS = tuple([
        chr(ordinal)
        for ordinal in UNICODE_BLOCK_ORDINAL_THAI_SIGNS_PUNCTUATIONS
    ])

    UNICODE_BLOCK_THAI = UNICODE_BLOCK_THAI_CONSONANTS +\
                         UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT +\
                         UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT +\
                         UNICODE_BLOCK_THAI_TONEMARKS +\
                         UNICODE_BLOCK_THAI_NUMBERS +\
                         UNICODE_BLOCK_THAI_SIGNS_PUNCTUATIONS

    #
    # Punctuations, etc.
    #
    UNICODE_BLOCK_WORD_SEPARATORS =\
        tuple(u' ,!.?()[]:;"«»\'') + tuple(u'?。,()') + tuple([chr(0xFF0C),chr(0xFF01),chr(0xFF0E),chr(0xFF1F)])

    UNICODE_BLOCK_SENTENCE_SEPARATORS =\
        tuple(u' !.?') + tuple([chr(0xFF01),chr(0xFF0E),chr(0xFF1F)])
    #
    # Numbers: normal Latin and CJK halfwidth/fullwidth
    #
    UNICODE_BLOCK_ORDINAL_NUMBERS = tuple(range(
        0x0030, 0x0039 + 1, 1)) + tuple(range(0xFF10, 0xFF19 + 1, 1))
    UNICODE_BLOCK_NUMBERS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_NUMBERS])

    #
    # Punctuations Only (Half-Width & Full-Width Forms)
    #
    UNICODE_BLOCK_ORDINAL_PUNCTUATIONS = tuple(range(0x0000, 0x007F+1, 1)) +\
                                         tuple(range(0x2000, 0x206F+1, 1)) +\
                                         tuple(range(0x3000, 0x303F+1, 1)) +\
                                         tuple(range(0xFF00, 0xFF0F+1, 1)) +\
                                         tuple(range(0xFF1A, 0xFF20+1, 1)) +\
                                         tuple(range(0xFF3B, 0xFF40+1, 1)) +\
                                         tuple(range(0xFF5B, 0xFF65+1, 1))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_PUNCTUATIONS])
    # Remove non-punctuations from original list of punctuations
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) - set(UNICODE_BLOCK_LATIN_ALL))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) - set(UNICODE_BLOCK_WORD_SEPARATORS))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) -
        set(UNICODE_BLOCK_SENTENCE_SEPARATORS))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) - set(UNICODE_BLOCK_NUMBERS))

    #
    # Get the valid Unicode Block for a given language
    #
    @staticmethod
    def get_language_charset(lang):
        # lang_std = lf.LangFeatures.map_to_lang_code_iso639_1(
        #     lang_code = lang
        # )
        lang_std = lf.LangFeatures.map_to_correct_lang_code_iso_639_1_or_3(
            lang_code=lang)

        if lang_std in [lf.LangFeatures.LANG_EN, lf.LangFeatures.LANG_VI]:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        elif lang == lf.LangFeatures.LANG_ZH:
            return LangCharacters.UNICODE_BLOCK_CJK
        elif lang == lf.LangFeatures.LANG_TH:
            return LangCharacters.UNICODE_BLOCK_THAI
        elif lang == lf.LangFeatures.LANG_KO:
            return LangCharacters.UNICODE_BLOCK_HANGUL_ALL_INCLUDING_SYLLABLE
        elif lang == lf.LangFeatures.LANG_JA:
            return LangCharacters.UNICODE_BLOCK_HIRAGANA_KATAKANA_KANJI
        else:
            return []

    @staticmethod
    def get_alphabet_charset(alphabet):
        #
        # Latin Type Blocks (English, Spanish, French, Vietnamese, etc.)
        # TODO Break into other language variants (done)
        #
        if alphabet == lf.LangFeatures.ALPHABET_LATIN_AZ:
            return LangCharacters.UNICODE_BLOCK_LATIN_AZ
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_VI:
            # UNICODE_BLOCK_LATIN_VI is an alias of UNICODE_BLOCK_LATIN_VIETNAMESE, so return it once
            return LangCharacters.UNICODE_BLOCK_LATIN_VIETNAMESE
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_VI_AZ:
            return LangCharacters.UNICODE_BLOCK_LATIN_VI + LangCharacters.UNICODE_BLOCK_LATIN_AZ
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL

        # Latin type blocks:
        # French;
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_FR:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        # Czech;
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_CZECH:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        # German
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_GERMAN:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        # Spanish
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_SPANISH:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        # English
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_ENG:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL

        # CJK Type Blocks (Korean, Chinese, Japanese)
        # TODO Break into Chinese variants (simplified, traditional, etc.),
        #   Japanese, Hanja, etc. (done)

        elif alphabet == lf.LangFeatures.ALPHABET_HANGUL:
            return LangCharacters.UNICODE_BLOCK_HANGUL_ALL_INCLUDING_SYLLABLE
        elif alphabet == lf.LangFeatures.ALPHABET_CJK:
            return LangCharacters.UNICODE_BLOCK_CJK
        elif alphabet == lf.LangFeatures.ALPHABET_CJK_SIMPLIFIED:
            return LangCharacters.UNICODE_BLOCK_CJK_SIMPLIFIED
        elif alphabet == lf.LangFeatures.ALPHABET_CJK_TRADITIONAL:
            return LangCharacters.UNICODE_BLOCK_CJK_TRADITIONAL
        elif alphabet == lf.LangFeatures.ALPHABET_HIRAGANA_KATAKANA:
            return LangCharacters.UNICODE_BLOCK_HIRAGANA_KATAKANA
        elif alphabet == lf.LangFeatures.ALPHABET_JAPANESE:
            return LangCharacters.UNICODE_BLOCK_HIRAGANA_KATAKANA_KANJI
        #
        # Cyrillic Blocks (Russian, Belarusian, Ukrainian, etc.)
        #
        elif alphabet == lf.LangFeatures.ALPHABET_CYRILLIC:
            return LangCharacters.UNICODE_BLOCK_CYRILLIC_ALL
        #
        # Other Blocks
        #
        elif alphabet == lf.LangFeatures.ALPHABET_THAI:
            return LangCharacters.UNICODE_BLOCK_THAI

    @staticmethod
    def get_alphabet_charset_all():
        alphabet_dict = {}
        for alp in lf.LangFeatures.ALPHABETS_ALL:
            alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet=alp)
        return alphabet_dict

    #
    # Given a string with allowed Unicode block, returns a string with only the allowed Unicode values
    #
    def filter_allowed_characters(self,
                                  unicode_list,
                                  s,
                                  include_word_separators=True,
                                  include_sentence_separators=True,
                                  include_numbers=True,
                                  include_punctuations=True):
        # Just in case user passes in the immutable tuples
        allowed_list = list(unicode_list).copy()

        if include_word_separators:
            allowed_list += LangCharacters.UNICODE_BLOCK_WORD_SEPARATORS
        if include_sentence_separators:
            allowed_list += LangCharacters.UNICODE_BLOCK_SENTENCE_SEPARATORS
        if include_numbers:
            allowed_list += [c for c in '0123456789']
        if include_punctuations:
            allowed_list += LangCharacters.UNICODE_BLOCK_PUNCTUATIONS

        str_new = [c for c in s if (c in allowed_list)]
        return ''.join(str_new)
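    # Hypothetical usage (illustration only): keep only Thai characters, plus the default
    # separators and digits, from a mixed string:
    #     LangCharacters().filter_allowed_characters(
    #         unicode_list = LangCharacters.UNICODE_BLOCK_THAI,
    #         s            = 'สวัสดี ABC 123',
    #     )
    # The Latin letters 'ABC' are dropped, while the Thai text, spaces and digits remain.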

    #
    # This function returns whether the written language is normal Vietnamese (a mix of basic and extended Latin)
    # or purely basic Latin (it is common for Vietnamese writers to leave out all the diacritics and use only
    # basic Latin)
    #
    def get_vietnamese_type(self, s):
        # In Python 3, str is already unicode, so no conversion is needed
        # (old Python 2 code: if type(s) != unicode: s = unicode(s, encoding=self.encoding))

        # First we remove the punctuations, numbers, etc.
        remove_block = LangCharacters.UNICODE_BLOCK_PUNCTUATIONS + LangCharacters.UNICODE_BLOCK_NUMBERS + \
                       LangCharacters.UNICODE_BLOCK_WORD_SEPARATORS + LangCharacters.UNICODE_BLOCK_SENTENCE_SEPARATORS
        ss = u''
        for i in range(0, len(s), 1):
            if s[i] not in remove_block:
                ss = ss + s[i]

        # Guard against an empty result (e.g. the input was all punctuations/numbers)
        if len(ss) == 0:
            return "other"

        is_latin_basic_count = 0
        is_latin_extended_viet_count = 0
        for i in range(0, len(ss), 1):
            latin_basic = ss[i] in LangCharacters.UNICODE_BLOCK_LATIN_BASIC
            latin_extended = ss[
                i] in LangCharacters.UNICODE_BLOCK_LATIN_EXTENDED
            # print ( ss[i] + " Latin Basic = " + str(latin_basic) + ", Latin Extended = " + str(latin_extended) )
            is_latin_basic_count += 1 * latin_basic
            is_latin_extended_viet_count += 1 * latin_extended

        latin_basic_percent = float(is_latin_basic_count / float(len(ss)))
        latin_extended_viet_percent = float(is_latin_extended_viet_count /
                                            float(len(ss)))

        if (latin_basic_percent + latin_extended_viet_percent) > 0.5:
            if latin_basic_percent > 0.98:
                return "latin.basic"
            else:
                if latin_extended_viet_percent > 0.1:
                    return "latin.viet"
                else:
                    return "latin.mix"
        else:
            return "other"

    #
    # Converts a string into a single number, for cases where dealing with numbers is more convenient than strings.
    # This single number is not necessarily unique.
    #
    def convert_string_to_number(self, s, verbose=0):

        lang = None
        syllable_end = [False] * len(s)

        if s[0] in self.UNICODE_BLOCK_THAI and len(s) > 1:
            # For Thai, we don't include the last character of each syllable, since that character is the one
            # most prone to spelling mistakes, e.g. ส-วัด-ดี (สวัสดี) or ปัน-หา (ปัญหา).
            # Skipping it also gives us a crude tolerance to Thai spelling errors.
            # Characters that can never be start of syllable
            not_start_syllable_char = self.UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT + \
                                      self.UNICODE_BLOCK_THAI_TONEMARKS
            lang = lf.LangFeatures.LANG_TH
            char_prev = s[0]
            for i in range(1, len(s) - 1, 1):
                char_prev = s[i - 1]
                char_cur = s[i]

                # This character can never be start of syllable
                if char_cur not in not_start_syllable_char:
                    continue

                char_next = s[i + 1]
                # Case of 'เดือน', 'เมื่อ', 'เลข', etc.
                if char_cur in self.UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT:
                    syllable_end[i - 1] = True
                elif char_cur in self.UNICODE_BLOCK_THAI_CONSONANTS:
                    # Case of 'การ', 'เดือน', 'ดารา' etc.
                    if (char_next in self.UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT) and \
                            (char_prev not in self.UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT):
                        syllable_end[i - 1] = True
                    # Case of 'งง', 'สด', etc.
                    # elif ( char_prev in LangCharacters.UNICODE_BLOCK_THAI_TONEMARKS ):
                    #    syllable_end[i-1] = True

            # Last character is always end of syllable
            syllable_end[len(s) - 1] = True

            if verbose >= 1:
                sylsepstring = ''
                for i in range(0, len(s), 1):
                    sylsepstring = sylsepstring + s[i]
                    if syllable_end[i]:
                        sylsepstring = sylsepstring + ' '
                print(sylsepstring)

        x = 0
        index = 1
        # A string "abc" will be calculated as (97 + 2*98 + 3*99), Unicode for 'a' is 97, 'b' is 98, 'c' is 99
        for i in range(0, len(s), 1):
            # We don't include a syllable ending consonant for Thai in the measure, since this character is prone
            # to spelling mistakes
            ignore = False
            if lang == lf.LangFeatures.LANG_TH:
                if s[i] in LangCharacters.UNICODE_BLOCK_THAI_CONSONANTS and syllable_end[
                        i]:
                    # print('Ignore ' + s[i])
                    ignore = True
            if not ignore:
                un = ord(s[i])
                # print('Index ' + str(index) + ', ' + s[i] + ', ' + str(un))
                x = x + index * un
                index = index + 1

        return x
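A quick sanity check of the weighted-ordinal formula described in the comments above, assuming the LangCharacters class is importable from this module:

lc = LangCharacters()
# 'abc' -> 1*ord('a') + 2*ord('b') + 3*ord('c') = 97 + 196 + 297 = 590
print(lc.convert_string_to_number('abc'))   # 590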
Example #26
0
import nwae.lang.nlp.SynonymList as slist
from nwae.lang.preprocessing.BasicPreprocessor import BasicPreprocessor
from nwae.utils.Log import Log
from inspect import currentframe, getframeinfo
# Library to convert Traditional Chinese to Simplified Chinese
import hanziconv as hzc
import nwae.utils.Profiling as prf
import re
try:
    """
    Japanese word segmentation
    """
    import nagisa
except Exception as ex:
    Log.warning(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Error importing libraries for japanese tokenization: ' + str(ex)
    )
try:
    """
    Korean word segmentation
    There are many problems with this library kkma, firstly it requires external JVM, it is quite slow
    and will also split wrong (e.g. '탈레반이' will be split wrongly to '탈', '레', '반이') or not in our
    desired application way (e.g. '장악한' split to '장악', '하', 'ㄴ')
    We should write our own, korean language is quite systematic, and we could control the following
      - by default a whole word '탈레반이' if not recognized should just keep as is, and split out only
        particles like '이'
      - naturally in most application the word '장악한' (verb) should not be split to
          ('장악', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD')
        and only stemming or lemmatization should bring it to '장악하다' (verb) and not '장악' (noun)
    """
    from konlpy.tag import Kkma
Example #27
0
import nwae.ml.networkdesign.NetworkDesign as nwdesign
import pandas as pd
import numpy as np
from datetime import datetime
import os
import sys

# TODO Don't rely on buggy TF/Keras, write our own
try:
    from keras.utils import to_categorical
    from tensorflow.keras.models import load_model
    # This one will not work in a multi-threaded environment
    #from keras.models import load_model
except Exception as ex_keras:
    Log.warning(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Exception importing Keras modules: ' + str(ex_keras)
    )


class NnDenseModel(ModelInterface):

    MODEL_NAME = 'nn_dense'

    CONFIDENCE_LEVEL_SCORES_DEFAULT = {1: 10, 2: 15, 3: 20, 4:30, 5:40}

    def __init__(
            self,
            # NN layer configurations, etc.
            model_params,
            # Unique identifier to identify this set of trained data+other files after training
            identifier_string,
Example #28
0
    def convert_product_to_attributes(
            self,
            # Dataframe with customers and products
            df_product,
            # Columns that identify unique customers
            unique_human_key_columns,
            unique_product_key_column,
            # Either product price or quantity
            unique_product_value_column,
            normalize_method = NORMALIZE_METHOD_NONE,
            # Reduce the number of attribute columns
            max_attribute_columns = 0,
            # By quantile: filter out products below this quantile
            filter_out_quantile_byvalue = 0.0,
            filter_out_quantile_bycount = 0.0,
            # Before any processing
            transform_prd_values_method = TRANSFORM_PRD_VALUES_METHOD_NONE,
            transform_logbase           = 10.0,
            transform_after_aggregate   = True, # By default transform only AFTER aggregation
            # Careful here: this unknown product will (likely) be assigned a zero vector,
            # so with the "euclidean" metric it will appear "close" to other vectors
            add_unknown_product = False,
    ):
        transform_before = self.TRANSFORM_PRD_VALUES_METHOD_NONE
        if not transform_after_aggregate:
            transform_before = transform_prd_values_method
        df_prd_agg, unique_product_list, unique_human_list = self.aggregate_products(
            df_product                  = df_product,
            unique_human_key_columns    = unique_human_key_columns,
            unique_product_key_column   = unique_product_key_column,
            unique_product_value_column = unique_product_value_column,
            transform_prd_values_method = transform_before,
            transform_logbase           = transform_logbase,
        )

        #
        # This step can lead to some problems, since some customers will end up with an all-zero vector
        #
        is_reduced = False
        unique_remaining_products = None
        np_remaining_attributes = None
        if (max_attribute_columns > 0) | (filter_out_quantile_byvalue > 0.0) | (filter_out_quantile_bycount > 0.0):
            byval_unique_top_products_by_order, byval_unique_remaining_products = self.find_top_products(
                df_product                  = df_product,
                unique_product_key_column   = unique_product_key_column,
                unique_product_value_column = unique_product_value_column,
                # Do max filtering later
                top_x                       = 0,
                filter_out_quantile         = filter_out_quantile_byvalue,
                aggregate_method            = 'sum',
            )
            bycnt_unique_top_products_by_order, bycnt_unique_remaining_products = self.find_top_products(
                df_product                  = df_product,
                unique_product_key_column   = unique_product_key_column,
                unique_product_value_column = unique_product_value_column,
                # Do max filtering later
                top_x                       = 0,
                filter_out_quantile         = filter_out_quantile_bycount,
                aggregate_method            = 'count',
            )
            filtered_out_products_by_count_condition = [
                prd for prd in byval_unique_top_products_by_order
                if prd not in bycnt_unique_top_products_by_order
            ]
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Products filtered out by 2nd count condition ' + str(filtered_out_products_by_count_condition)
            )
            # Remove products that do not satisfy the second (count) condition
            unique_top_products_by_order = [
                prd for prd in byval_unique_top_products_by_order
                if prd in bycnt_unique_top_products_by_order
            ]
            unique_remaining_products = filtered_out_products_by_count_condition + byval_unique_remaining_products
            assert len(unique_top_products_by_order) + len(unique_remaining_products) == len(unique_product_list), \
                'Length of unique top products and remaining products must equal length of product list'
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Remaining ' + str(len(unique_remaining_products))
                + ' least products: ' + str(unique_remaining_products)
            )
            if max_attribute_columns > 0:
                max_final = min(len(unique_top_products_by_order), max_attribute_columns)
                if max_final < len(unique_top_products_by_order):
                    filtered_out_by_max_attribute_products = unique_top_products_by_order[max_final:]
                    unique_top_products_by_order = unique_top_products_by_order[0:max_final]
                    unique_remaining_products = filtered_out_by_max_attribute_products + unique_remaining_products
                    assert len(unique_top_products_by_order) + len(unique_remaining_products) == len(unique_product_list), \
                        'Length of unique top products and remaining products must equal length of product list'

            # Change the removed product names to one name
            def change_name(prdname):
                if prdname in unique_remaining_products:
                    return self.COLNAME_PRODUCTS_NOT_INCLUDED
                else:
                    return prdname

            df_prd_agg[unique_product_key_column] = df_prd_agg[unique_product_key_column].apply(func=change_name)
            unique_product_list = unique_top_products_by_order + [self.COLNAME_PRODUCTS_NOT_INCLUDED]
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': After truncation, total unique products as attributes = ' + str(len(unique_product_list))
                + '. Products: ' + str(unique_product_list)
            )
            # Need to regroup again, since each pair member-COLNAME_PRODUCTS_NOT_INCLUDED will appear on multiple lines
            shape_ori = df_prd_agg.shape
            df_prd_agg = df_prd_agg.groupby(
                by=unique_human_key_columns + [unique_product_key_column],
                as_index=False,
            ).sum()
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': After second round grouping by human/product columns, from shape ' + str(shape_ori)
                + ' to new shape ' + str(df_prd_agg.shape)
            )

        if transform_after_aggregate:
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': After aggregation, transform product values by "' + str(transform_prd_values_method) + '"'
            )
            if transform_prd_values_method == self.TRANSFORM_PRD_VALUES_METHOD_UNITY:
                df_prd_agg[unique_product_value_column] = 1.0 * (df_prd_agg[unique_product_value_column] > 0.0)
            elif transform_prd_values_method == self.TRANSFORM_PRD_VALUES_METHOD_LOG:
                df_prd_agg[unique_product_value_column] = np.log(1 + df_prd_agg[unique_product_value_column]) / np.log(transform_logbase)
            else:
                pass

        """Датафрейм лишь с столбцом(ами) покупателей"""
        df_converted = df_prd_agg[unique_human_key_columns]
        df_converted = df_converted.drop_duplicates()
        Log.debugdebug(df_converted)
        assert len(df_converted) == len(unique_human_list), \
            'Length of final dataframe ' + str(len(df_converted)) + ' must be equal ' + str(len(unique_human_list))

        # Add the "unknown product"
        if add_unknown_product:
            if self.NAN_PRODUCT in unique_product_list:
                raise Exception(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Name clash with nan product name "' + str(self.NAN_PRODUCT) + '"'
                )
            unique_product_list.append(self.NAN_PRODUCT)

        columns_running = list(df_converted.columns)
        """Продукт за продуктом, создавать столбец продукта"""
        for prd in unique_product_list:
            condition_only_this_product = df_prd_agg[unique_product_key_column] == prd
            df_prd_agg_part = df_prd_agg[condition_only_this_product]
            if len(df_prd_agg_part) == 0:
                Log.warning(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': For product "' + str(prd) + '", there are 0 sales, adding 0 column'
                )
                df_converted[prd] = 0.0
            else:
                columns_keep = unique_human_key_columns + [unique_product_value_column]
                df_prd_agg_part = df_prd_agg_part[columns_keep].reset_index(drop=True)
                Log.debugdebug(prd)
                Log.debugdebug(df_prd_agg_part)
                """Соединить цену/количество продукта с человеком"""
                df_converted = df_converted.merge(
                    df_prd_agg_part,
                    on  = unique_human_key_columns,
                    how = 'left'
                )
                assert len(df_converted) == len(unique_human_list), \
                    'After merge column "' + str(prd) + '" Length of final dataframe ' + str(
                        len(df_converted)) + ' must be equal ' + str(len(unique_human_list))

            """Новые имена столбцев"""
            columns_running = columns_running + [prd]
            df_converted.columns = columns_running
            df_converted[prd] = df_converted[prd].fillna(0.0)
            Log.debugdebug(df_converted)

        assert len(df_converted) == len(unique_human_list), \
            'Length of final dataframe ' + str(len(df_converted)) + ' must be equal ' + str(len(unique_human_list))

        if is_reduced:
            df_converted[self.COLNAME_PRODUCTS_NOT_INCLUDED] = np_remaining_attributes

        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Final human-product attributes shape: ' + str(df_converted.shape)
        )
        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Final mapped human-product attributes: '
        )
        Log.debugdebug(df_converted)

        original_cols = list(df_converted.columns)
        attr_cols = original_cols.copy()
        for col in unique_human_key_columns:
            attr_cols.remove(col)

        df_converted = self.normalize(
            df                = df_converted,
            name_columns      = unique_human_key_columns,
            attribute_columns = attr_cols,
            normalize_method  = normalize_method,
        )

        return df_converted, unique_product_list
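Conceptually, the method above pivots a long purchases table into one row per customer with one column per product. The core aggregation-and-pivot step can be sketched with plain pandas as follows; the column names and data are made up, and none of the filtering, unknown-product or normalization logic is reproduced here.

import pandas as pd

df_product = pd.DataFrame({
    'member_id': [1, 1, 2, 2, 2],
    'product':   ['beer', 'milk', 'beer', 'bread', 'beer'],
    'amount':    [10.0, 3.0, 5.0, 2.0, 7.0],
})
# Aggregate repeated member/product pairs, then pivot products into attribute columns
df_attr = (
    df_product
    .groupby(['member_id', 'product'], as_index=False)['amount'].sum()
    .pivot(index='member_id', columns='product', values='amount')
    .fillna(0.0)
    .reset_index()
)
# One row per member, one column per product, zeros where the member never bought the product
print(df_attr)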
Example #29
0
    def load_text_processor(self):
        try:
            self.load_text_processor_mutex.acquire()
            # Don't load again if the model has not been reloaded since the last time
            if self.model_last_reloaded_counter == self.model.get_model_reloaded_counter(
            ):
                Log.warning(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': Model "' +
                    str(self.identifier_string) +
                    '" not reloading PredictClassTxtProcessor.')
                return

            Log.info(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Model "' +
                str(self.model_name) +
                '" ready. Loading synonym & word lists..')

            self.lang_detect = LangDetect()

            self.predict_class_txt_processor = {}
            for uh in [self.lang_main] + self.lang_additional:
                try:
                    model_features_list = self.model.get_model_features(
                    ).tolist()
                except Exception as ex_feature_list:
                    Log.error(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) +
                        ': Model "' + str(self.model_name) + '" identifier "' +
                        str(self.identifier_string) +
                        '" model feature list empty!')
                    model_features_list = None

                self.predict_class_txt_processor[uh] = TxtPreprocessor(
                    identifier_string=self.identifier_string,
                    dir_path_model=self.dir_path_model,
                    model_features_list=model_features_list,
                    lang=uh,
                    dirpath_synonymlist=self.dirpath_synonymlist,
                    postfix_synonymlist=self.postfix_synonymlist,
                    dir_wordlist=self.dir_wordlist,
                    postfix_wordlist=self.postfix_wordlist,
                    dir_wordlist_app=self.dir_wordlist_app,
                    postfix_wordlist_app=self.postfix_wordlist_app,
                    # TODO For certain languages like English, it is essential to include stopwords,
                    #   but at the same time we must be very careful. If we add manual rules that
                    #   include words like 'it' and 'is', then "It is" could be perfectly valid
                    #   training data that gets excluded wrongly.
                    stopwords_list=None,
                    do_spelling_correction=self.do_spelling_correction,
                    do_word_stemming=self.do_word_stemming,
                    do_profiling=self.do_profiling)

            self.is_all_initializations_done = True
            # Manually update this model last reloaded counter
            self.model_last_reloaded_counter = self.model.get_model_reloaded_counter(
            )
            Log.important(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Model name "' +
                str(self.model_name) + '", identifier "' +
                str(self.identifier_string) +
                '" All initializations done for model "' +
                str(self.identifier_string) + '". Model Reload counter = ' +
                str(self.model_last_reloaded_counter))
        except Exception as ex:
            errmsg = \
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                + ': Model name "' + str(self.model_name) \
                + '", identifier "' + str(self.identifier_string) \
                + '" Exception initializing synonym & word lists: ' + str(ex)
            Log.critical(errmsg)
            raise Exception(errmsg)
        finally:
            self.load_text_processor_mutex.release()
Example #30
0
    def __init__(
            self,
            write_lang_features_to_csv = False
    ):
        #
        # Each language is described by a flag for having an alphabet, its syllable boundary (either a single
        # character as in Chinese, or a space as in Korean), then its word boundary (space).
        # The most NLP-inconvenient languages are obviously those without a word boundary.
        # Columns: Name, Code, Alphabet, CharacterType, SyllableSeparator, SyllableSeparatorType, WordSeparator, WordSeparatorType
        #
        # We need to define our own properties, as even the ISO 15924 specification does not contain them
        #
        # Hangul/CJK Language Family
        #
        try:
            self.PYCLANG = pycountry.languages
        except Exception as ex:
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Cannot load pycountry languages: ' + str(ex)
            )
            self.PYCLANG = None

        lang_index = 0
        lang_ko = {
            self.C_LANG_ID:        self.LANG_KO,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'Hangul',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_HANGUL,
            self.C_HAVE_SYL_SEP:   True,
            # TODO Not really right to say it is char but rather a "syllable_character"
            self.C_SYL_SEP_TYPE:   self.T_CHAR,
            self.C_HAVE_WORD_SEP:  True,
            self.C_WORD_SEP_TYPE:  self.T_SPACE,
            self.C_HAVE_VERB_CONJ: True
        }
        #
        # CJK Alphabet Family
        #
        lang_index += 1
        lang_zh = {
            self.C_LANG_ID:        self.LANG_ZH,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'Chinese',
            self.C_HAVE_ALPHABET:  False,
            self.C_CHAR_TYPE:      self.ALPHABET_CJK,
            self.C_HAVE_SYL_SEP:   True,
            self.C_SYL_SEP_TYPE:   self.T_CHAR,
            self.C_HAVE_WORD_SEP:  False,
            self.C_WORD_SEP_TYPE:  self.T_NONE,
            self.C_HAVE_VERB_CONJ: False
        }
        #
        # Japanese Hiragana/Katakana
        #
        lang_index += 1
        lang_ja = {
            self.C_LANG_ID:        self.LANG_JA,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'Japanese',
            self.C_HAVE_ALPHABET:  False,
            self.C_CHAR_TYPE:      self.ALPHABET_JAPANESE,
            self.C_HAVE_SYL_SEP:   True,
            self.C_SYL_SEP_TYPE:   self.T_CHAR,
            self.C_HAVE_WORD_SEP:  False,
            self.C_WORD_SEP_TYPE:  self.T_NONE,
            self.C_HAVE_VERB_CONJ: True
        }
        #
        # Cyrillic Alphabet Family
        #
        lang_index += 1
        lang_ru = {
            self.C_LANG_ID:        self.LANG_RU,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'Russian',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_CYRILLIC,
            self.C_HAVE_SYL_SEP:   False,
            self.C_SYL_SEP_TYPE:   self.T_NONE,
            self.C_HAVE_WORD_SEP:  True,
            self.C_WORD_SEP_TYPE:  self.T_SPACE,
            self.C_HAVE_VERB_CONJ: True
        }

        #
        # Thai Alphabet Family
        #

        lang_index += 1
        lang_th = {
            self.C_LANG_ID:        self.LANG_TH,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'Thai',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_THAI,
            self.C_HAVE_SYL_SEP:   False,
            self.C_SYL_SEP_TYPE:   self.T_NONE,
            self.C_HAVE_WORD_SEP:  False,
            self.C_WORD_SEP_TYPE:  self.T_NONE,
            self.C_HAVE_VERB_CONJ: False
        }
        #
        # Latin Alphabet Family
        #
        lang_index += 1
        lang_en = {
            self.C_LANG_ID:        self.LANG_EN,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'English',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_LATIN_AZ,
            self.C_HAVE_SYL_SEP:   False,
            self.C_SYL_SEP_TYPE:   self.T_NONE,
            self.C_HAVE_WORD_SEP:  True,
            self.C_WORD_SEP_TYPE:  self.T_SPACE,
            self.C_HAVE_VERB_CONJ: True
        }
        lang_index += 1
        lang_es = {
            self.C_LANG_ID:        self.LANG_ES,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'Spanish',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_LATIN,
            self.C_HAVE_SYL_SEP:   False,
            self.C_SYL_SEP_TYPE:   self.T_NONE,
            self.C_HAVE_WORD_SEP:  True,
            self.C_WORD_SEP_TYPE:  self.T_SPACE,
            self.C_HAVE_VERB_CONJ: True
        }
        lang_index += 1
        lang_fr = {
            self.C_LANG_ID:        self.LANG_FR,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'French',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_LATIN,
            self.C_HAVE_SYL_SEP:   False,
            self.C_SYL_SEP_TYPE:   self.T_NONE,
            self.C_HAVE_WORD_SEP:  True,
            self.C_WORD_SEP_TYPE:  self.T_SPACE,
            self.C_HAVE_VERB_CONJ: True
        }
        lang_index += 1
        lang_de = {
            self.C_LANG_ID:        self.LANG_DE,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'German',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_LATIN,
            self.C_HAVE_SYL_SEP:   False,
            self.C_SYL_SEP_TYPE:   self.T_NONE,
            self.C_HAVE_WORD_SEP:  True,
            self.C_WORD_SEP_TYPE:  self.T_SPACE,
            self.C_HAVE_VERB_CONJ: True
        }
        lang_index += 1
        lang_it = {
            self.C_LANG_ID:        self.LANG_IT,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'Italian',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_LATIN,
            self.C_HAVE_SYL_SEP:   False,
            self.C_SYL_SEP_TYPE:   self.T_NONE,
            self.C_HAVE_WORD_SEP:  True,
            self.C_WORD_SEP_TYPE:  self.T_SPACE,
            self.C_HAVE_VERB_CONJ: True
        }
        lang_index += 1
        lang_nl = {
            self.C_LANG_ID:        self.LANG_NL,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'Dutch',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_LATIN,
            self.C_HAVE_SYL_SEP:   False,
            self.C_SYL_SEP_TYPE:   self.T_NONE,
            self.C_HAVE_WORD_SEP:  True,
            self.C_WORD_SEP_TYPE:  self.T_SPACE,
            self.C_HAVE_VERB_CONJ: True
        }

        lang_index += 1
        lang_vi = {
            self.C_LANG_ID:        self.LANG_VI,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'Vietnamese',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_LATIN_VI_AZ,
            self.C_HAVE_SYL_SEP:   True,
            self.C_SYL_SEP_TYPE:   self.T_SPACE,
            self.C_HAVE_WORD_SEP:  False,
            self.C_WORD_SEP_TYPE:  self.T_NONE,
            self.C_HAVE_VERB_CONJ: False
        }
        lang_index += 1
        lang_id = {
            self.C_LANG_ID:        self.LANG_ID,
            self.C_LANG_NUMBER:    lang_index,
            self.C_LANG_NAME:      'Indonesian',
            self.C_HAVE_ALPHABET:  True,
            self.C_CHAR_TYPE:      self.ALPHABET_LATIN_AZ,
            self.C_HAVE_SYL_SEP:   False,
            self.C_SYL_SEP_TYPE:   self.T_NONE,
            self.C_HAVE_WORD_SEP:  True,
            self.C_WORD_SEP_TYPE:  self.T_SPACE,
            self.C_HAVE_VERB_CONJ: True
        }

        self.langs = {
            # Hangul/CJK
            self.LANG_KO: lang_ko,
            self.LANG_JA: lang_ja,
            # CJK
            self.LANG_ZH: lang_zh,
            # Cyrillic
            self.LANG_RU: lang_ru,
            # Thai
            self.LANG_TH: lang_th,
            # Latin
            self.LANG_EN: lang_en,
            self.LANG_ES: lang_es,
            self.LANG_FR: lang_fr,
            self.LANG_DE: lang_de,
            self.LANG_IT: lang_it,
            self.LANG_NL: lang_nl,
            self.LANG_VI: lang_vi,
            self.LANG_ID: lang_id,
        }
        assert lang_index+1 == len(self.langs)

        # Add ISO 639-2 definitions
        for lang in self.langs.keys():
            if self.PYCLANG is not None:
                lang_639 = self.PYCLANG.get(alpha_2=lang)
                self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_3] = lang_639.alpha_3
                self.langs[lang][LangFeatures.C_LANG_639_2_NAME]    = lang_639.name
                self.langs[lang][LangFeatures.C_LANG_639_2_SCOPE]   = lang_639.scope
                self.langs[lang][LangFeatures.C_LANG_639_2_TYPE]    = lang_639.type
                try:
                    self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_2] = lang_639.alpha_2
                except Exception:
                    self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_2] = ''
                try:
                    self.langs[lang][LangFeatures.C_LANG_639_2_BIBLIO] = lang_639.bibliographic
                except Exception:
                    self.langs[lang][LangFeatures.C_LANG_639_2_BIBLIO] = ''
            else:
                self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_3] = ''
                self.langs[lang][LangFeatures.C_LANG_639_2_NAME]    = ''
                self.langs[lang][LangFeatures.C_LANG_639_2_SCOPE]   = ''
                self.langs[lang][LangFeatures.C_LANG_639_2_TYPE]    = ''
                self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_2] = ''
                self.langs[lang][LangFeatures.C_LANG_639_2_BIBLIO]  = ''

        # Also copy the 2-letter (ISO 639-1) keys to 3-letter (ISO 639-3) keys,
        # so we can access the language structure using either ISO 639-1 or ISO 639-3 codes.
        # If the ISO standard had been more far-sighted (after all, 26*26 = 676 codes only),
        # we would not have to do this.
        new_items = {}
        for key in self.langs.keys():
            lang_iso_699_3 = self.langs[key][LangFeatures.C_LANG_639_2_ALPHA_3]
            if key != lang_iso_699_3:
                lang_dict = self.langs[key].copy()
                # Change lang id to the 3-letter ISO 639-3 code
                lang_dict[self.C_LANG_ID] = lang_iso_699_3
                new_items[lang_iso_699_3] = lang_dict
        for lang_id3 in new_items:
            self.langs[lang_id3] = new_items[lang_id3]

        self.langfeatures = pd.DataFrame(
            self.langs.values()
        )
        # Of course it would be more convenient to keep this data in a csv file,
        # but file path issues etc. would be very unpleasant for the user
        if write_lang_features_to_csv:
            self.langfeatures = self.langfeatures.sort_values(by=[self.C_LANG_NAME], ascending=True)
            self.langfeatures.to_csv('lang_features.csv', sep=',', index=False)
        return
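The ISO 639 enrichment loop above relies on pycountry lookups of the following form (a standalone sketch, assuming pycountry is installed):

import pycountry

lang_639 = pycountry.languages.get(alpha_2='ko')
# Attributes used above: alpha_3, name, scope, type (and alpha_2/bibliographic where present)
print(lang_639.alpha_3, lang_639.name, lang_639.scope, lang_639.type)   # e.g. kor Korean I L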