def wait_for_all_initializations_to_be_done(self, wait_max_time=10):
    if self.is_all_initializations_done:
        return
    count = 1
    sleep_time_wait_initializations = 0.1
    while not self.is_all_initializations_done:
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model not yet fully initialized, sleep for '
            + str(count * sleep_time_wait_initializations) + ' secs now..')
        if count * sleep_time_wait_initializations > wait_max_time:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Waited too long ' + str(count * sleep_time_wait_initializations) \
                     + ' secs. Raising exception..'
            raise Exception(errmsg)
        time.sleep(sleep_time_wait_initializations)
        count += 1
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Initializations all done for model "' + str(self.identifier_string) + '" READY.')
    return
def send(self, user, password, recipients_list, message):
    try:
        if password not in [None, '']:
            self.server.login(user=user, password=password)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Login for user "' + str(user) + '" successful.')
        else:
            # If no password is passed in, there is no need to log in
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not doing login for user "' + str(user) + '", no password given')
        self.server.sendmail(from_addr=user, to_addrs=recipients_list, msg=message)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Message from ' + str(user) + ' to ' + str(recipients_list)
            + ' sent successfully. Closing server..')
        self.server.close()
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Mail server "' + str(self.mail_server_url) + '" closed')
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Exception sending mail from ' + str(user) + ' to ' + str(recipients_list) \
                 + '. Got exception ' + str(ex) + '.'
        Log.error(errmsg)
        raise Exception(errmsg)
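# A minimal usage sketch for send() above, assuming the enclosing class is
# named `Mailer` and takes the server settings in its constructor (hypothetical
# names, not confirmed by this file; adjust to the actual class):
#
#   mailer = Mailer(
#       mail_server_url='smtp.example.com',   # hypothetical constructor args
#       mail_server_port=465,
#       mode=Mailer.MAIL_MODE_SSL,
#   )
#   mailer.send(
#       user='sender@example.com',
#       password='app-password',              # pass None or '' to skip login
#       recipients_list=['rcpt@example.com'],
#       message='Subject: Test\n\nHello.',
#   )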
def __init_smtp(self):
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Trying to initialize mail server "' + str(self.mail_server_url)
        + '" port ' + str(self.mail_server_port) + ' using mode "' + str(self.mode) + '"...')
    if self.mode == self.MAIL_MODE_SSL:
        # Create a secure SSL context
        # self.context = ssl.create_default_context()
        self.server = smtplib.SMTP_SSL(
            host=self.mail_server_url,
            port=self.mail_server_port,
            # context=self.context
        )
        self.server.ehlo()
    elif self.mode == self.MAIL_MODE_SMTP:
        self.server = smtplib.SMTP(host=self.mail_server_url, port=self.mail_server_port)
        self.server.ehlo()
    else:
        # Default mode: plain SMTP upgraded to a TLS connection via STARTTLS
        self.server = smtplib.SMTP(host=self.mail_server_url, port=self.mail_server_port)
        self.server.ehlo()
        self.server.starttls()
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': SMTP mode "' + str(self.mode) + '" successfully initialized.')
    return
def add_parent(self, parent):
    assert type(parent) is MultiTreeNode
    if parent.dead_node:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Parent "' + str(parent.name) + '" is a dead node (cannot have children),'
            + ' not adding parent for node "' + str(self.name) + '"')
        return
    if parent.name in self.parent_names:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': For node "' + str(self.name) + '" parent "' + str(parent.name)
            + '" already exists')
    else:
        # Don't add if the parent already exists anywhere higher up the tree hierarchy
        if self.is_higher_level(node=parent, supposed_child_node=self):
            return
        # Update both parent and child
        self.parents.append(parent)
        self.update()
        parent.children.append(self)
        parent.update()
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': For node "' + str(self.name) + '" successfully added parent "'
            + str(parent.name) + '"')
def check_if_model_updated(self):
    updated_time = os.path.getmtime(self.fpath_updated_file)
    Log.debugdebug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Model identifier "' + str(self.identifier_string)
        + '" last updated time ' + str(self.model_updated_time)
        + ', file updated time ' + str(updated_time) + '.')
    if updated_time > self.model_updated_time:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model update time for identifier "' + str(self.identifier_string) + '" - "'
            + str(datetime.fromtimestamp(updated_time)) + '" is newer than "'
            + str(datetime.fromtimestamp(self.model_updated_time)) + '". Reloading model...')
        try:
            self.mutex_training.acquire()
            # Reset model flags to not ready
            self.model_loaded = False
            self.model_updated_time = updated_time
        finally:
            self.mutex_training.release()
        return True
    else:
        return False
def __init__(self,
             lang=LangFeatures.LANG_KO,
             audio_source=SOURCE_MIC,
             audio_file=None,
             engine=ENGINE_GOOGLE,
             auth_info=None):
    self.lang = lang
    self.audio_source = audio_source
    self.audio_file = audio_file
    self.engine = engine
    self.auth_info = auth_info
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Language "' + str(self.lang) + '" audio source "' + str(self.audio_source) + '"')
    if self.audio_source == self.SOURCE_MIC:
        self.sr_source = sr.Microphone()
    elif self.audio_source == self.SOURCE_FILE:
        self.sr_source = sr.AudioFile(audio_file)
    else:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unsupported audio source "' + str(self.audio_source) + '"')
    return
def __init__(self, lang):
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    self.raw_words = None
    self.common_words = None
    lfobj = LangFeatures()
    self.lang_have_verb_conj = lfobj.have_verb_conjugation(lang=self.lang)
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Lang "' + str(self.lang) + '" verb conjugation = '
        + str(self.lang_have_verb_conj) + '.')
    self.word_stemmer = None
    if self.lang_have_verb_conj:
        try:
            self.word_stemmer = Lemmatizer(lang=self.lang)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer initialized successfully.')
        except Exception as ex_stemmer:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer failed to initialize: ' \
                     + str(ex_stemmer) + '.'
            Log.warning(errmsg)
            self.word_stemmer = None
    return
def __load_sample_model(self,
                        embed_input_dim=1000,
                        embed_output_dim=64,
                        embed_input_len=20,
                        lstm_units=128):
    #
    # Layers Design
    #
    lstm_model = keras.Sequential()
    # Add an Embedding layer expecting an input vocab of size 1000, with
    # output embedding dimension of size 64
    lstm_model.add(
        keras.layers.Embedding(
            input_dim=embed_input_dim,
            output_dim=embed_output_dim,
            input_length=embed_input_len))
    # Add an LSTM layer with 128 internal units
    lstm_model.add(keras.layers.LSTM(lstm_units))
    # Add a Dense layer with 10 units and softmax activation
    lstm_model.add(keras.layers.Dense(10, activation='softmax'))
    # Finally compile the model
    lstm_model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Model compiled successfully.')
    lstm_model.summary()
    return lstm_model
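# A minimal sketch of feeding the sample model above, assuming integer-encoded
# token sequences of length embed_input_len and 10 one-hot labels (the shapes
# implied by the layer design; the random data is purely illustrative):
#
#   import numpy as np
#   model = self.__load_sample_model()               # called from inside the class
#   x = np.random.randint(0, 1000, size=(32, 20))    # 32 sequences of 20 token ids
#   y = keras.utils.to_categorical(
#       np.random.randint(0, 10, size=32), num_classes=10)
#   model.fit(x, y, epochs=1)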
def run(self):
    try:
        self.__mutex_training.acquire()
        self.bot_training_start_time = dt.datetime.now()
        self.log_training = []
        self.__pre_process_training_data()
        self.train()
        self.bot_training_end_time = dt.datetime.now()
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Training identifier "' + str(self.identifier_string) \
                 + '" training exception: ' + str(ex) + '.'
        Log.critical(errmsg)
        raise Exception(errmsg)
    finally:
        self.is_training_done = True
        self.__mutex_training.release()
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Train mode "' + str(self.train_mode) + '". Training identifier "'
        + str(self.identifier_string) + '" trained successfully.')
    return self.log_training
def check_prediction_stats(self, X, Y, y_predicted):
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Checking prediction stats..')
    # Compare the top predicted label against the true label for every sample
    count_correct = 0
    for i in range(X.shape[0]):
        data_i = X[i]
        label_i = Y[i]
        prob_distribution = y_predicted[i]
        top_x = NumpyUtil.get_top_indexes(data=prob_distribution, ascending=False, top_x=5)
        if top_x[0] == label_i:
            count_correct += 1
        Log.debug(
            str(i) + '. ' + str(data_i) + ': Label=' + str(label_i)
            + ', predicted=' + str(top_x))
    Log.important('Boosting Accuracy = ' + str(100 * count_correct / X.shape[0]) + '%.')
    return
def __recognize_file(self):
    need_convert_format = re.sub(
        pattern='(.*[.])([a-zA-Z0-9]+$)', repl='\\2', string=self.audio_file
    ).lower() != 'wav'
    audio_filepath_wav = self.audio_file
    if need_convert_format:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converting "' + str(self.audio_file) + '" to wav format..')
        audio_filepath_wav = AudioUtils().convert_format(filepath=self.audio_file)
    # Initialize recognizer class (for recognizing the speech)
    r = sr.Recognizer()
    # Read the audio file as source, listen to it and store the data in audio_text
    with sr.AudioFile(audio_filepath_wav) as source:
        audio_text = r.listen(source)
    # The recognize_*() methods will throw a request error if the API is
    # unreachable, hence the exception handling
    try:
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converting audio transcript into text ...')
        if self.engine == SpeechRecognition.ENGINE_GOOGLE:
            text = r.recognize_google(audio_text, language=self.lang)
        elif self.engine == SpeechRecognition.ENGINE_GOOGLE_CLOUD:
            text = r.recognize_google_cloud(
                audio_text, credentials_json=self.auth_info, language=self.lang)
        elif self.engine == SpeechRecognition.ENGINE_BING:
            text = r.recognize_bing(audio_text, key=self.auth_info, language=self.lang)
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Unsupported engine "' + str(self.engine) + '".')
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Recognized "' + str(self.lang) + '" text "' + str(text)
            + '" from audio file "' + str(self.audio_file) + '"')
        return text
    except Exception as ex:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Exception converting audio transcript from "' + str(self.audio_file)
            + '": ' + str(ex))
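# A minimal usage sketch for file recognition, assuming the enclosing class is
# SpeechRecognition (the name referenced inside __recognize_file above) and that
# __recognize_file() is exposed through a public method such as recognize()
# (hypothetical; adjust to the actual public API):
#
#   recognizer = SpeechRecognition(
#       lang='en',
#       audio_source=SpeechRecognition.SOURCE_FILE,
#       audio_file='/tmp/sample.mp3',          # non-wav files get converted first
#       engine=SpeechRecognition.ENGINE_GOOGLE,
#   )
#   text = recognizer.recognize()              # hypothetical public wrapper
#   print(text)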
def add_intent_name_to_training_data(self):
    #
    # We need to add the intent name into the training data as well
    #
    df_intent_id_name = pd.DataFrame({
        DaehuaTrainDataModel.COL_TDATA_INTENT_ID:
            self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_ID],
        DaehuaTrainDataModel.COL_TDATA_INTENT_NAME:
            self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_NAME]
    })
    # Make unique by dropping duplicate intent IDs
    df_intent_id_name.drop_duplicates(inplace=True)
    for idx in df_intent_id_name.index:
        intent_id = df_intent_id_name[DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[idx]
        try:
            int_name = str(df_intent_id_name[DaehuaTrainDataModel.COL_TDATA_INTENT_NAME].loc[idx])
            # Arguments must be in list form, otherwise we can't create this DataFrame
            row_to_append = pd.DataFrame(
                data=self.__get_row_to_append_to_training_data(
                    intent_id=[intent_id],
                    intent_name=[int_name],
                    text=[int_name],
                    text_id=[TrDataPreprocessor.TRDATA_ID_INTENT_NAME],
                    # Make sure to write back this value with processed text
                    processed_text=[None],
                    lang_detected=[None],
                    internal_counter=[self.df_training_data.shape[0]]))
            #
            # We are appending to a DataFrame that might have a different column ordering.
            # So we make sure they are in the same order, to avoid all the sort=False/True
            # warning messages by pandas due to the required join() operation.
            # If in the same order, then we avoid the join().
            #
            self.df_training_data = self.df_training_data.append(row_to_append, sort=True)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Appended intent name "' + str(int_name) + '" with intent ID '
                + str(intent_id) + ' to list of training data. Row appended = '
                + str(row_to_append))
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Could not append to DataFrame or could not get intent name for intent ID ' \
                     + str(intent_id) + '. Exception ' + str(ex)
            Log.warning(errmsg)
            raise Exception(errmsg)
    self.__process_training_data_index()
    return self.df_training_data
def __init__(self):
    self.lang_features = LangFeatures()
    # Map alphabet name to unicode character set array
    self.alphabet_dict = {}
    for alp in self.TESTS_BY_ORDER:
        self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(alphabet=alp)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Alphabets used: ' + str(self.alphabet_dict.keys()))
    self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
    Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))
    # Load common words
    self.common_words = {}
    self.common_words[LangFeatures.LANG_EN] = English()
    self.common_words[LangFeatures.LANG_ES] = Spanish()
    self.common_words[LangFeatures.LANG_FR] = French()
    self.common_words[LangFeatures.LANG_ID] = Indonesian()
    self.common_words[LangFeatures.LANG_VI] = Vietnamese()
    # Load stemmers
    self.word_stemmer = {}
    for lang in self.SUPPORTED_LANGS:
        lang_have_verb_conj = self.lang_features.have_verb_conjugation(lang=lang)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.')
        self.word_stemmer[lang] = None
        if lang_have_verb_conj:
            try:
                self.word_stemmer[lang] = Lemmatizer(lang=lang)
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.')
            except Exception as ex_stemmer:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Lang "' + str(lang) + '" stemmer/lemmatizer failed to initialize: ' \
                         + str(ex_stemmer) + '.'
                Log.warning(errmsg)
    self.profiler_detect_alp = ProfilingHelper(profiler_name=str(self.__class__))
    return
def reset_fields_to_incomplete(self):
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Reset form fields to incomplete for form ' + str(self.to_json()))
    for fld in self.form_fields:
        fld.completed = False
        fld.value = None
def join(self, timeout=None):
    Log.important(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Model Identifier "' + str(self.identifier_string) + '" join called..')
    self.stoprequest.set()
    super(ModelInterface, self).join(timeout=timeout)
    Log.important(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Model Identifier "' + str(self.identifier_string) + '" Background Thread ended..')
def reset(self):
    Log.important('Form reset')
    self.set_state_none()
    # The current field we are trying to extract from the user
    self.conv_current_field_index = None
    self.conv_current_field_name = None
    # Previous fields set by user
    # self.conv_completed_fields = []
    # Reset fields
    self.form.reset_fields_to_incomplete()
    return
def preprocess_training_data(self):
    if not self.is_training_data_ready:
        try:
            #
            # The external interface must pass back 2 parameters, a DataFrame of
            # preprocessed training data and the Embedding Layer params
            #
            self.df_training_data_pp, self.embedding_params = \
                self.training_data_source.fetch_and_preprocess_data()
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Successfully preprocessed training data. Max label val = '
                + str(self.embedding_params.max_label_val)
                + ', max sentence length = ' + str(self.embedding_params.max_sent_len)
                + ', vocabulary size = ' + str(self.embedding_params.vocab_size)
                + ', x one hot dict: ' + str(self.embedding_params.x_one_hot_dict))
            self.training_data = TextTrainer.convert_preprocessed_text_to_training_data_model(
                model_name=self.model_name,
                training_dataframe=self.df_training_data_pp,
                embedding_x=self.embedding_params.x,
                embedding_y=self.embedding_params.y,
                embedding_x_one_hot_dict=self.embedding_params.x_one_hot_dict,
                embedding_y_one_hot_dict=self.embedding_params.y_one_hot_dict,
                word_freq_model=self.word_freq_model,
            )
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Exception calling external object type "' + str(type(self.training_data_source)) \
                     + '" method fetch_and_preprocess_data(), exception msg: ' + str(ex)
            Log.error(errmsg)
            raise Exception(errmsg)
    if type(self.training_data) is not tdm.TrainingDataModel:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': "' + str(self.identifier_string) + '": Wrong training data type "'
            + str(type(self.training_data)) + '".')
    # Train a single y/label ID only, regardless of train mode
    if self.y_id is not None:
        # Filter by this y/label only
        self.training_data.filter_by_y_id(y_id=self.y_id)
    return
def persist_model_to_storage(self):
    prf_start = prf.Profiling.start()
    self.model_data.persist_model_to_storage(log_training=self.logs_training)
    if self.do_profiling:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': PROFILING persist_model_to_storage(): '
            + prf.Profiling.get_time_dif_str(prf_start, prf.Profiling.stop()),
            log_list=self.logs_training)
    return
def run_unit_test(self):
    res_final = ut.ResultObj(count_ok=0, count_fail=0)
    res = self.test_textcluster_english()
    res_final.update(other_res_obj=res)
    res = self.test_textcluster_chinese()
    res_final.update(other_res_obj=res)
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': PASSED ' + str(res_final.count_ok) + ', FAILED ' + str(res_final.count_fail))
    return res_final
def check(self,
          # Text array e.g. ['это', 'мое', 'предложение']
          text_segmented_arr,
          max_cost=1):
    start_prf = prf.Profiling.start()
    #
    # 1. Spell check individual words, without context
    #
    len_text = len(text_segmented_arr)
    corrected_text_arr = []
    # Get the list of words in the model
    for i in range(len_text):
        w = text_segmented_arr[i]
        if (w is None) or (len(w) == 0):
            continue
        w_corrected = w
        if w not in self.words_list:
            df_correction_matches = self.spell_check_word.search_close_words(
                word=w, max_cost=max_cost)
            # Take the first word, the one with the maximum weight
            if df_correction_matches is not None:
                # In case the indexes are not in order
                top_loc = df_correction_matches.index[0]
                w_corrected = df_correction_matches[SpellCheckWord.COL_CORRECTED_WORD].loc[top_loc]
        corrected_text_arr.append(w_corrected)
    #
    # 2. Spell check the sentence, with context
    #
    # TODO
    if self.do_profiling:
        ms = 1000 * prf.Profiling.get_time_dif_secs(start=start_prf, stop=prf.Profiling.stop())
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Spelling correction for ' + str(text_segmented_arr) + ' to '
            + str(corrected_text_arr) + ' took ' + str(round(ms, 2)) + 'ms')
    return corrected_text_arr
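# A minimal usage sketch for check() above, assuming the enclosing spell
# checker object is constructed elsewhere with a words list and a
# SpellCheckWord instance (construction details are not shown in this file):
#
#   corrected = spell_checker.check(
#       text_segmented_arr=['это', 'мое', 'предложение'],
#       max_cost=1,   # maximum edit cost when searching close words
#   )
#   # Words found in self.words_list pass through unchanged; unknown words are
#   # replaced by the top-weighted close match, if any.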
def train(self, X, Y):
    # Defining the size of the embedding
    embed_size = 2
    # Defining the neural network
    inp = Input(shape=(X.shape[1],))
    Log.debug('Input shape: ' + str(X.shape))
    # The middle layer is the embedding vector we seek to extract.
    # "linear" because this will serve as the word definition, to be input to other neural networks
    x = Dense(units=embed_size, activation='linear')(inp)
    # Standard softmax final layer
    x = Dense(units=Y.shape[1], activation='softmax')(x)
    model = Model(inputs=inp, outputs=x)
    Log.debug('Output shape: ' + str(Y.shape))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.summary()
    # Optimizing the network weights
    model.fit(x=X, y=Y, batch_size=256, epochs=100)
    # Obtaining the weights from the neural network.
    # These are the so-called word embeddings.
    # The input layer (embedding weights)
    weights = model.get_weights()[0]
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Weights extracted as embedding layer: ' + str(weights))
    Log.debug('Number of weight rows: ' + str(len(weights)))
    # Create a dictionary to store the embeddings in. The key is a unique word and
    # the value is the numeric vector
    embedding_dict = {}
    for word in self.word_index_dict.keys():
        embedding_dict.update({word: weights[self.word_index_dict.get(word)]})
    return embedding_dict
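# A minimal sketch of calling train() above. X is assumed to be one-hot/context
# vectors over the vocabulary and Y the one-hot target words, with
# self.word_index_dict mapping each word to its row in the learned weights
# (illustrative random data only, and assuming keras is importable here):
#
#   import numpy as np
#   from keras.utils import to_categorical
#   vocab_size = len(self.word_index_dict)
#   X = np.random.randint(0, 2, size=(500, vocab_size))
#   Y = to_categorical(np.random.randint(0, vocab_size, size=500), vocab_size)
#   embedding_dict = self.train(X=X, Y=Y)
#   # embedding_dict['word'] is now a 2-dimensional vector (embed_size = 2)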
def fit_gradient_boosting(self,
                          X_train,
                          Y_train,
                          num_class,
                          feature_names,
                          num_round=10,
                          # e.g. 'binary:logistic' for binary classification
                          classtype='multi:softprob',
                          save_model_path=None):
    dtrain = self.convert_to_xgboost_data_format(
        data=X_train,
        labels=Y_train,
        feature_names=feature_names,
    )
    param = {
        'max_depth': 3,
        'eta': 1,
        'objective': classtype,
        'num_class': num_class,
        'nthread': 4,
        'eval_metric': 'auc',
    }
    # evallist = [(dtest, 'test')]
    self.model = xgb.train(
        param,
        dtrain,
        num_round,
        # evallist
    )
    model_dump = self.model.get_dump()
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Boosting model class type "' + str(classtype) + '" trained successfully.'
        + ' Number of trees = ' + str(len(model_dump))
        + ', Feature names: ' + str(self.model.feature_names))
    if save_model_path is not None:
        pickle.dump(self.model, open(save_model_path, "wb"))
    return self.model
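# A minimal usage sketch for fit_gradient_boosting() above, with small
# illustrative data (the feature names, shapes and paths are arbitrary):
#
#   import numpy as np
#   X_train = np.random.rand(100, 3)
#   Y_train = np.random.randint(0, 4, size=100)   # 4 classes
#   booster = self.fit_gradient_boosting(
#       X_train=X_train,
#       Y_train=Y_train,
#       num_class=4,
#       feature_names=['f1', 'f2', 'f3'],
#       num_round=10,
#       save_model_path='/tmp/boost.pkl',         # optional pickle dump
#   )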
def run_unit_test(self):
    res_final = ut.ResultObj(count_ok=0, count_fail=0)
    from nwae.lang.config.Config import Config
    config = Config.get_cmdline_params_and_init_config_singleton(
        Derived_Class=Config,
        default_config_file=Config.CONFIG_FILE_PATH_DEFAULT)
    df = pd.read_csv('/usr/local/git/nwae/nwae.lang/data/sample.intents.csv', sep=";")
    print(df)
    # Convert from pandas Series to list (all sentences, same as in EXAMPLE_TEXTS)
    sents = pd.Series(df['sentence'], dtype="string").tolist()
    langs = pd.Series(df['lang'], dtype="string").tolist()
    print('Langs ' + str(langs))
    tp = TextPreprscrAllLang(
        dir_wordlist=config.get_config(param=Config.PARAM_NLP_DIR_WORDLIST),
        postfix_wordlist=config.get_config(param=Config.PARAM_NLP_POSTFIX_WORDLIST),
        dir_app_wordlist=config.get_config(param=Config.PARAM_NLP_DIR_APP_WORDLIST),
        postfix_app_wordlist=config.get_config(param=Config.PARAM_NLP_POSTFIX_APP_WORDLIST),
        dir_synlist=config.get_config(param=Config.PARAM_NLP_DIR_SYNONYMLIST),
        postfix_synlist=config.get_config(param=Config.PARAM_NLP_POSTFIX_SYNONYMLIST),
    )
    tp.preprocess_list_all_langs(sentences_list=sents)
    langs_detected = tp.detect_lang(sentences_list=sents)
    correct_pct, correct_count, total_count = self.get_stats_lang_detect(
        sentences_list=sents,
        langs_real=langs,
        langs_detected=langs_detected,
    )
    Log.important('Method language detection. Correct ' + str(correct_pct) + '%, '
                  + str(correct_count) + '/' + str(total_count))
    return res_final
def build_tree_roots(self):
    # Find root tree nodes
    self.tree_roots = {}
    for name in self.tree_nodes.keys():
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Checking if ' + str(name) + ' is a tree root...')
        node = self.tree_nodes[name]
        if not node.is_dead_node():
            if node.is_tree_root():
                self.tree_roots[name] = node
                self.tree_roots_depth[name] = self.calculate_tree_depth(node=node)
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Found ' + str(len(self.tree_roots)) + ' tree roots')
    return
def clean_nan_values(data, colnames, nan_string=DEFAULT_NAN_STRING):
    # MUST convert the column to string, so all NAs, N/As become strings
    for name in colnames:
        # Create a new copy
        col_series = np.array(data[name])
        condition_null = np.array(data[name].isnull())
        # Count nan rows
        count_nan = np.sum(condition_null * 1)
        Log.important(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Total ' + str(name) + ' rows with NULL = ' + str(count_nan))
        # Replace nan
        col_series[condition_null] = nan_string
        data[name] = col_series.astype(dtype=str)
    return data
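# A minimal usage sketch for clean_nan_values() above: NULL/NaN cells in the
# named columns are replaced by nan_string and the columns are cast to str:
#
#   import pandas as pd
#   df = pd.DataFrame({'a': ['x', None, 'y'], 'b': [1.0, 2.0, None]})
#   df = clean_nan_values(data=df, colnames=['a', 'b'], nan_string='<NA>')
#   # df['a'] -> ['x', '<NA>', 'y'], df['b'] -> ['1.0', '2.0', '<NA>']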
def __run_lang_unit_test(self):
    res_final = ut.ResultObj(count_ok=0, count_fail=0)
    for txt_expected in UtTxtPreprocessor.TESTS[self.lang]:
        txt = txt_expected[0]
        expected = txt_expected[1]
        observed = self.txt_preprocessor.process_text(
            inputtext=txt,
            return_as_string=False,
            use_special_symbol_username_nonword=True)
        res_final.update_bool(
            res_bool=ut.UnitTest.assert_true(
                observed=observed,
                expected=expected,
                test_comment='test "' + str(txt) + '"'))
    Log.important('***** ' + str(self.lang) + ' PASSED ' + str(res_final.count_ok)
                  + ', FAILED ' + str(res_final.count_fail) + ' *****')
    return res_final
def wait_for_model_to_be_ready(self, wait_max_time=10):
    #
    # The model may have been reloaded without us knowing, e.g. a user retrained it
    #
    if self.model_last_reloaded_counter != self.model.get_model_reloaded_counter():
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model "' + str(self.identifier_string) + '" last counter '
            + str(self.model_last_reloaded_counter) + ' not equal to model counter '
            + str(self.model.get_model_reloaded_counter())
            + '. Model updated, thus we must update our text processor.')
        # Must reload because the TxtPreprocessor class needs data from the model
        self.load_text_processor()
    if self.model.is_model_ready():
        return
    count = 1
    sleep_time_wait_model = 0.1
    while not self.model.is_model_ready():
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model "' + str(self.identifier_string) + '" not yet ready, sleep for '
            + str(count * sleep_time_wait_model) + ' secs now..')
        if count * sleep_time_wait_model > wait_max_time:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Waited for model "' + str(self.identifier_string) \
                     + '" too long ' + str(count * sleep_time_wait_model) + ' secs. Raising exception..'
            raise Exception(errmsg)
        time.sleep(sleep_time_wait_model)
        count += 1
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Model "' + str(self.identifier_string) + '" READY.')
    return
def __process_training_data_index(self):
    # Sort by Intent ID and reset the index.
    # By also sorting the internal counter, we keep the original order within an
    # intent class, and the added Intent Name row will be last within the class
    self.df_training_data = self.df_training_data.sort_values(
        [
            DaehuaTrainDataModel.COL_TDATA_INTENT_ID,
            TrDataPreprocessor.TD_INTERNAL_COUNTER
        ],
        ascending=True)
    self.df_training_data = self.df_training_data.reset_index(drop=True)
    # Now derive the training data index
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Assigning numbers to training data based on intent...')
    # Add intent index
    self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_INDEX] = \
        [0] * self.df_training_data.shape[0]
    prev_cat_int = ''
    prev_cat_int_index = 0
    for i in range(self.df_training_data.shape[0]):
        cur_cat_int = self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[i]
        if cur_cat_int != prev_cat_int:
            prev_cat_int = cur_cat_int
            prev_cat_int_index = 0
        prev_cat_int_index = prev_cat_int_index + 1
        self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_INDEX].at[i] = prev_cat_int_index
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': After processing training data index, 10 lines of training data:\n\r'
        + str(self.df_training_data.columns) + '\n\r'
        + str(self.df_training_data[1:10].values)
        + '\n\r: Shape: ' + str(self.df_training_data.shape))
    return
def run(self):
    Log.important(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Model Identifier "' + str(self.identifier_string) + '" Background Thread started..')
    if not self.is_model_ready():
        self.load_model_parameters()
        self.model_reload_counter += 1
    sleep_time = 10
    while True:
        if self.stoprequest.is_set():
            Log.important(
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model Identifier "' + str(self.identifier_string)
                + '" Breaking from forever thread...')
            break
        if self.check_if_model_updated():
            try:
                self.__mutex_load_model.acquire()
                self.load_model_parameters()
                if not self.is_model_ready():
                    Log.important(
                        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Model "' + str(self.identifier_string)
                        + '" failed to load. Try again in ' + str(sleep_time) + ' secs..')
                else:
                    self.model_reload_counter += 1
            finally:
                self.__mutex_load_model.release()
        time.sleep(sleep_time)
def get_model_file_prefix(dir_path_model, model_name, identifier_string, is_partial_training):
    # Prefix or dir
    prefix_or_dir = dir_path_model + '/' + model_name + '.' + identifier_string
    if is_partial_training:
        # Check if the directory exists
        if not os.path.isdir(prefix_or_dir):
            Log.important(
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Path "' + str(prefix_or_dir)
                + '" does not exist. Trying to create this directory...')
            try:
                os.mkdir(path=prefix_or_dir)
                Log.important(
                    str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Path "' + str(prefix_or_dir) + '" successfully created.')
            except Exception as ex:
                errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Error creating directory "' + str(prefix_or_dir) \
                         + '". Exception ' + str(ex) + '.'
                Log.error(errmsg)
                raise Exception(errmsg)
        return prefix_or_dir
    else:
        Log.important(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Using path prefix "' + str(prefix_or_dir) + '"')
        return prefix_or_dir
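# A minimal usage sketch for get_model_file_prefix() above. With partial
# training the prefix is a directory (created if missing), otherwise it is a
# plain file path prefix (the paths and names here are arbitrary):
#
#   prefix = get_model_file_prefix(
#       dir_path_model='/tmp/models',
#       model_name='nn_dense',
#       identifier_string='demo_bot',
#       is_partial_training=False,
#   )
#   # prefix == '/tmp/models/nn_dense.demo_bot'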