def exit_gracefully(self, signum, frame):
    """Log the received signal at critical level, then call the exit callback.

    Args:
        signum: Number of the received signal; only used in the log message.
        frame: Not used by this method.
    """
    location = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
    Log.critical(location + ': Signal ' + str(signum) + ' received.')
    self.exit_callback()
def run(self):
    """Run one training pass while holding the training mutex.

    Records the start and end timestamps, resets ``self.log_training``,
    pre-processes the training data and calls ``self.train()``.
    ``self.is_training_done`` is set and the mutex is released whether or not
    training succeeded.

    Returns:
        ``self.log_training``.

    Raises:
        Exception: Wraps (and chains) any exception raised while training,
            after logging it at critical level.
    """
    # Acquire before entering the try block: if acquisition itself fails, the
    # finally clause must not try to release a mutex we never held.
    self.__mutex_training.acquire()
    try:
        self.bot_training_start_time = dt.datetime.now()
        self.log_training = []
        self.__pre_process_training_data()
        self.train()
        self.bot_training_end_time = dt.datetime.now()
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                 + ': Training Identifier ' + str(self.identifier_string)\
                 + '" training exception: ' + str(ex) + '.'
        Log.critical(errmsg)
        raise Exception(errmsg) from ex
    finally:
        self.is_training_done = True
        self.__mutex_training.release()

    # Only reached when no exception was raised above.
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Train mode "' + str(self.train_mode)
        + '". Training Identifier ' + str(self.identifier_string)
        + '" trained successfully.'
    )
    return self.log_training
def run_unit_tests(self):
    """Run the Nwae ML unit test suite and log the pass/fail totals.

    Returns:
        The ``uthelper.ResultObj`` accumulating the suite's counts.
    """
    res_final = uthelper.ResultObj(count_ok=0, count_fail=0)

    res = NwaeMlUnitTest(ut_params=self.ut_params).run_unit_tests()
    res_final.update(other_res_obj=res)
    Log.critical(
        'Nwae ML Unit Test PASSED ' + str(res.count_ok)
        + ', FAILED ' + str(res.count_fail))

    # The pass count was garbled in the source; it is reported from the
    # accumulated result, mirroring the fail count beside it.
    Log.critical(
        'TOTAL PASS = ' + str(res_final.count_ok)
        + ', TOTAL FAIL = ' + str(res_final.count_fail))
    return res_final
def run(self):
    """Wait up to 30 seconds for the model to be ready, then load the text processor.

    Raises:
        Exception: If waiting for the model raised; the failure is logged at
            critical level first.
    """
    try:
        self.wait_for_model_to_be_ready(wait_max_time=30)
    except Exception as ex:
        location = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        errmsg = location + ': Waited 30secs for model to be ready but failed! ' + str(ex)
        Log.critical(errmsg)
        raise Exception(errmsg)

    self.load_text_processor()
def transform_input_for_model(
        self,
        # This should be a list of words as a sentence
        x_input,
        word_freq_model=FeatureVector.COL_FREQUENCY,
):
    """Convert a tokenized sentence into a 2-dimensional feature array.

    A ``FeatureVector`` is built using the model's features as template, the
    frequency feature vector of ``x_input`` is computed, and the column named
    ``word_freq_model`` is extracted and converted to 2 dimensions.

    Args:
        x_input: List of words forming the sentence.
        word_freq_model: Name of the feature-vector column to extract.

    Returns:
        The result of ``npUtil.NumpyUtil.convert_dimension(..., to_dim=2)``
        applied to the extracted column.

    Raises:
        Exception: If the feature vector cannot be computed, or the extracted
            vector is not 1-dimensional.
        AssertionError: If ``word_freq_model`` is not a column of the computed
            feature vector.
    """
    # This could be numbers, words, etc.
    features_model = list(self.get_model_features())

    # Convert sentence to a mathematical object (feature vector)
    model_fv = FeatureVector()
    model_fv.set_freq_feature_vector_template(list_symbols=features_model)

    # Get feature vector of text
    try:
        df_fv = model_fv.get_freq_feature_vector(text_list=x_input)
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Exception occurred calculating FV for "' + str(x_input) \
                 + '": Exception "' + str(ex) \
                 + '\n\rUsing FV Template:\n\r' + str(model_fv.get_fv_template()) \
                 + ', FV Weights:\n\r' + str(model_fv.get_fv_weights())
        Log.critical(errmsg)
        raise Exception(errmsg) from ex

    # Explicit raise instead of an assert statement, so the validation is not
    # stripped when Python runs with -O. The exception type is unchanged.
    if word_freq_model not in df_fv.columns:
        raise AssertionError(
            '"' + str(word_freq_model) + '" must be in ' + str(df_fv.columns))

    # This creates a single row matrix that needs to be transposed before matrix multiplications
    # ndmin=2 will force numpy to create a 2D matrix instead of a 1D vector
    # For now we make it 1D first
    fv_text_1d = np.array(df_fv[word_freq_model].values, ndmin=1)
    if fv_text_1d.ndim != 1:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Expected a 1D vector, got ' + str(fv_text_1d.ndim) + 'D!')
    Log.debugdebug(fv_text_1d)

    x_transformed = npUtil.NumpyUtil.convert_dimension(arr=fv_text_1d, to_dim=2)
    return x_transformed
def train_from_partial_models(
        self,
        write_model_to_storage=True,
        write_training_data_to_storage=False,
        # Log training events
        logs=None):
    """Build the model from partial trainings, using EIDF weights read from storage.

    Args:
        write_model_to_storage: Not used by this method.
        write_training_data_to_storage: Not used by this method.
        logs: List that receives training log events; a fresh list is used
            when this is not exactly a ``list``.

    Returns:
        ``self.logs_training``.

    Raises:
        Exception: If the EIDF cannot be read from storage.
    """
    #
    # Load EIDF first
    # TODO How to ensure there are no missing words?
    #
    feature_names = self.training_data.get_x_name()
    try:
        self.logs_training = logs if type(logs) is list else []

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Initializing IDF object.. try to read from file first',
            log_list=self.logs_training)

        # Try to read from file
        eidf_frame = eidf.Eidf.read_eidf_from_storage(
            dir_path_model=self.dir_path_model,
            identifier_string=self.identifier_string,
            x_name=feature_names,
            log_training=self.logs_training)
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Successfully Read EIDF from file',
            log_list=self.logs_training)
        eidf_values = eidf_frame[eidf.Eidf.STORAGE_COL_EIDF]
        self.model_data.idf = np.array(eidf_values)
    except Exception as ex_eidf:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                 + ': No EIDF from file available. Exception ' + str(ex_eidf)
        Log.critical(errmsg, log_list=self.logs_training)
        raise Exception(errmsg)

    # Standardize to at least 2-dimensional, easier when weighting x
    self.model_data.idf = npUtil.NumpyUtil.convert_dimension(
        arr=self.model_data.idf, to_dim=2)

    # Combine the partial trainings into the model data
    self.model_data.load_model_from_partial_trainings_data(
        td_latest=self.training_data,
        log_training=self.logs_training)

    return self.logs_training
def __return_array_words_as_string(self, array_words):
    """Join words into one string using the language's word separator.

    Args:
        array_words: Iterable of word strings to join.

    Returns:
        The words joined by ``BasicPreprocessor.get_word_separator(lang=self.lang)``.
    """
    a = prf.Profiling.start()
    print_separator = BasicPreprocessor.get_word_separator(lang=self.lang)
    s = print_separator.join(array_words)

    if self.do_profiling:
        b = prf.Profiling.stop()
        # The original message referenced an undefined name ``text``, which
        # raised NameError whenever profiling was on; log the input words.
        Log.critical(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': PROFILING Segment Words for [' + str(array_words)
            + '] to [' + s
            + '] took ' + prf.Profiling.get_time_dif_str(start=a, stop=b)
        )
    return s
def load_model_parameters(self):
    """Load model parameters from storage while holding the training mutex.

    Raises:
        Exception: Wraps (and chains) any exception raised while loading,
            after logging it at critical level.
    """
    prf_start = prf.Profiling.start()

    # Acquire before the try block so the finally clause only ever releases a
    # mutex that was actually acquired.
    self.mutex_training.acquire()
    try:
        self.model_data.load_model_parameters_from_storage()
    except Exception as ex:
        # str() guards against a non-string identifier breaking the message.
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                 + ': Failed to load model data for identifier "' + str(self.identifier_string)\
                 + '". Exception message: ' + str(ex) + '.'
        Log.critical(errmsg)
        raise Exception(errmsg) from ex
    finally:
        self.mutex_training.release()

    if self.do_profiling:
        Log.important(
            str(self.__class__) + str(getframeinfo(currentframe()).lineno)
            + ' PROFILING load_model_parameters_from_storage(): '
            + prf.Profiling.get_time_dif_str(prf_start, prf.Profiling.stop())
        )
def load_training_data_from_storage(self):
    """Read the x, x_name and y training data CSV files into ``self.training_data``.

    Each file is read with the ``INDEX`` column as index. The resulting
    ``TrainingDataModel`` is stored on ``self.training_data``.

    Raises:
        Exception: If reading the files or building the model fails; the
            failure is logged at critical level first.
    """
    self.initialize_training_data_paths()

    def read_indexed_csv(path):
        # All three files share the same layout options.
        return pd.read_csv(filepath_or_buffer=path, sep=',', index_col='INDEX')

    try:
        frame_x = read_indexed_csv(self.fpath_training_data_x)
        frame_x_name = read_indexed_csv(self.fpath_training_data_x_name)
        frame_y = read_indexed_csv(self.fpath_training_data_y)

        loaded = tdm.TrainingDataModel(
            x=np.array(frame_x.values),
            x_name=np.array(frame_x_name.values).transpose()[0],
            # y is the index remember, the column is y_name
            y=np.array(frame_y.index),
            y_name=np.array(frame_y.values).transpose()[0],
        )

        summary = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
            + ': Training Data x read ' + str(frame_x.shape) + ' shape'\
            + ', x_name read ' + str(frame_x_name.shape) + '\n\r' + str(loaded.get_x_name())\
            + ', y read ' + str(frame_y.shape) + '\n\r' + str(loaded.get_y())
        Log.important(summary)

        self.training_data = loaded
    except Exception as ex:
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                 + ': Load training data from file failed for identifier "' + self.identifier_string\
                 + '". Error msg "' + str(ex) + '".'
        Log.critical(errmsg)
        raise Exception(errmsg)
def safe_dataframe_write(df,
                         include_index,
                         index_label,
                         filepath,
                         name_df=None,
                         log_training=None):
    """Write a data frame to ``filepath`` via a verified temporary file.

    Steps:
      1. Write ``df`` as CSV to ``<filepath>.tmp``.
      2. Read the tmp file back and check the row count matches ``df``.
      3. If ``filepath`` already exists, rename it to the backup path given by
         ``ModelInterface.get_backup_filepath``.
      4. Rename the tmp file to ``filepath``.

    Args:
        df: Data frame to write.
        include_index: Passed to ``DataFrame.to_csv`` as ``index``.
        index_label: Index column label, used for both writing and read-back.
        filepath: Final destination path.
        name_df: Name of the data frame, used only in log messages.
        log_training: Optional list passed to ``Log`` calls as ``log_list``.

    Raises:
        Exception: If any of the write, read-back or rename steps fails.
    """
    DEFAULT_CSV_SEPARATOR = ','

    # Write to tmp file first
    filepath_tmp = str(filepath) + '.tmp'
    # We backup the previous model file just in case
    filepath_old = ModelInterface.get_backup_filepath(filepath=filepath)

    try:
        df.to_csv(path_or_buf=filepath_tmp,
                  index=include_index,
                  index_label=index_label,
                  sep=DEFAULT_CSV_SEPARATOR)
        Log.info(str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                 + ': TMP File: Saved "' + str(name_df)
                 + '" with shape ' + str(df.shape)
                 + ' filepath "' + str(filepath_tmp) + '"',
                 log_list=log_training)
    except Exception as ex:
        errmsg =\
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
            + ': TMP File: Could not create tmp "' + str(name_df)\
            + '" file "' + str(filepath_tmp) + '". ' + str(ex)
        Log.error(s=errmsg, log_list=log_training)
        raise Exception(errmsg) from ex

    # Now try to read it back
    try:
        # Just take rows
        nrows_original = df.shape[0]
        df_read_back = pd.read_csv(filepath_or_buffer=filepath_tmp,
                                   sep=DEFAULT_CSV_SEPARATOR,
                                   index_col=index_label)
        if df_read_back.shape[0] == nrows_original:
            Log.important(
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': TMP File: Successfully read back ' + str(df_read_back.shape[0])
                + ' rows of "' + str(name_df)
                + '" file "' + str(filepath_tmp))
        else:
            raise Exception('Original rows = ' + str(nrows_original)
                            + ' but read back ' + str(df_read_back.shape[0]) + '.')
    except Exception as ex:
        # Include the underlying reason (e.g. the row-count mismatch raised
        # just above), which the original message silently dropped.
        errmsg = \
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
            + ': TMP File: Could not read back "' + str(name_df)\
            + '" file "' + str(filepath_tmp)\
            + '". ' + str(ex)
        Log.critical(s=errmsg, log_list=log_training)
        raise Exception(errmsg) from ex

    time.sleep(0.2)

    # Finally rename the .tmp file
    try:
        # If old model file exists, backup the file
        if os.path.isfile(filepath):
            os.rename(src=filepath, dst=filepath_old)
            Log.important(str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                          + ': BACKUP File: Successfully backed up old model "' + str(name_df)
                          + '" to filepath "' + str(filepath_old) + '"',
                          log_list=log_training)
        os.rename(src=filepath_tmp, dst=filepath)
        Log.important(str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                      + ': REAL File: Successfully saved data frame "' + str(name_df)
                      + ' filepath "' + str(filepath) + '"',
                      log_list=log_training)
    except Exception as ex:
        errmsg =\
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
            + ': REAL File: For object "' + str(name_df)\
            + '" could not rename tmp file "' + str(filepath_tmp)\
            + '" to file "' + str(filepath)\
            + '". ' + str(ex)
        Log.error(s=errmsg, log_list=log_training)
        raise Exception(errmsg) from ex
def load_text_processor(self):
    """Create a ``TxtPreprocessor`` per language for the currently loaded model.

    Does nothing (beyond a warning) when the model's reload counter equals
    ``self.model_last_reloaded_counter``. Otherwise builds a ``LangDetect``
    and one ``TxtPreprocessor`` for ``self.lang_main`` and each entry of
    ``self.lang_additional``, then records the model's reload counter.

    Raises:
        Exception: Wraps (and chains) any exception raised during
            initialization, after logging it at critical level.
    """
    # Acquire before the try block so the finally clause never releases a
    # mutex that was not acquired.
    self.load_text_processor_mutex.acquire()
    try:
        # Don't allow to load again
        if self.model_last_reloaded_counter == self.model.get_model_reloaded_counter():
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string)
                + '" not reloading PredictClassTxtProcessor.')
            return

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model "' + str(self.model_name)
            + '" ready. Loading synonym & word lists..')

        self.lang_detect = LangDetect()

        self.predict_class_txt_processor = {}
        for uh in [self.lang_main] + self.lang_additional:
            try:
                model_features_list = self.model.get_model_features().tolist()
            except Exception as ex_feature_list:
                # Best effort: continue without a feature list.
                Log.error(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Model "' + str(self.model_name)
                    + '" identifier "' + str(self.identifier_string)
                    + '" model feature list empty!')
                model_features_list = None

            self.predict_class_txt_processor[uh] = TxtPreprocessor(
                identifier_string=self.identifier_string,
                dir_path_model=self.dir_path_model,
                model_features_list=model_features_list,
                lang=uh,
                dirpath_synonymlist=self.dirpath_synonymlist,
                postfix_synonymlist=self.postfix_synonymlist,
                dir_wordlist=self.dir_wordlist,
                postfix_wordlist=self.postfix_wordlist,
                dir_wordlist_app=self.dir_wordlist_app,
                postfix_wordlist_app=self.postfix_wordlist_app,
                # TODO For certain languages like English, it is essential to include this
                #   But at the same time must be very careful. By adding manual rules, for
                #   example we include words 'it', 'is'.. But "It is" could be a very valid
                #   training data that becomes excluded wrongly.
                stopwords_list=None,
                do_spelling_correction=self.do_spelling_correction,
                do_word_stemming=self.do_word_stemming,
                do_profiling=self.do_profiling)

        self.is_all_initializations_done = True
        # Manually update this model last reloaded counter
        self.model_last_reloaded_counter = self.model.get_model_reloaded_counter()
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model name "' + str(self.model_name)
            + '", identifier "' + str(self.identifier_string)
            + '" All initializations done for model "' + str(self.identifier_string)
            + '". Model Reload counter = ' + str(self.model_last_reloaded_counter))
    except Exception as ex:
        errmsg = \
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
            + ': Model name "' + str(self.model_name) \
            + '", identifier "' + str(self.identifier_string) \
            + '" Exception initializing synonym & word lists: ' + str(ex)
        Log.critical(errmsg)
        raise Exception(errmsg) from ex
    finally:
        self.load_text_processor_mutex.release()
def run_unit_tests(self):
    """Run the project unit test suites in order, stopping at the first failing suite.

    Returns:
        The ``uthelper.ResultObj`` accumulating the counts of all suites.

    Raises:
        Exception: As soon as any suite reports ``count_fail > 0``.
    """
    res_final = uthelper.ResultObj(count_ok=0, count_fail=0)

    def record(res, name, log_prefix=None):
        # Fail fast, then accumulate, then optionally log this suite's counts.
        if res.count_fail > 0:
            raise Exception(name + ' failed: ' + str(res.count_fail))
        res_final.update(other_res_obj=res)
        if log_prefix is not None:
            Log.critical(log_prefix + ' Unit Test PASSED ' + str(res.count_ok)
                         + ', FAILED ' + str(res.count_fail))

    record(UnitTestObjectPersistence(ut_params=None).run_unit_test(),
           'Object Persistence', '<<nwae.utils>> Object Persistence')

    # Per-suite logging for nwae.lang and nwae.math was disabled in the original:
    # Log.critical('Project <<nwae.lang>> Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))
    # Log.critical('Project <<nwae.math>> Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))
    record(NwaeLangUnitTest(ut_params=self.ut_params).run_unit_tests(), 'nwae.lang')
    record(NwaeMathUnitTest(ut_params=None).run_unit_tests(), 'nwae.math')

    record(TrDataPreprocessorUnitTest(ut_params=self.ut_params).run_unit_test(),
           'TD Data Preprocessor', '<<nwae.ml>> TD Data Preprocessor')

    record(TrainingDataModelUnitTest(ut_params=self.ut_params).run_unit_test(),
           'Training Data Model', '<<nwae.ml>> Training Data Model')

    record(
        UnitTestMetricSpaceModel(
            ut_params=self.ut_params,
            model_name=TextModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE
        ).run_unit_test(),
        'MetricSpaceModel', '<<nwae.ml>> MetricSpaceModel')

    record(PredictClassUnitTest(ut_params=self.ut_params).run_unit_test(),
           'PredictClass', '<<nwae.ml>> PredictClass')

    # NN Dense model test is disabled (kept for reference):
    # try:
    #     # Try to import some Keras module to see if available
    #     from keras.utils import to_categorical
    #     test_nn_dense = True
    # except Exception as ex_load:
    #     errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
    #              + ': Could not test NN Dense Model: ' + str(ex_load)
    #     Log.error(errmsg)
    #     test_nn_dense = False
    # if test_nn_dense:
    #     from nwae.ml.nndense.NnDenseModelUnitTest import NnDenseModelUnitTest
    #     res = NnDenseModelUnitTest(
    #         ut_params = self.ut_params
    #     ).run_unit_test()
    #     if res.count_fail > 0: raise Exception('NN Dense Model failed: ' + str(res.count_fail))
    #     res_final.update(other_res_obj=res)
    #     Log.critical('<<nwae.ml>> NN Dense Model Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))

    # The pass count was garbled in the source; it is reported from the
    # accumulated result, mirroring the fail count beside it.
    Log.critical(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': PROJECT <<nwae.ml>> TOTAL PASS = ' + str(res_final.count_ok)
        + ', TOTAL FAIL = ' + str(res_final.count_fail)
    )
    return res_final
def train(
        self,
        write_model_to_storage=True,
        write_training_data_to_storage=False,
        # Option to train a single y ID/label
        y_id=None,
        # To keep training logs here for caller's reference
        log_list_to_populate=None):
    """Train the metric space model from ``self.training_data``.

    While holding ``self.mutex_training`` the method:
      1. Chooses feature weights: EIDF read from storage (recalculated if
         reading fails), default IDF, or all ones when ``self.weigh_idf`` is
         false.
      2. Re-weighs the training data with those weights.
      3. Clusters the data per class via ``MetricSpaceModel.get_clusters``.
      4. Derives for every class a normalized reference feature vector (RFV,
         the mean of the class points) and the largest distance from the RFV
         to any class point.
      5. Optionally persists the model and the training data.

    Args:
        write_model_to_storage: Persist the model after training.
        write_training_data_to_storage: Persist the training data after
            training (also done when ``self.is_partial_training`` is true).
        y_id: Only written to the log by this method.
        log_list_to_populate: List that receives training log events; a fresh
            list is used when this is not exactly a ``list``.

    Returns:
        ``self.logs_training``.

    Raises:
        Exception: If there is no training data, or any training step fails
            (the original exception is logged and re-raised).
    """
    prf_start = prf.Profiling.start()

    if self.training_data is None:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Cannot train without training data for identifier "'
            + self.identifier_string + '"')

    self.mutex_training.acquire()
    try:
        if type(log_list_to_populate) is list:
            self.logs_training = log_list_to_populate
        else:
            self.logs_training = []

        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Training for identifier=' + self.identifier_string
            + ', y_id ' + str(y_id)
            + '. Using key features remove quartile = ' + str(self.key_features_remove_quartile)
            + ', stop features = [' + str(self.stop_features) + ']'
            + ', weigh by EIDF = ' + str(self.weigh_idf),
            log_list=self.logs_training)

        #
        # Here training data must be prepared in the correct format already.
        # This means the set of features has already been merged into one
        # (unified features).
        #
        # Log.debugdebug(
        #     str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        #     + '\n\r\tTraining data:\n\r' + str(self.training_data.get_x().tolist())
        #     + '\n\r\tx names: ' + str(self.training_data.get_x_name())
        #     + '\n\r\ty labels: ' + str(self.training_data.get_y())
        # )

        #
        # Get IDF first
        # The function of these weights are nothing more than dimension reduction
        # TODO: IDF may not be the ideal weights, design an optimal one.
        #
        if self.weigh_idf:
            if MetricSpaceModel.USE_OPIMIZED_IDF:
                try:
                    Log.info(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Initializing EIDF object.. try to read from file first',
                        log_list=self.logs_training)
                    # Try to read from file
                    df_eidf_file = eidf.Eidf.read_eidf_from_storage(
                        dir_path_model=self.dir_path_model,
                        identifier_string=self.identifier_string,
                        x_name=self.training_data.get_x_name())
                    Log.info(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Successfully Read EIDF from file.',
                        log_list=self.logs_training)
                    self.model_data.idf = np.array(
                        df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF])
                except Exception as ex_eidf:
                    # Deliberate fallback: recalculate when nothing usable is stored.
                    Log.critical(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': No EIDF from file available. Recalculating EIDF..',
                        log_list=self.logs_training)
                    idf_opt_obj = eidf.Eidf(
                        x=self.training_data.get_x(),
                        y=self.training_data.get_y(),
                        x_name=self.training_data.get_x_name())
                    idf_opt_obj.optimize(initial_w_as_standard_idf=True)
                    self.model_data.idf = idf_opt_obj.get_w()
            else:
                # Sum x by class
                self.model_data.idf = eidf.Eidf.get_feature_weight_idf_default(
                    x=self.training_data.get_x(),
                    y=self.training_data.get_y(),
                    x_name=self.training_data.get_x_name())
        else:
            self.model_data.idf = np.array(
                [1.0] * self.training_data.get_x_name().shape[0], dtype=float)

        # Standardize to at least 2-dimensional, easier when weighting x
        self.model_data.idf = npUtil.NumpyUtil.convert_dimension(
            arr=self.model_data.idf, to_dim=2)

        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + '\n\r\tEIDF values:\n\r' + str(self.model_data.idf),
            log_list=self.logs_training)

        #
        # Re-weigh again. This will change the x in self.training data
        #
        self.training_data.weigh_x(w=self.model_data.idf[0])

        #
        # Initialize model data
        #
        # Refetch again after weigh
        x = self.training_data.get_x()
        y = self.training_data.get_y()
        self.model_data.x_name = self.training_data.get_x_name()

        # Unique y or classes
        # We do this again because after weighing, it will remove bad rows, which might cause some y
        # to disappear
        self.model_data.y_unique = np.array(list(set(y)))

        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + '\n\r\tx weighted by idf and renormalized:\n\r' + str(x.tolist())
            + '\n\r\ty\n\r' + str(y)
            + '\n\r\tx_name\n\r' + str(self.model_data.x_name),
            log_list=self.logs_training)

        #
        # Get RFV for every command/intent, representative feature vectors by command type
        #

        # 1. Cluster training data of the same intent.
        #    Instead of a single RFV to represent a single intent, we should have multiple.
        xy_clstr = MetricSpaceModel.get_clusters(
            x=x,
            y=y,
            x_name=self.model_data.x_name,
            log_training=self.logs_training)
        self.model_data.x_clustered = xy_clstr.x_cluster
        self.model_data.y_clustered = xy_clstr.y_cluster
        self.model_data.y_clustered_radius = xy_clstr.y_cluster_radius

        #
        # RFV Derivation
        #
        m = np.zeros(
            (len(self.model_data.y_unique), len(self.model_data.x_name)))
        # Temporary only this data frame
        df_x_ref = pd.DataFrame(m,
                                columns=self.model_data.x_name,
                                index=list(self.model_data.y_unique))
        self.model_data.df_y_ref_radius = pd.DataFrame(
            {
                MetricSpaceModel.TERM_CLASS: list(self.model_data.y_unique),
                MetricSpaceModel.TERM_RADIUS:
                    [MetricSpaceModel.HPS_MAX_EUCL_DIST] * len(self.model_data.y_unique),
            },
            index=list(self.model_data.y_unique))

        #
        # Derive x_ref and y_ref
        #
        for cs in self.model_data.y_unique:
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Doing class [' + str(cs) + ']',
                log_list=self.logs_training)
            # Extract class points
            class_points = x[y == cs]
            #
            # Reference feature vector for the command is the average of all feature vectors
            #
            rfv = np.sum(class_points, axis=0) / class_points.shape[0]
            # Renormalize it again
            # At this point we don't have to check if it is a 0 vector, etc. as it was already done in TrainingDataModel
            # after weighing process
            normalize_factor = np.sum(np.multiply(rfv, rfv))**0.5
            if normalize_factor < const.Constants.SMALL_VALUE:
                raise Exception(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Normalize factor for rfv in class "' + str(cs) + '" is 0.')
            rfv = rfv / normalize_factor
            # A single array will be created as a column dataframe, thus we have to name the index and not columns
            df_x_ref.at[cs] = rfv

            check_normalized = np.sum(np.multiply(rfv, rfv))**0.5
            if abs(check_normalized - 1) > const.Constants.SMALL_VALUE:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                         + ': Warning! RFV for class [' + str(cs) + '] not 1, but [' + str(check_normalized) + '].'
                # Fixed: the original passed self.training_data as the log list.
                Log.warning(errmsg, log_list=self.logs_training)
                raise Exception(errmsg)
            else:
                Log.debug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Check RFV class "' + str(cs)
                    + '" normalized ok [' + str(check_normalized) + '].',
                    log_list=self.logs_training)

            #
            # Get furthest point of classification to rfv
            # This will be used to accept or reject a classified point to a particular class,
            # once the nearest class is found (in which no class is found then).
            #
            # Minimum value of threshold, don't allow 0's
            radius_max = -1
            for i in range(0, class_points.shape[0], 1):
                p = class_points[i]
                dist_vec = rfv - p
                dist = np.sum(np.multiply(dist_vec, dist_vec))**0.5
                Log.debugdebug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ' Class ' + str(cs) + ' check point ' + str(i)
                    + ', distance= ' + str(dist)
                    + '. Point ' + str(class_points[i])
                    + ' with RFV ' + str(rfv),
                    log_list=self.logs_training)
                if dist > radius_max:
                    radius_max = dist
                    self.model_data.df_y_ref_radius[
                        MetricSpaceModel.TERM_RADIUS].at[cs] = dist

            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Class "' + str(cs) + '". Max Radius = '
                + str(self.model_data.df_y_ref_radius[
                    MetricSpaceModel.TERM_RADIUS].loc[cs]),
                log_list=self.logs_training)

        df_x_ref.sort_index(inplace=True)
        self.model_data.y_ref = np.array(df_x_ref.index)
        self.model_data.x_ref = np.array(df_x_ref.values)
        Log.debug('**************** ' + str(self.model_data.y_ref))

        if self.do_profiling:
            Log.important(
                str(self.__class__) + str(getframeinfo(currentframe()).lineno)
                + ' PROFILING train(): '
                + prf.Profiling.get_time_dif_str(prf_start, prf.Profiling.stop()),
                log_list=self.logs_training)

        if write_model_to_storage:
            self.persist_model_to_storage()
        if write_training_data_to_storage or (self.is_partial_training):
            self.persist_training_data_to_storage(td=self.training_data)
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                 + ': Training exception for identifier "' + str(self.identifier_string) + '".'\
                 + ' Exception message ' + str(ex) + '.'
        Log.error(errmsg)
        # Bare raise re-raises the same exception with its traceback intact.
        raise
    finally:
        self.mutex_training.release()

    return self.logs_training
def calc_proximity_class_score_to_point(
        self,
        # ndarray type of >= 2 dimensions, with 1 row (or 1st dimension length == 1)
        # This distance metric must be normalized to [0,1] already
        x_distance,
        y_label,
        top=modelIf.ModelInterface.MATCH_TOP):
    """Turn normalized distances to labelled points into a ranked per-class score table.

    For every class the minimum distance is kept, converted to a score
    ``round(100 - 100 * distance, 1)`` and given a confidence level equal to
    the number of thresholds ``self.confidence_level_scores[1..5]`` the score
    reaches. Rows are sorted by score, descending, and cut to ``top`` rows.

    Args:
        x_distance: ``np.ndarray`` of distances. If it has more than one
            dimension, the first dimension must have length 1. Values must lie
            in [0, 1], within ``const.Constants.SMALL_VALUE``.
        y_label: Class label for each distance.
        top: Maximum number of rows to return.

    Returns:
        A ``pandas.DataFrame`` with class, distance, score and confidence
        columns and a 0..n-1 index.

    Raises:
        Exception: If ``x_distance`` is not an ndarray, has more than one
            row, or contains values outside [0, 1].
    """
    prf_start = prf.Profiling.start()

    if (type(x_distance) is not np.ndarray):
        # str() is required: concatenating the type object directly raised a
        # TypeError and hid this message.
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Wrong type "' + str(type(x_distance)) + '" to predict classes. Not ndarray.')

    if x_distance.ndim > 1:
        if x_distance.shape[0] != 1:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Expected x has only 1 row got c shape ' + str(x_distance.shape)
                + '". x = ' + str(x_distance))
        else:
            x_distance = x_distance[0]

    # Log.debugdebug('x_distance: ' + str(x_distance) + ', y_label ' + str(y_label))

    # Theoretical Inequality check
    check_less_than_max = np.sum(
        1 * (x_distance > 1 + const.Constants.SMALL_VALUE))
    check_greater_than_min = np.sum(
        1 * (x_distance < 0 - const.Constants.SMALL_VALUE))

    if (check_less_than_max > 0) or (check_greater_than_min > 0):
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Distance ' + str(x_distance) + ' fail theoretical inequality test.'
        Log.critical(errmsg)
        raise Exception(errmsg)

    df_score = pd.DataFrame({
        MetricSpaceModel.TERM_CLASS: y_label,
        MetricSpaceModel.TERM_DIST: x_distance,
    })

    # Aggregate class by min distance, don't make class index.
    # (axis=0 is the default and the keyword is deprecated in recent pandas.)
    df_score = df_score.groupby(by=[MetricSpaceModel.TERM_CLASS],
                                as_index=False).min()
    # Warning! Uncomment only when debugging, this statement printing numpy array takes up to 10ms on Mac Air
    # Log.debugdebug('DF SCORE 2:\n\r' + str(df_score))

    # Put score last (because we need to do groupby().min() above, which will screw up the values
    # as score is in the reverse order with distances) and sort scores
    np_distnorm = np.array(df_score[MetricSpaceModel.TERM_DIST])
    score_vec = np.round(100 - np_distnorm * 100, 1)
    df_score[MetricSpaceModel.TERM_SCORE] = score_vec

    # Maximum confidence level is 5, minimum 0
    score_confidence_level_vec = \
        (score_vec >= self.confidence_level_scores[1]) * 1 + \
        (score_vec >= self.confidence_level_scores[2]) * 1 + \
        (score_vec >= self.confidence_level_scores[3]) * 1 + \
        (score_vec >= self.confidence_level_scores[4]) * 1 + \
        (score_vec >= self.confidence_level_scores[5]) * 1
    df_score[MetricSpaceModel.TERM_CONFIDENCE] = score_confidence_level_vec

    # Finally sort by Score
    df_score.sort_values(by=[MetricSpaceModel.TERM_SCORE],
                         ascending=False,
                         inplace=True)

    # Make sure indexes are conventional 0,1,2,...
    df_score = df_score[0:min(top, df_score.shape[0])]
    df_score.reset_index(drop=True, inplace=True)

    # Warning! Uncomment only when debugging, this statement printing numpy array takes up to 10ms on Mac Air
    # Log.debugdebug('x_score:\n\r' + str(df_score))

    if self.do_profiling:
        prf_dur = prf.Profiling.get_time_dif(prf_start, prf.Profiling.stop())
        Log.important(
            str(self.__class__) + str(getframeinfo(currentframe()).lineno)
            + ' PROFILING calc_proximity_class_score_to_point(): '
            + str(round(1000 * prf_dur, 0)) + ' milliseconds.')

    return df_score
def run_unit_tests(self):
    """Run every nwae.math / nwae.utils unit test suite and log the results.

    Suites run in the listed order; all of them run even if one reports
    failures.

    Returns:
        The ``uthelper.ResultObj`` accumulating the counts of all suites.
    """
    res_final = uthelper.ResultObj(count_ok=0, count_fail=0)

    # (log label, unit test class) in execution order
    suites = (
        ('<<nwae.utils>> Encrypt', EncryptUnitTest),
        ('<<nwae.math>> Numpy Util', NumpyUtilUnittest),
        ('<<nwae.math>> Cluster', ClusterUnitTest),
        ('<<nwae.utils>> Hash', HashUnitTest),
        ('<<nwae.math>> Obfuscate', ObfuscateUnitTest),
        ('<<nwae.math>> Data Rank', RankUnitTest),
        ('<<nwae.math>> Data Grouping', GroupingUnitTest),
        ('<<nwae.math>> Center of Mass', CenterOfMassUnitTest),
        ('<<nwae.math>> Multi Tree', MultiTreeUnitTest),
        ('<<nwae.math>> Hidden Markov', HiddenMarkovUnitTest),
    )
    for label, suite_class in suites:
        res = suite_class(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical(label + ' Unit Test PASSED ' + str(res.count_ok)
                     + ', FAILED ' + str(res.count_fail))

    # The pass count was garbled in the source; it is reported from the
    # accumulated result, mirroring the fail count beside it.
    Log.critical('PROJECT <<nwae.math>> TOTAL PASS = ' + str(res_final.count_ok)
                 + ', TOTAL FAIL = ' + str(res_final.count_fail))
    return res_final
def run_unit_tests(self):
    """Run every nwae.lang unit test suite and log the results.

    Suites run in the listed order; all of them run even if one reports
    failures.

    Returns:
        The ``uthelper.ResultObj`` accumulating the counts of all suites.
    """
    res_final = uthelper.ResultObj(count_ok=0, count_fail=0)

    with_params = {'ut_params': self.ut_params}
    without_params = {'ut_params': None}
    # (log label, unit test class, constructor keyword arguments) in execution order
    suites = (
        ('Mex', UnitTestMex, {'config': None}),
        ('Language Features', LangFeaturesUnitTest, without_params),
        ('Language Characters', LangCharactersUnitTest, without_params),
        ('Language Detect', LangDetectUnitTest, without_params),
        ('Wordlist', WordlistUnitTest, with_params),
        ('Tokenizer', UnitTestWordSegmentation, with_params),
        ('Basic Preprocessor', BasicPreprocessorUnitTest, with_params),
        ('Preprocessor', UtTxtPreprocessor, with_params),
        ('Edit Distance (DLev, Lev)', EditDistanceUnitTest, with_params),
        ('TrieNode (Edit Distance)', TrieNodeUnitTest, with_params),
        ('Spell Check Word', SpellCheckWordUnitTest, with_params),
        ('Spell Check Sentence', SpellCheckSentenceUnitTest, with_params),
        ('Feature Vector', FeatureVectorUnitTest, with_params),
        ('Word Freq Doc Matrix', WordFreqDocMatrixUnitTest, with_params),
        ('Text Cluster', TxtClusterUnitTest, with_params),
    )
    for label, suite_class, kwargs in suites:
        # Instantiate lazily so each suite is only constructed when it runs.
        res = suite_class(**kwargs).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> ' + label + ' Unit Test PASSED ' + str(res.count_ok)
                     + ', FAILED ' + str(res.count_fail))

    # The pass count was garbled in the source; it is reported from the
    # accumulated result, mirroring the fail count beside it.
    Log.critical('PROJECT <<nwae.lang>> TOTAL PASS = ' + str(res_final.count_ok)
                 + ', TOTAL FAIL = ' + str(res_final.count_fail))
    return res_final