Example #1
    def exit_gracefully(self, signum, frame):
        Log.critical(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Signal ' +
            str(signum) + ' received.')
        self.exit_callback()
        return
Example #2
    def run(self):
        try:
            self.__mutex_training.acquire()
            self.bot_training_start_time = dt.datetime.now()
            self.log_training = []

            self.__pre_process_training_data()

            self.train()

            self.bot_training_end_time = dt.datetime.now()
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Training Identifier "' + str(self.identifier_string) + '" training exception: ' + str(ex) + '.'
            Log.critical(errmsg)
            raise Exception(errmsg)
        finally:
            self.is_training_done = True
            self.__mutex_training.release()

        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Train mode "' + str(self.train_mode)
            + '". Training Identifier "' + str(self.identifier_string) + '" trained successfully.'
        )
        return self.log_training
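
The mutex-guarded run() above follows the threading.Thread convention; a self-contained sketch of the same pattern (MiniTrainer is illustrative, not a class from the library):

    import datetime as dt
    import threading

    class MiniTrainer(threading.Thread):
        def __init__(self):
            super().__init__()
            self.__mutex_training = threading.Lock()
            self.is_training_done = False
            self.log_training = []

        def run(self):
            try:
                self.__mutex_training.acquire()
                # Training work would happen here
                self.log_training.append('started ' + str(dt.datetime.now()))
            finally:
                self.is_training_done = True
                self.__mutex_training.release()

    trainer = MiniTrainer()
    trainer.start()   # executes run() on a new thread
    trainer.join()
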
Example #3
    def run_unit_tests(self):
        res_final = uthelper.ResultObj(count_ok=0, count_fail=0)

        res = NwaeMlUnitTest(ut_params=self.ut_params).run_unit_tests()
        res_final.update(other_res_obj=res)
        Log.critical('Nwae ML Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        Log.critical('TOTAL PASS = ' + str(res_final.count_ok) + ', TOTAL FAIL = ' + str(res_final.count_fail))
        return res_final
Example #4
    def run(self):
        try:
            self.wait_for_model_to_be_ready(wait_max_time=30)
        except Exception as ex:
            errmsg =\
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                + ': Waited 30secs for model to be ready but failed! ' + str(ex)
            Log.critical(errmsg)
            raise Exception(errmsg)

        self.load_text_processor()
        return
Example #5
    def transform_input_for_model(
        self,
        # This should be a list of words as a sentence
        x_input,
        word_freq_model=FeatureVector.COL_FREQUENCY,
    ):
        #
        # This could be numbers, words, etc.
        #
        features_model = list(self.get_model_features())
        # Log.debug(
        #    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        #    + ': Predicting v = ' + str(v_feature_segmented)
        #    + ' using model features:\n\r' + str(features_model)
        # )

        #
        # Convert sentence to a mathematical object (feature vector)
        #
        model_fv = FeatureVector()
        model_fv.set_freq_feature_vector_template(list_symbols=features_model)

        # Get feature vector of text
        try:
            df_fv = model_fv.get_freq_feature_vector(text_list=x_input)
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Exception occurred calculating FV for "' + str(x_input) \
                     + '": Exception "' + str(ex) \
                     + '\n\rUsing FV Template:\n\r' + str(model_fv.get_fv_template()) \
                     + ', FV Weights:\n\r' + str(model_fv.get_fv_weights())
            Log.critical(errmsg)
            raise Exception(errmsg)

        # This creates a single row matrix that needs to be transposed before matrix multiplications
        # ndmin=2 will force numpy to create a 2D matrix instead of a 1D vector
        # For now we make it 1D first
        assert word_freq_model in df_fv.columns, '"' + str(
            word_freq_model) + '" must be in ' + str(df_fv.columns)
        fv_text_1d = np.array(df_fv[word_freq_model].values, ndmin=1)
        if fv_text_1d.ndim != 1:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Expected a 1D vector, got ' + str(fv_text_1d.ndim) + 'D!')
        Log.debugdebug(fv_text_1d)

        x_transformed = npUtil.NumpyUtil.convert_dimension(arr=fv_text_1d,
                                                           to_dim=2)
        return x_transformed
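
The transformation above reduces to: count each model feature's frequency in the input word list, then lift the 1D vector to a 2D row matrix. An illustrative numpy-only sketch (the feature list and sentence are made up; FeatureVector internals such as weighting are not reproduced):

    import numpy as np

    features_model = ['hello', 'world', 'bot']
    x_input = ['hello', 'hello', 'bot']
    # Frequency of each model feature in the input sentence
    fv_text_1d = np.array([x_input.count(w) for w in features_model], dtype=float)
    # Lift to 2D, as NumpyUtil.convert_dimension(arr, to_dim=2) does above
    x_transformed = fv_text_1d[np.newaxis, :]
    print(x_transformed)   # [[2. 0. 1.]]
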
Example #6
    def train_from_partial_models(
            self,
            write_model_to_storage=True,
            write_training_data_to_storage=False,
            # Log training events
            logs=None):
        #
        # Load EIDF first
        # TODO How to ensure there are no missing words?
        #
        x_name = self.training_data.get_x_name()
        try:
            if type(logs) is list:
                self.logs_training = logs
            else:
                self.logs_training = []

            Log.info(str(self.__class__) + ' ' +
                     str(getframeinfo(currentframe()).lineno) +
                     ': Initializing IDF object.. try to read from file first',
                     log_list=self.logs_training)
            # Try to read from file
            df_eidf_file = eidf.Eidf.read_eidf_from_storage(
                dir_path_model=self.dir_path_model,
                identifier_string=self.identifier_string,
                x_name=x_name,
                log_training=self.logs_training)
            Log.debug(str(self.__class__) + ' ' +
                      str(getframeinfo(currentframe()).lineno) +
                      ': Successfully Read EIDF from file',
                      log_list=self.logs_training)
            self.model_data.idf = np.array(
                df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF])
        except Exception as ex_eidf:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': No EIDF from file available. Exception ' + str(ex_eidf)
            Log.critical(errmsg, log_list=self.logs_training)
            raise Exception(errmsg)

        # Standardize to at least 2-dimensional, easier when weighting x
        self.model_data.idf = npUtil.NumpyUtil.convert_dimension(
            arr=self.model_data.idf, to_dim=2)

        #
        # Combines
        #
        self.model_data.load_model_from_partial_trainings_data(
            td_latest=self.training_data, log_training=self.logs_training)
        return self.logs_training
Example #7
    def __return_array_words_as_string(self, array_words):
        a = prf.Profiling.start()
        print_separator = BasicPreprocessor.get_word_separator(
            lang = self.lang
        )

        s = print_separator.join(array_words)

        if self.do_profiling:
            b = prf.Profiling.stop()
            Log.critical(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ':      PROFILING Segment Words for [' + str(array_words) + '] to [' + s
                + '] took ' + prf.Profiling.get_time_dif_str(start=a, stop=b)
            )
        return s
Example #8
    def load_model_parameters(self):
        prf_start = prf.Profiling.start()

        try:
            self.mutex_training.acquire()
            self.model_data.load_model_parameters_from_storage()
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Failed to load model data for identifier "' + self.identifier_string\
                     + '". Exception message: ' + str(ex) + '.'
            Log.critical(errmsg)
            raise Exception(errmsg)
        finally:
            self.mutex_training.release()

        if self.do_profiling:
            Log.important(
                str(self.__class__) +
                str(getframeinfo(currentframe()).lineno) +
                ' PROFILING load_model_parameters_from_storage(): ' +
                prf.Profiling.get_time_dif_str(prf_start, prf.Profiling.stop())
            )
        return
Example #9
    def load_training_data_from_storage(self):
        self.initialize_training_data_paths()
        try:
            df_td_x = pd.read_csv(
                filepath_or_buffer=self.fpath_training_data_x,
                sep=',',
                index_col='INDEX')
            df_td_x_name = pd.read_csv(
                filepath_or_buffer=self.fpath_training_data_x_name,
                sep=',',
                index_col='INDEX')
            df_td_y = pd.read_csv(
                filepath_or_buffer=self.fpath_training_data_y,
                sep=',',
                index_col='INDEX')

            td = tdm.TrainingDataModel(
                x=np.array(df_td_x.values),
                x_name=np.array(df_td_x_name.values).transpose()[0],
                # y is the index remember, the column is y_name
                y=np.array(df_td_y.index),
                y_name=np.array(df_td_y.values).transpose()[0],
            )
            Log.important(
                str(__name__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Training Data x read ' + str(df_td_x.shape) + ' shape' +
                ', x_name read ' + str(df_td_x_name.shape) + '\n\r' +
                str(td.get_x_name()) + ', y read ' + str(df_td_y.shape) +
                '\n\r' + str(td.get_y()))
            self.training_data = td
        except Exception as ex:
            errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Load training data from file failed for identifier "' + self.identifier_string\
                     + '". Error msg "' + str(ex) + '".'
            Log.critical(errmsg)
            raise Exception(errmsg)
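
A minimal round trip showing the CSV layout this loader expects, with an 'INDEX' column and the transpose()[0] idiom for single-column frames (file name and values are illustrative):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'x_name': ['hello', 'world']})
    df.to_csv('x_name.csv', index=True, index_label='INDEX')
    df_back = pd.read_csv('x_name.csv', sep=',', index_col='INDEX')
    # Single-column frame -> 1D array of its values
    assert np.array(df_back.values).transpose()[0].tolist() == ['hello', 'world']
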
Example #10
    def safe_dataframe_write(df,
                             include_index,
                             index_label,
                             filepath,
                             name_df=None,
                             log_training=None):
        DEFAULT_CSV_SEPARATOR = ','
        #
        # Write to tmp file first
        #
        filepath_tmp = str(filepath) + '.tmp'
        # We backup the previous model file just in case
        filepath_old = ModelInterface.get_backup_filepath(filepath=filepath)

        try:
            df.to_csv(path_or_buf=filepath_tmp,
                      index=include_index,
                      index_label=index_label,
                      sep=DEFAULT_CSV_SEPARATOR)
            Log.info(str(__name__) + ' ' +
                     str(getframeinfo(currentframe()).lineno) +
                     ': TMP File: Saved "' + str(name_df) + '" with shape ' +
                     str(df.shape) + ' filepath "' + str(filepath_tmp) + '"',
                     log_list=log_training)
        except Exception as ex:
            errmsg =\
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                + ': TMP File: Could not create tmp "' + str(name_df)\
                + '" file "' + str(filepath_tmp) + '". ' + str(ex)
            Log.error(s=errmsg, log_list=log_training)
            raise Exception(errmsg)

        #
        # Now try to read it back
        #
        try:
            # Just take rows
            nrows_original = df.shape[0]
            df_read_back = pd.read_csv(filepath_or_buffer=filepath_tmp,
                                       sep=DEFAULT_CSV_SEPARATOR,
                                       index_col=index_label)
            if df_read_back.shape[0] == nrows_original:
                Log.important(
                    str(__name__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': TMP File: Successfully read back ' +
                    str(df_read_back.shape[0]) + ' rows of "' + str(name_df) +
                    '" file "' + str(filepath_tmp))
            else:
                raise Exception('Original rows = ' + str(nrows_original) +
                                ' but read back ' +
                                str(df_read_back.shape[0]) + '.')
        except Exception as ex:
            errmsg = \
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                + ': TMP File: Could not read back "' + str(name_df)\
                + '" file "' + str(filepath_tmp) + '". ' + str(ex)
            Log.critical(s=errmsg, log_list=log_training)
            raise Exception(errmsg)

        time.sleep(0.2)
        #
        # Finally rename the .tmp file
        #
        try:
            # If old model file exists, backup the file
            if os.path.isfile(filepath):
                os.rename(src=filepath, dst=filepath_old)
            Log.important(str(__name__) + ' ' +
                          str(getframeinfo(currentframe()).lineno) +
                          ': BACKUP File: Successfully backed up old model "' +
                          str(name_df) + '" to filepath "' +
                          str(filepath_old) + '"',
                          log_list=log_training)
            os.rename(src=filepath_tmp, dst=filepath)
            Log.important(str(__name__) + ' ' +
                          str(getframeinfo(currentframe()).lineno) +
                          ': REAL File: Successfully saved data frame "' +
                          str(name_df) + ' filepath "' + str(filepath) + '"',
                          log_list=log_training)
        except Exception as ex:
            errmsg =\
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                + ': REAL File: For object "' + str(name_df)\
                + '" could not rename tmp file "' + str(filepath_tmp)\
                + '" to file "' + str(filepath)\
                + '". ' + str(ex)
            Log.error(s=errmsg, log_list=log_training)
            raise Exception(errmsg)
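
Stripped of the library's Log and ModelInterface helpers, the safety recipe above is: write to a .tmp file, read it back to verify the row count, back up the old file, then rename the .tmp file into place. A compact sketch of that recipe (sketch_safe_write is illustrative, not a library function):

    import os

    import pandas as pd

    def sketch_safe_write(df, filepath, index_label='INDEX'):
        tmp = filepath + '.tmp'
        # 1. Write to the tmp file first
        df.to_csv(tmp, index=True, index_label=index_label)
        # 2. Read back and verify nothing was lost
        if pd.read_csv(tmp, index_col=index_label).shape[0] != df.shape[0]:
            raise Exception('Read-back row count mismatch for ' + tmp)
        # 3. Back up any previous file, then swap the tmp file in
        if os.path.isfile(filepath):
            os.rename(filepath, filepath + '.old')
        os.rename(tmp, filepath)
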
Example #11
    def load_text_processor(self):
        try:
            self.load_text_processor_mutex.acquire()
            # Don't allow to load again
            if self.model_last_reloaded_counter == self.model.get_model_reloaded_counter():
                Log.warning(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': Model "' +
                    str(self.identifier_string) +
                    '" not reloading PredictClassTxtProcessor.')
                return

            Log.info(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Model "' +
                str(self.model_name) +
                '" ready. Loading synonym & word lists..')

            self.lang_detect = LangDetect()

            self.predict_class_txt_processor = {}
            for uh in [self.lang_main] + self.lang_additional:
                try:
                    model_features_list = self.model.get_model_features().tolist()
                except Exception as ex_feature_list:
                    Log.error(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) +
                        ': Model "' + str(self.model_name) + '" identifier "' +
                        str(self.identifier_string) +
                        '" model feature list empty! ' + str(ex_feature_list))
                    model_features_list = None

                self.predict_class_txt_processor[uh] = TxtPreprocessor(
                    identifier_string=self.identifier_string,
                    dir_path_model=self.dir_path_model,
                    model_features_list=model_features_list,
                    lang=uh,
                    dirpath_synonymlist=self.dirpath_synonymlist,
                    postfix_synonymlist=self.postfix_synonymlist,
                    dir_wordlist=self.dir_wordlist,
                    postfix_wordlist=self.postfix_wordlist,
                    dir_wordlist_app=self.dir_wordlist_app,
                    postfix_wordlist_app=self.postfix_wordlist_app,
                    # TODO For certain languages like English, it is essential to include this
                    #   But at the same time must be very careful. By adding manual rules, for
                    #   example we include words 'it', 'is'.. But "It is" could be a very valid
                    #   training data that becomes excluded wrongly.
                    stopwords_list=None,
                    do_spelling_correction=self.do_spelling_correction,
                    do_word_stemming=self.do_word_stemming,
                    do_profiling=self.do_profiling)

            self.is_all_initializations_done = True
            # Manually update this model last reloaded counter
            self.model_last_reloaded_counter = self.model.get_model_reloaded_counter()
            Log.important(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Model name "' +
                str(self.model_name) + '", identifier "' +
                str(self.identifier_string) +
                '" All initializations done for model "' +
                str(self.identifier_string) + '". Model Reload counter = ' +
                str(self.model_last_reloaded_counter))
        except Exception as ex:
            errmsg = \
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                + ': Model name "' + str(self.model_name) \
                + '", identifier "' + str(self.identifier_string) \
                + '" Exception initializing synonym & word lists: ' + str(ex)
            Log.critical(errmsg)
            raise Exception(errmsg)
        finally:
            self.load_text_processor_mutex.release()
Example #12
    def run_unit_tests(self):
        res_final = uthelper.ResultObj(count_ok=0, count_fail=0)

        res = UnitTestObjectPersistence(ut_params=None).run_unit_test()
        if res.count_fail > 0: raise Exception('Object Persistence failed: ' + str(res.count_fail))
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.utils>> Object Persistence Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = NwaeLangUnitTest(ut_params=self.ut_params).run_unit_tests()
        if res.count_fail > 0: raise Exception('nwae.lang failed: ' + str(res.count_fail))
        res_final.update(other_res_obj=res)
        # Log.critical('Project <<nwae.lang>> Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = NwaeMathUnitTest(ut_params=None).run_unit_tests()
        if res.count_fail > 0: raise Exception('nwae.math failed: ' + str(res.count_fail))
        res_final.update(other_res_obj=res)
        # Log.critical('Project <<nwae.math>>  Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = TrDataPreprocessorUnitTest(ut_params=self.ut_params).run_unit_test()
        if res.count_fail > 0: raise Exception('TD Data Preprocessor failed: ' + str(res.count_fail))
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.ml>> TD Data Preprocessor Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = TrainingDataModelUnitTest(ut_params=self.ut_params).run_unit_test()
        if res.count_fail > 0: raise Exception('Training Data Model failed: ' + str(res.count_fail))
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.ml>> Training Data Model Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = UnitTestMetricSpaceModel(
            ut_params  = self.ut_params,
            model_name = TextModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE
        ).run_unit_test()
        if res.count_fail > 0: raise Exception('MetricSpaceModel failed: ' + str(res.count_fail))
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.ml>> MetricSpaceModel Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = PredictClassUnitTest(ut_params=self.ut_params).run_unit_test()
        if res.count_fail > 0: raise Exception('PredictClass failed: ' + str(res.count_fail))
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.ml>> PredictClass Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        # try:
        #     # Try to import some Keras module to see if available
        #     from keras.utils import to_categorical
        #     test_nn_dense = True
        # except Exception as ex_load:
        #     errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
        #              + ': Could not test NN Dense Model: ' + str(ex_load)
        #     Log.error(errmsg)
        #     test_nn_dense = False
        # if test_nn_dense:
        #     from nwae.ml.nndense.NnDenseModelUnitTest import NnDenseModelUnitTest
        #     res = NnDenseModelUnitTest(
        #         ut_params = self.ut_params
        #     ).run_unit_test()
        #     if res.count_fail > 0: raise Exception('NN Dense Model failed: ' + str(res.count_fail))
        #     res_final.update(other_res_obj=res)
        #     Log.critical('<<nwae.ml>> NN Dense Model Unit Test PASSED ' + str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        Log.critical(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': PROJECT <<nwae.ml>> TOTAL PASS = ' + str(res_final.count_ok) + ', TOTAL FAIL = ' + str(res_final.count_fail)
        )
        return res_final
Example #13
    def train(
        self,
        write_model_to_storage=True,
        write_training_data_to_storage=False,
        # Option to train a single y ID/label
        y_id=None,
        # To keep training logs here for caller's reference
        log_list_to_populate=None):
        prf_start = prf.Profiling.start()

        if self.training_data is None:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Cannot train without training data for identifier "' +
                self.identifier_string + '"')

        self.mutex_training.acquire()
        try:
            if type(log_list_to_populate) is list:
                self.logs_training = log_list_to_populate
            else:
                self.logs_training = []

            Log.important(str(self.__class__) + ' ' +
                          str(getframeinfo(currentframe()).lineno) +
                          ': Training for identifier=' +
                          self.identifier_string + ', y_id ' + str(y_id) +
                          '. Using key features remove quartile = ' +
                          str(self.key_features_remove_quartile) +
                          ', stop features = [' + str(self.stop_features) +
                          ']' + ', weigh by EIDF = ' + str(self.weigh_idf),
                          log_list=self.logs_training)

            #
            # Here training data must be prepared in the correct format already
            # Meaning the set of features has already been unified into one (unified features)
            #
            # Log.debugdebug(
            #     str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            #     + '\n\r\tTraining data:\n\r' + str(self.training_data.get_x().tolist())
            #     + '\n\r\tx names: ' + str(self.training_data.get_x_name())
            #     + '\n\r\ty labels: ' + str(self.training_data.get_y())
            # )

            #
            # Get IDF first
            # The function of these weights is nothing more than dimension reduction
            # TODO: IDF may not be the ideal weights, design an optimal one.
            #
            if self.weigh_idf:
                if MetricSpaceModel.USE_OPIMIZED_IDF:
                    try:
                        Log.info(
                            str(self.__class__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': Initializing EIDF object.. try to read from file first',
                            log_list=self.logs_training)
                        # Try to read from file
                        df_eidf_file = eidf.Eidf.read_eidf_from_storage(
                            dir_path_model=self.dir_path_model,
                            identifier_string=self.identifier_string,
                            x_name=self.training_data.get_x_name())
                        Log.info(str(self.__class__) + ' ' +
                                 str(getframeinfo(currentframe()).lineno) +
                                 ': Successfully Read EIDF from file.',
                                 log_list=self.logs_training)
                        self.model_data.idf = np.array(
                            df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF])
                    except Exception as ex_eidf:
                        Log.critical(
                            str(self.__class__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': No EIDF from file available. Recalculating EIDF..',
                            log_list=self.logs_training)
                        idf_opt_obj = eidf.Eidf(
                            x=self.training_data.get_x(),
                            y=self.training_data.get_y(),
                            x_name=self.training_data.get_x_name())
                        idf_opt_obj.optimize(initial_w_as_standard_idf=True)
                        self.model_data.idf = idf_opt_obj.get_w()
                else:
                    # Sum x by class
                    self.model_data.idf = eidf.Eidf.get_feature_weight_idf_default(
                        x=self.training_data.get_x(),
                        y=self.training_data.get_y(),
                        x_name=self.training_data.get_x_name())
            else:
                self.model_data.idf = np.array(
                    [1.0] * self.training_data.get_x_name().shape[0],
                    dtype=float)

            # Standardize to at least 2-dimensional, easier when weighting x
            self.model_data.idf = npUtil.NumpyUtil.convert_dimension(
                arr=self.model_data.idf, to_dim=2)

            Log.debugdebug(str(self.__class__) + ' ' +
                           str(getframeinfo(currentframe()).lineno) +
                           '\n\r\tEIDF values:\n\r' + str(self.model_data.idf),
                           log_list=self.logs_training)

            #
            # Re-weigh again. This will change the x in self.training_data
            #
            self.training_data.weigh_x(w=self.model_data.idf[0])

            #
            # Initialize model data
            #
            # Refetch again after weigh
            x = self.training_data.get_x()
            y = self.training_data.get_y()
            self.model_data.x_name = self.training_data.get_x_name()

            # Unique y or classes
            # We do this again because weighing removes bad rows, which might cause some y
            # to disappear
            self.model_data.y_unique = np.array(list(set(y)))

            Log.debugdebug(str(self.__class__) + ' ' +
                           str(getframeinfo(currentframe()).lineno) +
                           '\n\r\tx weighted by idf and renormalized:\n\r' +
                           str(x.tolist()) + '\n\r\ty\n\r' + str(y) +
                           '\n\r\tx_name\n\r' + str(self.model_data.x_name),
                           log_list=self.logs_training)

            #
            # Get RFV for every command/intent, representative feature vectors by command type
            #

            # 1. Cluster training data of the same intent.
            #    Instead of a single RFV to represent a single intent, we should have multiple.
            xy_clstr = MetricSpaceModel.get_clusters(
                x=x,
                y=y,
                x_name=self.model_data.x_name,
                log_training=self.logs_training)
            self.model_data.x_clustered = xy_clstr.x_cluster
            self.model_data.y_clustered = xy_clstr.y_cluster
            self.model_data.y_clustered_radius = xy_clstr.y_cluster_radius

            #
            # RFV Derivation
            #
            m = np.zeros(
                (len(self.model_data.y_unique), len(self.model_data.x_name)))
            # Temporary only this data frame
            df_x_ref = pd.DataFrame(m,
                                    columns=self.model_data.x_name,
                                    index=list(self.model_data.y_unique))
            #print('***** y unique type: ' + str(type(self.model_data.y_unique)) + ', df_x_ref: '  + str(df_x_ref))
            self.model_data.df_y_ref_radius = pd.DataFrame(
                {
                    MetricSpaceModel.TERM_CLASS:
                    list(self.model_data.y_unique),
                    MetricSpaceModel.TERM_RADIUS:
                    [MetricSpaceModel.HPS_MAX_EUCL_DIST] *
                    len(self.model_data.y_unique),
                },
                index=list(self.model_data.y_unique))
            #print('***** df_x_ref: '  + str(self.model_data.df_y_ref_radius))

            #
            # Derive x_ref and y_ref
            #
            for cs in self.model_data.y_unique:
                Log.debug(str(self.__class__) + ' ' +
                          str(getframeinfo(currentframe()).lineno) +
                          ': Doing class [' + str(cs) + ']',
                          log_list=self.logs_training)
                # Extract class points
                class_points = x[y == cs]
                #
                # Reference feature vector for the command is the average of all feature vectors
                #
                rfv = np.sum(class_points, axis=0) / class_points.shape[0]
                # Renormalize it again
                # At this point we don't have to check if it is a 0 vector, etc. as it was already done in TrainingDataModel
                # after weighing process
                normalize_factor = np.sum(np.multiply(rfv, rfv))**0.5
                if normalize_factor < const.Constants.SMALL_VALUE:
                    raise Exception(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) +
                        ': Normalize factor for rfv in class "' + str(cs) +
                        '" is 0.')
                rfv = rfv / normalize_factor
                # A single array will be created as a column dataframe, thus we have to name the index and not columns
                df_x_ref.at[cs] = rfv

                check_normalized = np.sum(np.multiply(rfv, rfv))**0.5
                if abs(check_normalized - 1) > const.Constants.SMALL_VALUE:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                             + ': Warning! RFV for class [' + str(cs) + '] not 1, but [' + str(check_normalized) + '].'
                    Log.warning(errmsg, log_list=self.logs_training)
                    raise Exception(errmsg)
                else:
                    Log.debug(str(self.__class__) + ' ' +
                              str(getframeinfo(currentframe()).lineno) +
                              ': Check RFV class "' + str(cs) +
                              '" normalized ok [' + str(check_normalized) +
                              '].',
                              log_list=self.logs_training)

                #
                # Get furthest point of classification to rfv
                # This will be used to accept or reject a classified point for a particular class,
                # once the nearest class is found (if rejected, then no class is found at all).
                #
                # Minimum value of threshold, don't allow 0's
                radius_max = -1
                for i in range(0, class_points.shape[0], 1):
                    p = class_points[i]
                    dist_vec = rfv - p
                    dist = np.sum(np.multiply(dist_vec, dist_vec))**0.5
                    Log.debugdebug(str(self.__class__) + ' ' +
                                   str(getframeinfo(currentframe()).lineno) +
                                   '   Class ' + str(cs) + ' check point ' +
                                   str(i) + ', distance= ' + str(dist) +
                                   '. Point ' + str(class_points[i]) +
                                   ' with RFV ' + str(rfv),
                                   log_list=self.logs_training)
                    if dist > radius_max:
                        radius_max = dist
                        self.model_data.df_y_ref_radius[
                            MetricSpaceModel.TERM_RADIUS].at[cs] = dist

                Log.debug(str(self.__class__) + ' ' +
                          str(getframeinfo(currentframe()).lineno) +
                          ': Class "' + str(cs) + '". Max Radius = ' +
                          str(self.model_data.df_y_ref_radius[
                              MetricSpaceModel.TERM_RADIUS].loc[cs]),
                          log_list=self.logs_training)
            df_x_ref.sort_index(inplace=True)
            self.model_data.y_ref = np.array(df_x_ref.index)
            self.model_data.x_ref = np.array(df_x_ref.values)
            Log.debug('**************** ' + str(self.model_data.y_ref))

            if self.do_profiling:
                Log.important(str(self.__class__) +
                              str(getframeinfo(currentframe()).lineno) +
                              ' PROFILING train(): ' +
                              prf.Profiling.get_time_dif_str(
                                  prf_start, prf.Profiling.stop()),
                              log_list=self.logs_training)

            if write_model_to_storage:
                self.persist_model_to_storage()
            if write_training_data_to_storage or (self.is_partial_training):
                self.persist_training_data_to_storage(td=self.training_data)
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Training exception for identifier "' + str(self.identifier_string) + '".'\
                     + ' Exception message ' + str(ex) + '.'
            Log.error(errmsg)
            raise ex
        finally:
            self.mutex_training.release()

        return self.logs_training
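
The RFV math inside the loop above, in isolation: the reference feature vector of a class is the mean of its points renormalized to unit length, and the class radius is the furthest member's Euclidean distance to that RFV. A self-contained numeric sketch (the two points are made up):

    import numpy as np

    class_points = np.array([[1.0, 0.0], [0.6, 0.8]])
    rfv = np.sum(class_points, axis=0) / class_points.shape[0]
    rfv = rfv / np.sum(np.multiply(rfv, rfv)) ** 0.5      # renormalize
    dists = np.sum((class_points - rfv) ** 2, axis=1) ** 0.5
    radius_max = dists.max()                              # class radius
    print(rfv, radius_max)
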
Example #14
    def calc_proximity_class_score_to_point(
            self,
            # ndarray type of >= 2 dimensions, with 1 row (or 1st dimension length == 1)
            # This distance metric must be normalized to [0,1] already
            x_distance,
            y_label,
            top=modelIf.ModelInterface.MATCH_TOP):
        prf_start = prf.Profiling.start()

        if type(x_distance) is not np.ndarray:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Wrong type "' +
                str(type(x_distance)) + '" to predict classes. Not ndarray.')

        if x_distance.ndim > 1:
            if x_distance.shape[0] != 1:
                raise Exception(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Expected x to have only 1 row, got shape ' +
                    str(x_distance.shape) + '". x = ' + str(x_distance))
            else:
                x_distance = x_distance[0]

        # Log.debugdebug('x_distance: ' + str(x_distance) + ', y_label ' + str(y_label))

        # Theoretical Inequality check
        check_less_than_max = np.sum(
            1 * (x_distance > 1 + const.Constants.SMALL_VALUE))
        check_greater_than_min = np.sum(
            1 * (x_distance < 0 - const.Constants.SMALL_VALUE))

        if (check_less_than_max > 0) or (check_greater_than_min > 0):
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Distance ' + str(x_distance) + ' fail theoretical inequality test.'
            Log.critical(errmsg)
            raise Exception(errmsg)

        # x_score = np.round(100 - x_distance_norm*100, 1)

        df_score = pd.DataFrame({
            MetricSpaceModel.TERM_CLASS:
            y_label,
            # MetricSpaceModel.TERM_SCORE: x_score,
            MetricSpaceModel.TERM_DIST:
            x_distance,
        })
        # Sort distances
        # df_score.sort_values(by=[MetricSpaceModel.TERM_DIST], ascending=True, inplace=True)
        # df_score = df_score[0:top]
        # df_score.reset_index(drop=True, inplace=True)
        # Log.debugdebug('DF SCORE 1:\n\r' + str(df_score))

        # Aggregate class by min distance, don't make class index.
        df_score = df_score.groupby(by=[MetricSpaceModel.TERM_CLASS],
                                    as_index=False,
                                    axis=0).min()
        # Warning! Uncomment only when debugging, this statement printing numpy array takes up to 10ms on Mac Air
        # Log.debugdebug('DF SCORE 2:\n\r' + str(df_score))

        # Put score last (because we need to do groupby().min() above, which will screw up the values
        # as score is in the reverse order with distances) and sort scores
        np_distnorm = np.array(df_score[MetricSpaceModel.TERM_DIST])
        score_vec = np.round(100 - np_distnorm * 100, 1)
        df_score[MetricSpaceModel.TERM_SCORE] = score_vec
        # Maximum confidence level is 5, minimum 0
        score_confidence_level_vec = \
            (score_vec >= self.confidence_level_scores[1]) * 1 + \
            (score_vec >= self.confidence_level_scores[2]) * 1 + \
            (score_vec >= self.confidence_level_scores[3]) * 1 + \
            (score_vec >= self.confidence_level_scores[4]) * 1 + \
            (score_vec >= self.confidence_level_scores[5]) * 1
        df_score[MetricSpaceModel.TERM_CONFIDENCE] = score_confidence_level_vec

        # Finally sort by Score
        df_score.sort_values(by=[MetricSpaceModel.TERM_SCORE],
                             ascending=False,
                             inplace=True)

        # Make sure indexes are conventional 0,1,2,...
        df_score = df_score[0:min(top, df_score.shape[0])]
        df_score.reset_index(drop=True, inplace=True)

        # Warning! Uncomment only when debugging, this statement printing numpy array takes up to 10ms on Mac Air
        #Log.debugdebug('x_score:\n\r' + str(df_score))

        if self.do_profiling:
            prf_dur = prf.Profiling.get_time_dif(prf_start,
                                                 prf.Profiling.stop())
            Log.important(
                str(self.__class__) +
                str(getframeinfo(currentframe()).lineno) +
                ' PROFILING calc_proximity_class_score_to_point(): ' +
                str(round(1000 * prf_dur, 0)) + ' milliseconds.')

        return df_score
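
In isolation, the scoring above maps a normalized distance d in [0, 1] to a score of round(100 - 100*d, 1), and the confidence level counts how many of five score thresholds are cleared. A numeric sketch with made-up thresholds (the library's actual confidence_level_scores values are not shown here):

    import numpy as np

    confidence_level_scores = {1: 50, 2: 60, 3: 70, 4: 80, 5: 90}
    x_distance = np.array([0.05, 0.35, 0.80])
    score_vec = np.round(100 - x_distance * 100, 1)
    # Count thresholds cleared: maximum confidence level is 5, minimum 0
    confidence_vec = sum(
        (score_vec >= confidence_level_scores[k]) * 1 for k in range(1, 6))
    print(score_vec)        # [95. 65. 20.]
    print(confidence_vec)   # [5 2 0]
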
Example #15
    def run_unit_tests(self):
        res_final = uthelper.ResultObj(count_ok=0, count_fail=0)

        res = EncryptUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.utils>> Encrypt Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = NumpyUtilUnittest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.math>> Numpy Util Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = ClusterUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.math>> Cluster Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = HashUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.utils>> Hash Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = ObfuscateUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.math>> Obfuscate Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = RankUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.math>> Data Rank Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = GroupingUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.math>> Data Grouping Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = CenterOfMassUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.math>> Center of Mass Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = MultiTreeUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.math>> Multi Tree Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = HiddenMarkovUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.math>> Hidden Markov Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        Log.critical('PROJECT <<nwae.math>> TOTAL PASS = ' + str(res_final.count_ok) + ', TOTAL FAIL = ' +
                     str(res_final.count_fail))
        return res_final
Beispiel #16
0
    def run_unit_tests(self):
        res_final = uthelper.ResultObj(count_ok=0, count_fail=0)

        res = UnitTestMex(config=None).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Mex Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = LangFeaturesUnitTest(ut_params=None).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Language Features Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = LangCharactersUnitTest(ut_params=None).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Language Characters Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = LangDetectUnitTest(ut_params=None).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Language Detect Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = WordlistUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Wordlist Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = UnitTestWordSegmentation(
            ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Tokenizer Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = BasicPreprocessorUnitTest(
            ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Basic Preprocessor Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = UtTxtPreprocessor(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Preprocessor Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = EditDistanceUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical(
            '<<nwae.lang>> Edit Distance (DLev, Lev) Unit Test PASSED ' +
            str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = TrieNodeUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical(
            '<<nwae.lang>> TrieNode (Edit Distance) Unit Test PASSED ' +
            str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = SpellCheckWordUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Spell Check Word Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = SpellCheckSentenceUnitTest(
            ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Spell Check Sentence Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = FeatureVectorUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Feature Vector Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = WordFreqDocMatrixUnitTest(
            ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Word Freq Doc Matrix Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        res = TxtClusterUnitTest(ut_params=self.ut_params).run_unit_test()
        res_final.update(other_res_obj=res)
        Log.critical('<<nwae.lang>> Text Cluster Unit Test PASSED ' +
                     str(res.count_ok) + ', FAILED ' + str(res.count_fail))

        Log.critical('PROJECT <<nwae.lang>> TOTAL PASS = ' + str(res_final.count_ok) + ', TOTAL FAIL = ' +
                     str(res_final.count_fail))
        return res_final