def decode(self, ciphertext):
    """
    Decrypt a base64 text ciphertext with AES and return the plaintext string.

    The cipher is built from self.key, self.cipher_mode and self.nonce
    (used as the nonce in EAX mode and as the IV in CBC mode).

    :param ciphertext: base64 text; it is encoded with self.text_encoding
                       before being base64-decoded
    :return:           decrypted data converted to str using STR_ENCODING
    :raises Exception: on unsupported cipher mode, invalid CBC padding or any
                       decryption/decoding failure (original error is chained)
    """
    try:
        if self.cipher_mode == AES.MODE_EAX:
            cipher = AES.new(key=self.key, mode=self.cipher_mode, nonce=self.nonce)
            cipherbytes = b64decode(ciphertext.encode(self.text_encoding))
            # NOTE(review): only decrypt() is called here, no authentication tag is
            # passed in or verified - confirm whether callers check the tag elsewhere.
            data = cipher.decrypt(cipherbytes)
        elif self.cipher_mode == AES.MODE_CBC:
            cipher = AES.new(key=self.key, mode=self.cipher_mode, iv=self.nonce)
            cipherbytes = b64decode(ciphertext.encode(self.text_encoding))
            data = cipher.decrypt(cipherbytes)
            Log.debugdebug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Decrypted data length = ' + str(len(data))
                + ', modulo 16 = ' + str(len(data) % 16))
            # The last byte holds the number of padding bytes to remove.
            # A value of 0 or one larger than the data would silently produce an
            # empty result, so treat it as corrupt data / wrong key instead.
            pad_len = data[-1]
            if not 1 <= pad_len <= len(data):
                raise Exception(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Invalid padding length ' + str(pad_len)
                    + ' for decrypted data of length ' + str(len(data)) + '.')
            data = data[:-pad_len]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Unsupported mode "' + str(self.cipher_mode) + '".')
        return str(data, encoding=STR_ENCODING)
    except Exception as ex:
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error decoding data "' + str(ciphertext) + '" using AES ". Exception: ' + str(ex)
        Log.error(errmsg)
        # Chain the original exception so the root cause stays in the traceback
        raise Exception(errmsg) from ex
def check_if_model_updated(self):
    """
    Check whether the model's "updated" marker file is newer than the loaded model.

    Reads the modification time of self.fpath_updated_file. If it is later than
    self.model_updated_time, the model is flagged as not loaded and the stored
    update time is refreshed, both while holding self.mutex_training.

    :return: True if a newer update time was found (model must be reloaded),
             False otherwise
    """
    updated_time = os.path.getmtime(self.fpath_updated_file)
    Log.debugdebug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Model identifier "' + str(self.identifier_string)
        + '" last updated time ' + str(self.model_updated_time)
        + ', updated "' + str(updated_time) + '".')
    if updated_time > self.model_updated_time:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model update time for identifier "' + str(self.identifier_string)
            + '" - "' + str(datetime.fromtimestamp(updated_time))
            + '" is newer than "' + str(datetime.fromtimestamp(self.model_updated_time))
            + '". Reloading model...')
        # Acquire OUTSIDE the try block: if acquire() itself fails we must not
        # call release() on a lock we never obtained.
        self.mutex_training.acquire()
        try:
            # Reset model flags to not ready
            self.model_loaded = False
            self.model_updated_time = updated_time
        finally:
            self.mutex_training.release()
        return True
    return False
def rank_sorted_list_by_unique_items(sorted_list):
    """
    Rank every item by its position inside its run of equal consecutive items.

    Example: ['a', 'a', 'b', 'c', 'c', 'c'] -> [1, 2, 1, 1, 2, 3]

    The rank of an item is 1 plus the number of immediately preceding items
    equal to it. This is computed in a single pass instead of building one
    shifted copy of the list per possible rank.

    :param sorted_list: sequence of items, expected to be sorted so that equal
                        items are adjacent
    :return:            list of int ranks, same length as sorted_list
                        (empty list for empty input)
    """
    len_list = len(sorted_list)
    if len_list == 0:
        # Nothing to rank (previously max() of an empty Counter raised here)
        return []

    # Start with 1 rank
    item_rank = [1] * len_list
    for idx in range(1, len_list):
        # Same item as the previous one means we add 1 to the previous rank
        if sorted_list[idx] == sorted_list[idx - 1]:
            item_rank[idx] = item_rank[idx - 1] + 1

    Log.debugdebug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Max unique item count = ' + str(max(item_rank)))
    return item_rank
def xor_string(self, s1, s2):
    """
    XOR two strings after cyclically extending the shorter one.

    The shorter string is extended by repeating its own characters until both
    strings have the same number of characters. Both are then converted to
    bytes with Obfuscate.STRING_ENCODING and passed to self.xor_bytes().

    :param s1: first string
    :param s2: second string
    :return:   whatever self.xor_bytes() returns for the two byte strings
    """
    Log.debug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': XOR between "' + str(s1) + '" and "' + str(s2) + '".')

    original_len_1 = len(s1)
    original_len_2 = len(s2)
    target_len = max(original_len_1, original_len_2)

    # Extend the shorter one, in a repeat manner: the character appended at
    # position p is the one found at position (p - original length)
    while len(s1) < target_len:
        s1 += s1[len(s1) - original_len_1]
    while len(s2) < target_len:
        s2 += s2[len(s2) - original_len_2]

    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': After appending, XOR between "' + str(s1) + '" and "' + str(s2) + '".')
    Log.debugdebug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': s1 "' + str(s1) + '", s2 "' + str(s2) + '"')

    encoding = Obfuscate.STRING_ENCODING
    return self.xor_bytes(
        b1=bytes(s1, encoding=encoding),
        b2=bytes(s2, encoding=encoding))
def __loss(
        self,
        # hidden values
        h,
        # observed values
        o,
        h_trns_prob_matrix,
        o_emis_prob_matrix,
        # for debugging
        info_i = -1,
        info_j = -1,
):
    """
    Negative log-likelihood of a hidden sequence paired with an observed sequence.

    For every position i >= 1 this adds
        -log(h_trns_prob_matrix[h[i-1], h[i]]) - log(o_emis_prob_matrix[h[i], o[i]])
    Position 0 contributes nothing (the loop starts at 1).

    :param h:                  hidden sequence, used as matrix indices
    :param o:                  observed sequence, same length as h
    :param h_trns_prob_matrix: matrix indexed by [previous hidden, current hidden]
    :param o_emis_prob_matrix: matrix indexed by [hidden, observed]
    :param info_i:             debug only, previous hidden value to log
    :param info_j:             debug only, current hidden value to log
    :return:                   the accumulated loss
    """
    assert len(h) == len(o)
    assert len(h) > 0

    total_loss = 0
    for pos in range(1, len(h)):
        prev_hidden = h[pos - 1]
        cur_hidden = h[pos]
        step_loss = - np.log(h_trns_prob_matrix[prev_hidden, cur_hidden]) \
                    - np.log(o_emis_prob_matrix[cur_hidden, o[pos]])
        # Only log the transition the caller asked to inspect
        if (cur_hidden, prev_hidden) == (info_j, info_i):
            Log.debugdebug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + '; p(' + str(info_i) + ',' + str(info_j) + ') = '
                + str(h_trns_prob_matrix[prev_hidden, cur_hidden])
            )
        total_loss += step_loss
    return total_loss
def filter_sentence_by_pos_tag_japanese(
        self,
        # string or word list
        word_list,
        keep_tags=DEFAULT_KEEP_TAGS_JAP,
):
    """
    Keep only the words whose part-of-speech tag is in keep_tags.

    The text is tagged with nagisa.tagging(); nagisa is imported lazily here.

    :param word_list: a string, or a list/tuple of words (joined with spaces
                      before tagging)
    :param keep_tags: collection of POS tags to keep
    :return:          list of words whose tag is in keep_tags
    :raises Exception: if nagisa cannot be imported (original error is chained)
    """
    try:
        import nagisa
    except Exception as ex:
        raise Exception(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unable to load nagisa: ' + str(ex)) from ex

    # isinstance() also accepts subclasses of list/tuple
    if isinstance(word_list, (list, tuple)):
        text = ' '.join(word_list)
    else:
        text = word_list

    words_postags_obj = nagisa.tagging(text)
    txt_sym_tok = words_postags_obj.words
    txt_sym_postags = words_postags_obj.postags
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Japanese segmentation ' + str(txt_sym_tok)
        + ', word & POS tags: ' + str(txt_sym_postags))

    words_postags = list(zip(txt_sym_tok, txt_sym_postags))
    sent_filtered = [w for w, t in words_postags if (t in keep_tags)]
    Log.debugdebug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': POS TAGs: ' + str(words_postags))
    Log.debugdebug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Filtered sentence: ' + str(sent_filtered))
    return sent_filtered
def set_feature_weights(self, fw):
    """
    Store the given feature weights on the instance as a numpy array.

    :param fw: array-like of weights, converted with np.array()
    :return:   None
    """
    weights = np.array(fw)
    self.fv_weights = weights
    message = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
        + ' Feature weights set to ' + str(self.fv_weights) + '.'
    Log.debugdebug(message)
def verify_totp_style(
        self,
        # We test for <tolerance_secs> back
        tolerance_secs=30):
    """
    Verify self.test_challenge against TOTP-style responses for recent seconds.

    Starting from the current time and going back one second at a time
    (0 .. tolerance_secs - 1 seconds ago), a challenge response is calculated
    from self.shared_secret and self.algo_hash and compared with the given
    test challenge.

    :param tolerance_secs: how many one-second steps back in time to try
    :return:               True as soon as one calculated response matches,
                           False if none matches or any exception occurs
    """
    now = datetime.now()
    try:
        for secs_back in range(tolerance_secs):
            t_test = now - timedelta(seconds=secs_back)
            Log.debugdebug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Trying ' + str(t_test.strftime('%Y-%m-%d %H:%M:%S')))
            test_challenge_calc = AccessTokenSharedsecretChallenge.create_totp_style_challenge_response(
                shared_secret=self.shared_secret,
                datetime_val=t_test,
                algo_hash=self.algo_hash)
            res = self.__compare_test_challenge(
                test_challenge_calc=test_challenge_calc)
            if res:
                return res
        return False
    except Exception as ex:
        # Never write the shared secret itself into the log: anyone able to
        # read the error log could otherwise forge valid challenge responses.
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Exception verifying totp style test challenge "'
            + str(self.test_challenge) + '": ' + str(ex))
        return False
def test_textcluster_english(self):
    """
    Cluster sentences taken from 3 English news articles and check the result.

    Each sentence is first filtered by POS tag
    (self.stopwordtags.filter_sentence_by_pos_tag_english), then pre-processed
    (self.txt_preprocessor.preprocess_list_all_langs) and finally clustered
    into 3 centers with self.do_clustering(). The expected clusters are the
    sentence indexes of each article: 0-5, 6-11 and 12-14.

    :return: a ut.ResultObj updated with the clustering test result
    """
    res = ut.ResultObj(count_ok=0, count_fail=0)
    lang = lf.LangFeatures.LANG_EN
    #
    # We take a few news articles and try to automatically classify sentences belonging to the same news article.
    # This example demonstrates the need for root word extraction, which will increase accuracy significantly.
    #
    # NOTE: two literals below ('Bitcoin' 's inventor ...' and 'While it' 's likely ...') are
    # adjacent string literals, which Python concatenates without the apostrophe.
    text = [
        # Article 1
        'Freezing temperatures have gripped the nation, making Wednesday the coldest day yet this winter.',
        'Morning lows plunged to minus 16-point-three degrees Celsius in Seoul , the lowest to be posted during this year’s cold season.',
        'As of 7 a.m. Wednesday , morning lows stood at minus 15-point-four degrees in Daejeon , nearly minus 22 degrees in the Daegwallyeong mountain pass in Pyeongchang and minus 14 degrees in Gangneung.',
        'Due to the wind chill factor, temperatures stood at nearly minus 23 degrees in Seoul , minus 25 in Incheon and roughly minus 36 degrees in Daegwallyeong .',
        'An official of the Korea Meteorological Administration said the nation will continue to see subzero temperatures for the time being with the central regions and some southern inland areas projected to see morning lows plunge below minus 15 degrees',
        'Currently , a cold wave warning is in place for Seoul , Incheon , Daejeon and Sejong as well as the provinces of Gangwon , Chungcheong , North Jeolla and North Gyeongsang.',
        # Article 2
        'There are two primary motivations for keeping Bitcoin' 's inventor keeping his or her or their identity secret.',
        'One is privacy. As Bitcoin has gained in popularity – becoming something of a worldwide phenomenon – Satoshi Nakamoto would likely garner a lot of attention from the media and from governments.',
        'The other reason is safety. Looking at 2009 alone , 32,489 blocks were mined; at the then-reward rate of 50 BTC per block, the total payout in 2009 was 1,624,500 BTC, which at today’s prices is over $900 million.',
        'One may conclude that only Satoshi and perhaps a few other people were mining through 2009, and that they possess a majority of that $900 million worth of BTC.',
        'Someone in possession of that much BTC could become a target of criminals, especially since bitcoins are less like stocks and more like cash, where the private keys needed to authorize spending could be printed out and literally kept under a mattress.',
        'While it' 's likely the inventor of Bitcoin would take precautions to make any extortion-induced transfers traceable, remaining anonymous is a good way for Satoshi to limit exposure.',
        # Article 3
        'Some of these models of concurrency are primarily intended to support reasoning and specification, while others can be used through the entire development cycle, including design, implementation, proof, testing and simulation of concurrent systems',
        'The proliferation of different models of concurrency has motivated some researchers to develop ways to unify these different theoretical models.',
        'The Concurrency Representation Theorem in the actor model provides a fairly general way to represent concurrent systems that are closed in the sense that they do not receive communications from outside.'
    ]
    # stopwords are not needed!!! cool!!!!
    # (words are filtered by POS tag instead, and stopwords_list=None is passed below)
    text_tag = []
    for sent in text:
        sent_new = self.stopwordtags.filter_sentence_by_pos_tag_english(
            word_list=sent)
        text_tag.append(sent_new)
    text_sentences_arr = self.txt_preprocessor.preprocess_list_all_langs(
        sentences_list=text_tag)
    Log.debugdebug('PRE-PROCESSED ' + str(lang) + ' SENTENCES:\n\r' + str(text_sentences_arr))
    # This example is too small in sample size to weigh by IDF (which will instead lower the accuracy)
    # do_clustering(text=text, stopwords=stopwords, ncenters=3, freq_measure='tf', weigh_idf=False, verbose=0)
    res_cluster = self.do_clustering(
        text=text_sentences_arr,
        ncenters=3,
        # One tuple of sentence indexes per article above
        expected_clusters=((0, 1, 2, 3, 4, 5), (6, 7, 8, 9, 10, 11), (12, 13, 14)),
        test_threshold_inside=0.3,
        test_threshold_outside=0.7,
        stopwords_list=None,
        freq_measure=WordFreqDocMatrix.BY_SIGMOID_FREQ_NORM,
        test_description=str(lang) + ' normalized, no IDF',
    )
    res.update(other_res_obj=res_cluster)
    return res
def __init__(
        self
):
    """
    Set up everything needed for language detection.

    Builds, in this order:
      - self.lang_features: a LangFeatures instance
      - self.alphabet_dict: alphabet name -> character set, for every alphabet
        in self.TESTS_BY_ORDER
      - self.langs_with_no_word_sep: languages without a word separator
      - self.common_words: common-word objects for EN, ES, FR, ID and VI
      - self.word_stemmer: language -> Lemmatizer for every language in
        self.SUPPORTED_LANGS that has verb conjugation, otherwise None.
        A Lemmatizer that fails to initialize is logged as a warning and the
        entry stays None.
      - self.profiler_detect_alp: a ProfilingHelper named after this class
    """
    self.lang_features = LangFeatures()

    # Map alphabet name to unicode character set array
    self.alphabet_dict = {
        alphabet_name: LangCharacters.get_alphabet_charset(alphabet = alphabet_name)
        for alphabet_name in self.TESTS_BY_ORDER
    }
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Alphabets used: ' + str(self.alphabet_dict.keys())
    )

    self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
    Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

    # Load common words
    self.common_words = {
        LangFeatures.LANG_EN: English(),
        LangFeatures.LANG_ES: Spanish(),
        LangFeatures.LANG_FR: French(),
        LangFeatures.LANG_ID: Indonesian(),
        LangFeatures.LANG_VI: Vietnamese(),
    }

    # Load stemmers
    self.word_stemmer = {}
    for lang in self.SUPPORTED_LANGS:
        has_verb_conjugation = self.lang_features.have_verb_conjugation(
            lang = lang
        )
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(lang) + '" verb conjugation = ' + str(has_verb_conjugation) + '.'
        )
        # Default is no stemmer; only languages with verb conjugation get one
        self.word_stemmer[lang] = None
        if not has_verb_conjugation:
            continue
        try:
            self.word_stemmer[lang] = Lemmatizer(
                lang = lang
            )
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
            )
        except Exception as ex_stemmer:
            # Not fatal, the language simply keeps no stemmer
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + ' stemmer/lemmatizer failed to initialize: '
                + str(ex_stemmer) + '.'
            )

    self.profiler_detect_alp = ProfilingHelper(profiler_name = str(self.__class__))
def __compare_test_challenge(self, test_challenge_calc):
    """
    Compare a calculated challenge response with self.test_challenge.

    When both values are text, or both are bytes-like, they are compared with
    hmac.compare_digest() so that the time taken does not reveal how many
    leading characters matched. Any other combination of types falls back to
    the plain comparison used before, so the result is unchanged.

    :param test_challenge_calc: the challenge response calculated locally
    :return:                    True if it equals self.test_challenge, else False
    """
    import hmac

    given = self.test_challenge
    try:
        if isinstance(test_challenge_calc, str) and isinstance(given, str):
            # compare_digest() only accepts ASCII str, so compare the UTF-8 bytes
            is_match = hmac.compare_digest(
                test_challenge_calc.encode('utf-8'), given.encode('utf-8'))
        elif isinstance(test_challenge_calc, (bytes, bytearray)) \
                and isinstance(given, (bytes, bytearray)):
            is_match = hmac.compare_digest(test_challenge_calc, given)
        else:
            is_match = not (test_challenge_calc != given)
    except (TypeError, UnicodeError):
        # Values that cannot be encoded/compared in constant time
        is_match = not (test_challenge_calc != given)

    if not is_match:
        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Test Challenge Fail. Challenge string "' + str(self.challenge)
            + '". Test Challenge Calculated "' + str(test_challenge_calc)
            + '", test challenge given "' + str(self.test_challenge))
        return False
    return True
def reconstruct_check(self, sent_vec, keywords_list):
    """
    Rebuild a word list from a frequency vector and its keywords.

    For every position j, keywords_list[j] is repeated once for each time a
    counter starting at sent_vec[j] can be decremented by 1 while it is
    still > 0 (so a count of 3 gives 3 copies, 0 or less gives none).

    :param sent_vec:      sequence of frequencies, one per keyword position
    :param keywords_list: keywords, indexed by the same positions
    :return:              list of keywords, repeated by frequency, in position order
    """
    Log.debugdebug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Reconstructing ' + str(sent_vec) + ' from keywords ' + str(keywords_list))

    rebuilt_words = []
    for position in range(len(sent_vec)):
        remaining = sent_vec[position]
        while remaining > 0:
            rebuilt_words.append(keywords_list[position])
            # Rebind instead of decrementing in place, the input must stay untouched
            remaining = remaining - 1
    return rebuilt_words
def calculate(self):
    """
    Calculate the center of mass of the array self.x, one coordinate per dimension.

    Side effects: sets self.x_shape, self.x_elements_count, self.x_dim and
    (when self.x is not empty) self.x_coordinates, an array of shape
    [x_dim] + x_shape holding for every element its index along each dimension.

    :return: numpy array of length x_dim with the center of mass;
             all NaN if self.x has no elements;
             if the values of self.x sum to 0, every element is weighted equally
    :raises AssertionError: if self.x contains negative numbers or has 0 dimensions
    """
    self.x_shape = self.x.shape
    # How many elements altogether
    # (np.prod, since the np.product alias no longer exists in NumPy 2.x)
    self.x_elements_count = np.prod(self.x_shape)
    self.x_dim = len(self.x_shape)

    if self.x_elements_count == 0:
        return np.array([np.nan] * self.x_dim)

    # No negative numbers
    assert np.min(self.x) >= 0
    assert self.x_dim > 0
    assert self.x_elements_count > 0

    # Keep the dimension coordinates here.
    # For example if x has shape (4,3,2), x_coordinates[0] holds the index along
    # the first dimension of every element (0 for the first 3*2=6 elements, then 1, ..),
    # x_coordinates[1] the index along the second dimension, and so on.
    # The result already has shape [x_dim] + x_shape; float dtype is kept as before.
    self.x_coordinates = np.indices(self.x_shape, dtype=float)
    Log.debugdebug('Coordinates of x by dimension:\n\r' + str(self.x_coordinates))

    # Weigh by x, or equally if x carries no mass at all
    if np.sum(self.x) > 0:
        weights = self.x
    else:
        weights = np.ones(shape=self.x_shape)
    weights_sum = np.sum(weights)

    cm = np.zeros(shape=[self.x_dim])
    for dim in range(self.x_dim):
        cm[dim] = np.sum(self.x_coordinates[dim] * weights) / weights_sum
    return cm
def hash_compression(
        self,
        s,
        # By default we return the original hash
        desired_byte_length=32):
    """
    Hash a string with SHA-256 and fold the digest into fewer bytes by XOR.

    The 32-byte digest is split into int(32 / desired_byte_length) blocks of
    equal length and the blocks are XOR-ed together with self.xor_bytes().
    With the default of 32 there is a single block and the digest itself is
    returned.

    NOTE: a desired_byte_length that does not divide 32 (e.g. 12) gives blocks
    of 32 / int(32 / desired_byte_length) bytes, i.e. not the requested length.

    :param s:                   string to hash, encoded with Obfuscate.STRING_ENCODING
    :param desired_byte_length: wanted length in bytes, a multiple of 4 in 1..32
    :return:                    the digest slice (bytes) when there is only one block,
                                otherwise the result of self.xor_bytes()
    :raises Exception:          if desired_byte_length is not a multiple of 4 or is
                                outside 1..digest size
    """
    digest_size = hashlib.sha256().digest_size

    if desired_byte_length % 4 != 0:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Desired byte length must be 0 modulo-4, given = ' + str(desired_byte_length))
    # 0 or anything above the digest size used to end in a ZeroDivisionError,
    # negative values returned a meaningless slice: reject them explicitly.
    if not 0 < desired_byte_length <= digest_size:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Desired byte length must be between 1 and ' + str(digest_size)
            + ', given = ' + str(desired_byte_length))

    m = hashlib.sha256()
    m.update(bytes(s, encoding=Obfuscate.STRING_ENCODING))
    # This will return a bytes list of length 32
    h = m.digest()
    if len(h) % 4 != 0:
        raise Exception(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Hash bytes length must be 0 modulo-4, got = ' + str(h))

    # Number of blocks the digest is cut into, and the length of each block
    n_blocks = int(len(h) // desired_byte_length)
    block_len = len(h) // n_blocks
    Log.debugdebug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Number of blocks = ' + str(n_blocks) + ', block length = ' + str(block_len))

    # First block
    bytes_xor = h[0:block_len]
    for i in range(1, n_blocks):
        cur_block = h[i * block_len:(i + 1) * block_len]
        if len(bytes_xor) != len(cur_block):
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Different block lengths "' + str(bytes_xor)
                + '", and "' + str(cur_block) + '"')
        bytes_xor = self.xor_bytes(b1=bytes_xor, b2=cur_block)
    return bytes_xor
def transform_input_for_model(
        self,
        # This should be a list of words as a sentence
        x_input,
        word_freq_model=FeatureVector.COL_FREQUENCY,
):
    """
    Convert a sentence (list of words) into a single-row 2D feature matrix.

    A FeatureVector is created with the model features (self.get_model_features())
    as template, the frequency feature vector of x_input is calculated, and the
    column word_freq_model is taken from it and converted to 2 dimensions.

    :param x_input:         list of words as a sentence
    :param word_freq_model: name of the feature vector column to use
    :return:                the chosen column as returned by
                            npUtil.NumpyUtil.convert_dimension(to_dim=2)
    :raises Exception:      if the feature vector cannot be calculated (original
                            error is chained) or the column is not 1-dimensional
    :raises AssertionError: if word_freq_model is not a column of the feature vector
    """
    #
    # This could be numbers, words, etc.
    #
    features_model = list(self.get_model_features())

    #
    # Convert sentence to a mathematical object (feature vector)
    #
    model_fv = FeatureVector()
    model_fv.set_freq_feature_vector_template(list_symbols=features_model)

    # Get feature vector of text
    try:
        df_fv = model_fv.get_freq_feature_vector(text_list=x_input)
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Exception occurred calculating FV for "' + str(x_input) \
                 + '": Exception "' + str(ex) \
                 + '\n\rUsing FV Template:\n\r' + str(model_fv.get_fv_template()) \
                 + ', FV Weights:\n\r' + str(model_fv.get_fv_weights())
        Log.critical(errmsg)
        # Chain the original exception so the root cause stays in the traceback
        raise Exception(errmsg) from ex

    # This creates a single row matrix that needs to be transposed before matrix multiplications
    # ndmin=2 will force numpy to create a 2D matrix instead of a 1D vector
    # For now we make it 1D first
    assert word_freq_model in df_fv.columns, \
        '"' + str(word_freq_model) + '" must be in ' + str(df_fv.columns)
    fv_text_1d = np.array(df_fv[word_freq_model].values, ndmin=1)
    if fv_text_1d.ndim != 1:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Expected a 1D vector, got ' + str(fv_text_1d.ndim) + 'D!')
    # Log a string like every other log call, not the raw array
    Log.debugdebug(str(fv_text_1d))

    x_transformed = npUtil.NumpyUtil.convert_dimension(arr=fv_text_1d, to_dim=2)
    return x_transformed
def get_pct_intersection_with_common_words(
        self,
        word_list,
        # In the case of Vietnamese, we might have to form words from the syllables
        max_word_n_tuple = 1
):
    """
    Fraction of the given words that are common words (self.get_common_words()).

    If max_word_n_tuple == 1 the result is
        (number of unique words that are common words) / (number of unique words).
    Otherwise words are formed greedily from up to max_word_n_tuple consecutive
    tokens (longest first, joined with a space); tokens consumed by a found
    word count as a single word, and the result is
        (number of words found) / (number of words counted).

    :param word_list:        list of words/tokens
    :param max_word_n_tuple: maximum number of consecutive tokens forming one word
    :return:                 the fraction as a float, 0.0 for an empty word_list
    """
    if len(word_list) == 0:
        # No words, no intersection (avoids a division by zero below)
        return 0.0

    # Same collection for every lookup, no need to fetch it inside the loops
    common_words = self.get_common_words()

    if max_word_n_tuple == 1:
        unique_words = set(word_list)
        lang_intersection = unique_words.intersection(common_words)
        pct_intersection = len(lang_intersection) / len(unique_words)
    else:
        # Means we are looking not just at the current token, but form a word from
        # continuous tokens up to max_word_n_tuple (usually not more than 2)
        len_word_list = len(word_list)
        count_int = 0
        cur_index = 0
        actual_word_count = 0

        # Loop by each token in the word list (or rather token list)
        while cur_index < len_word_list:
            max_n_tuple_lookforward = min(max_word_n_tuple, len_word_list - cur_index)
            for j in range(max_n_tuple_lookforward, 0, -1):
                # Look from j tokens ahead
                end_index = cur_index + j
                # For the j-tuple word
                w = ' '.join(word_list[cur_index:end_index])
                Log.debugdebug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Test word "' + str(w) + '", cur_index=' + str(cur_index) + ', j=' + str(j))
                if w in common_words:
                    count_int += 1
                    # Move forward to the end of the token from the word found
                    cur_index += j - 1
                    Log.debugdebug(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Found word "' + str(w) + '"')
                    break
            cur_index += 1
            actual_word_count += 1

        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Count Intersection = ' + str(count_int)
            + ', actual word count = ' + str(actual_word_count)
        )
        pct_intersection = count_int / actual_word_count

    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': "' + str(self.lang) + '" intersection = ' + str(pct_intersection)
    )
    return pct_intersection
def encode(
        self,
        # bytes format
        data):
    """
    Encrypt data with AES in the configured mode (EAX or CBC).

    The cipher is built from self.key, self.cipher_mode and self.nonce
    (used as the nonce in EAX mode and as the IV in CBC mode).
    In CBC mode the data is first padded with <length> bytes of value <length>,
    where length is 1 .. AES_Encrypt.DEFAULT_BLOCK_SIZE_AES_CBC, so that the
    last byte always tells how many bytes to remove after decryption.

    :param data: the data to encrypt, in bytes format (the argument itself is
                 not modified)
    :return:     AES_Encrypt.EncryptRetClass with base64 text of the ciphertext
                 and nonce, plus the tag in EAX mode (tag_b64 is None in CBC mode)
    :raises Exception: on unsupported cipher mode or any encryption failure
                       (original error is chained)
    """
    try:
        if self.cipher_mode == AES.MODE_EAX:
            cipher = AES.new(key=self.key, mode=self.cipher_mode, nonce=self.nonce)
            cipherbytes, tag = cipher.encrypt_and_digest(data)
            return AES_Encrypt.EncryptRetClass(
                cipher_mode=self.cipher_mode_str,
                ciphertext_b64=b64encode(cipherbytes).decode(self.text_encoding),
                plaintext_b64=None,
                tag_b64=b64encode(tag).decode(self.text_encoding),
                nonce_b64=b64encode(self.nonce).decode(self.text_encoding))
        elif self.cipher_mode == AES.MODE_CBC:
            # 1-16, make sure not 0, other wise last byte will not be block length
            length = AES_Encrypt.DEFAULT_BLOCK_SIZE_AES_CBC - (
                len(data) % AES_Encrypt.DEFAULT_BLOCK_SIZE_AES_CBC)
            # Pad data with the pad length, so when we decrypt we can just take data[-1]
            # as the number of bytes to strip.
            # bytes([length]) is always exactly one byte, whatever the text encoding is,
            # and building a new object leaves a caller's bytearray untouched.
            padded_data = data + bytes([length]) * length
            Log.debugdebug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Padded length = ' + str(length))
            cipher = AES.new(key=self.key, mode=self.cipher_mode, iv=self.nonce)
            cipherbytes = cipher.encrypt(padded_data)
            return AES_Encrypt.EncryptRetClass(
                cipher_mode=self.cipher_mode_str,
                ciphertext_b64=b64encode(cipherbytes).decode(self.text_encoding),
                plaintext_b64=None,
                tag_b64=None,
                nonce_b64=b64encode(self.nonce).decode(self.text_encoding))
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Unsupported mode "' + str(self.cipher_mode) + '".')
    except Exception as ex:
        # NOTE(review): this message writes the plaintext data to the error log -
        # confirm that this is acceptable for the data handled here.
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error encoding data "' + str(data) + '" using AES ". Exception: ' + str(ex)
        Log.error(errmsg)
        # Chain the original exception so the root cause stays in the traceback
        raise Exception(errmsg) from ex
def process_noun(self, word):
    """
    Strip a noun case ending from the word, if it ends with a known one.

    self.case_endings_by_len[LemmatizerBase.CE_NOUN] maps an ending length to
    the endings of that length. The lengths are tried in the order of the
    dictionary keys and the first matching ending is removed.

    :param word: the word to process
    :return:     the word without its ending, or None if no ending matched
    """
    word_len = len(word)
    ces = self.case_endings_by_len[LemmatizerBase.CE_NOUN]
    for ending_len, endings in ces.items():
        if ending_len > word_len:
            # The word is shorter than the ending. A negative slice start would wrap
            # around and test (and strip) the wrong part of the word.
            # Presumably every ending in ces[n] has length n, so nothing could match anyway.
            continue
        postfix = word[(word_len - ending_len):word_len]
        check = postfix in endings
        Log.debugdebug(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Check ' + str(check) + ' for "' + str(postfix) + '" in ' + str(endings))
        if check:
            return word[0:(word_len - ending_len)]
    return None
def filter_sentence_by_pos_tag_english(
        self,
        word_list,
        keep_tags=DEFAULT_KEEP_TAGS_ENG,
):
    """
    Keep only the words whose part-of-speech tag is in keep_tags.

    :param word_list: a list of words, or a string which is first split into
                      words with word_tokenize(language='english')
    :param keep_tags: collection of POS tags (as returned by pos_tag) to keep
    :return:          list of the words whose tag is in keep_tags
    """
    tokens = word_list
    if type(tokens) is str:
        tokens = word_tokenize(text=tokens, language='english')

    words_postags = pos_tag(tokens)

    sent_filtered = []
    for token, tag in words_postags:
        if tag in keep_tags:
            sent_filtered.append(token)

    log_prefix = str(__name__) + ' '
    Log.debugdebug(
        log_prefix + str(getframeinfo(currentframe()).lineno)
        + ': POS TAGs: ' + str(words_postags))
    Log.debugdebug(
        log_prefix + str(getframeinfo(currentframe()).lineno)
        + ': Filtered sentence: ' + str(sent_filtered))
    return sent_filtered
def convert_ascii_string_to_other_alphabet(
        ascii_char_string,
        # Default to CJK Unicode Block
        unicode_range=BLOCK_CHINESE,
        # If the characters come from a hexdigest from a hash, we can compress 4 times,
        # otherwise for a random ascii string, we can only compress 2 characters to 1 chinese.
        group_n_char=2):
    """
    Compress a string into characters of another unicode block.

    The string is right-padded with '0' to a multiple of 4 characters and cut
    into groups of group_n_char characters. Every group becomes one character:
      - group_n_char == 2: the 2 character ordinals are combined as
        ord(c0) * 256 + ord(c1)
      - group_n_char == 4: the group is read as a hexadecimal number
    and the number, modulo the size of unicode_range, is added to the start of
    unicode_range to give the unicode code point.

    :param ascii_char_string: the string to convert
    :param unicode_range:     (first, last) code points of the target block
    :param group_n_char:      2 or 4, how many characters form one output character
    :return:                  the converted string
    :raises Exception:        if group_n_char is not 2 or 4 (previously an empty
                              string was silently returned, or a division by zero
                              occurred for 0)
    """
    if group_n_char not in (2, 4):
        raise Exception(
            'Unsupported group_n_char ' + str(group_n_char) + ', must be 2 or 4')

    uni_len = unicode_range[1] - unicode_range[0] + 1

    r = len(ascii_char_string) % 4
    if r != 0:
        # Append 0's
        ascii_char_string = ascii_char_string + '0' * (4 - r)

    converted_chars = []
    len_block = len(ascii_char_string) // group_n_char
    for i in range(len_block):
        idx_start = group_n_char * i
        idx_end = idx_start + group_n_char
        s = ascii_char_string[idx_start:idx_end]

        # Convert to Chinese, Korean, etc
        if group_n_char == 2:
            ord_arr = [ord(x) for x in s]
            # Big-endian combination, first character is the most significant byte
            val = 0
            for ordinal in ord_arr:
                val = val * 256 + ordinal
            Log.debug('Index start=' + str(idx_start) + ', end=' + str(idx_end)
                      + ', s=' + str(s) + ', ordinal=' + str(ord_arr)
                      + ', val=' + str(hex(val)))
            cjk_unicode = (val % uni_len) + unicode_range[0]
            converted_chars.append(chr(cjk_unicode))
        else:
            Log.debug('Index start=' + str(idx_start) + ', end=' + str(idx_end)
                      + ', s=' + str(s))
            n = int('0x' + str(s), 16)
            cjk_unicode = (n % uni_len) + unicode_range[0]
            converted_chars.append(chr(cjk_unicode))
            Log.debugdebug('From ' + str(idx_start) + ': ' + str(s)
                           + ', n=' + str(n) + ', char=' + str(chr(cjk_unicode)))

    return ''.join(converted_chars)
def transform_input_for_model(
        self,
        # For the model to interpret and transform in to x usable for model input
        # (e.g. map using one-hot dictionaries)
        x_input,
        word_freq_model = None,
):
    """
    Map a list of words to their codes and shape them for the network input.

    Every word found in self.x_one_hot_dict_inverse is replaced by its code;
    words not in the dictionary are dropped with a warning. The codes are
    left-padded with 0 up to the input length of the first network layer
    (self.network.layers[0].input_shape[1]) and converted to 2 dimensions.

    :param x_input:         list, tuple or numpy array of words
    :param word_freq_model: not used by this implementation
    :return:                the padded codes as returned by
                            NumpyUtil.convert_dimension(to_dim=2)
    :raises Exception:      on wrong input type, uninitialized one-hot dictionary
                            or any other failure (original error is chained)
    """
    try:
        Log.debugdebug('***** x input: ' + str(x_input))
        # We expect x_input to be an np array of words
        if isinstance(x_input, np.ndarray):
            x_input = x_input.tolist()
        if not isinstance(x_input, (list, tuple)):
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string)
                + '". Expect list/tuple type, got type "' + str(type(x_input))
                + '" for x input: ' + str(x_input)
            )
        if self.x_one_hot_dict_inverse is None:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string) + '" x one hot not yet initialized!'
            )

        word_to_code = self.x_one_hot_dict_inverse
        x = []
        for word in x_input:
            if word in word_to_code:
                x.append(word_to_code[word])
            else:
                Log.warning(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Model "' + str(self.identifier_string) + '", could not map input value "'
                    + str(word) + '" to code x. Not in x one hot dictionary.'
                )

        # TODO Pad with 0's to satisfy neural network in put length
        input_shape = self.network.layers[0].input_shape
        input_len = input_shape[1]
        Log.debugdebug('***** INPUT SHAPE ' + str(input_shape) + ', len ' + str(input_len) + ', x = ' + str(x))
        # Left-pad in one step; inputs longer than input_len are left as they are
        n_missing = input_len - len(x)
        if n_missing > 0:
            x = [0] * n_missing + x
            Log.debugdebug('  ***** padded x: ' + str(x))

        x = np.array(x)
        x_transformed = NumpyUtil.convert_dimension(arr=x, to_dim=2)
        Log.debugdebug('  ***** transformed x: ' + str(x_transformed))
        return x_transformed
    except Exception as ex:
        # Chain the original exception so the root cause stays in the traceback
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model "' + str(self.identifier_string) + '", exception tranforming ' + str(x_input)
            + '. Exception: ' + str(ex)
        ) from ex
def test_textcluster_chinese(self):
    """
    Cluster sentences taken from 2 Chinese news articles and check the result.

    The sentences (already segmented into space-separated words) are
    pre-processed with self.txt_preprocessor.preprocess_list_all_langs() and
    clustered into 2 centers with self.do_clustering(). The expected clusters
    are the sentence indexes of each article: 0-4 and 5-9.

    :return: a ut.ResultObj updated with the clustering test result
    """
    res = ut.ResultObj(count_ok=0, count_fail=0)
    lang = lf.LangFeatures.LANG_ZH
    # The pre-processor's stopword list is emptied for this test
    self.txt_preprocessor.stopwords_list = []
    text = [
        # Article 1
        '人工智能 : 英 、 中 、 美 上演 “ 三国演义 ”',
        '英国 首相 特里莎·梅 周四 (1月 25日) 在 瑞士 达沃斯 世界 经济 论坛 上 宣布 , 英国 在 人工智能 ( AI ) 领域 要 争 当 世界 领头羊。',
        '一周 后 , 她 将 率 英国 经贸 代表团 访 华 , 到 北京 和 上海 展开 " 历史性 访问 "。 一周 前 , 中国 发表 《 人工智能 标准化 白皮书 》。',
        '中国 媒体 把 2017 年 称为 " AI 年 ", 2018 则 是 AI 从 学术 飞入 产业 、 普及 应用 的 关键 年 。',
        # The two adjacent literals below are concatenated by Python into one sentence
        '围绕 AI , 中美 正 胶着 于 争霸 竞赛 ,而 中英 在 科技 、工商 和 金融界 的 互动 将 产生 怎样 的 结果 ,引 人 关注' '。',
        # Article 2
        '叙利亚 俄军 遇袭 恐怖分子 用 无人机 “ 群攻 ”',
        '俄军 在 叙利亚 军事基地 遭到 攻击 后 , 俄罗斯 国防部 警告 说 , 恐怖分子 已 获得 先进 无人机 技术 , 能够 在 全世界 发动 攻击 。',
        '俄罗斯 总参谋部 无人机 部门 负责人 亚历山大 · 维科夫 少将 说 , 恐怖分子 使用 无人机 发动 攻击 的 威胁 已经 不再 是 不可能 的 事情,',
        '恐怖分子 已经 利用 无人机 攻击 俄军 在 叙利亚 的 克 美 明 空军基地 和 塔尔图斯 的 一个 港口',
        '他 还 说 , 在 1月 6日 发动 攻击 的 技术 评估 显示 ," 在 世界 所有 其他 地方 使用 无人机 发动 恐怖 攻击 已经 成为 现实 威胁"'
        # Article 3
    ]
    text_sentences_arr = self.txt_preprocessor.preprocess_list_all_langs(
        sentences_list=text)
    Log.debugdebug('PRE-PROCESSED ' + str(lang) + ' SENTENCES:\n\r' + str(text_sentences_arr))
    # This example is too small in sample size to weigh by IDF (which will instead lower the accuracy)
    # do_clustering(text=text, stopwords=stopwords, ncenters=2, freq_measure='tf', weigh_idf=False, verbose=0)
    res_cluster = self.do_clustering(
        text=text_sentences_arr,
        ncenters=2,
        # One tuple of sentence indexes per article above
        expected_clusters=((0, 1, 2, 3, 4), (5, 6, 7, 8, 9)),
        test_threshold_inside=0.7,
        test_threshold_outside=0.3,
        test_description='1. ' + str(lang) + ' normalized, no IDF',
        freq_measure=WordFreqDocMatrix.BY_SIGMOID_FREQ_NORM,
    )
    res.update(other_res_obj=res_cluster)
    # do_clustering(text=text, stopwords=stopwords, ncenters=2, freq_measure='frequency', weigh_idf=False, verbose=0)
    return res
def xor_bytes(self, b1, b2):
    """
    XOR two byte sequences element by element.

    The sequences are paired with zip(), so the result is as long as the
    shorter of the two.

    :param b1: first sequence of byte values
    :param b2: second sequence of byte values
    :return:   list with the XOR of each pair of values
    """
    xored = []
    for left, right in zip(b1, b2):
        value = left ^ right
        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + 'XOR "' + str(hex(left)) + '" and "' + str(hex(right)) + '" = ' + str(hex(value)))
        xored.append(value)

    summary = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
        + ': XOR between "' + str(self.hexdigest(b1)) \
        + '" and "' + str(self.hexdigest(b2)) \
        + '" = "' + str(self.hexdigest(xored)) + '"'
    Log.debug(summary)
    return xored
def to_data_frame(
        self,
        # Pre-processed sentences are hard to read, so we use original sentences
        sentences_list_no_preprocessing,
        # List of dictionary (if word-weights) representing a topic
        topic_words,
        # numpy ndarray
        doc_labels,
):
    """
    Build a data frame listing every original sentence under its cluster.

    For every cluster number from 0 to max(doc_labels), one row per sentence
    labeled with that cluster is produced, with the columns
      - 'ClusterNo':       the cluster number
      - 'ClusterTopWords': str() of the keys of topic_words[cluster number]
      - 'Sentence':        the original sentence
    The index is not reset, it restarts from 0 for every cluster.

    :param sentences_list_no_preprocessing: original sentences, same order as doc_labels
    :param topic_words:                     per cluster, a dictionary keyed by word
    :param doc_labels:                      cluster number of every sentence
    :return:                                the combined pandas DataFrame (empty if
                                            there is no cluster)
    """
    # Collect one frame per topic and combine them once at the end
    # (DataFrame.append no longer exists in pandas 2.x)
    topic_frames = []
    for doc_idx in range(np.max(doc_labels) + 1):
        Log.debugdebug(str(self.__class__) + ': Cluster #' + str(doc_idx))
        Log.debugdebug(
            str(self.__class__) + ': Word-Value Center: ' + str(topic_words[doc_idx]))
        Log.debugdebug(
            str(self.__class__) + ': Words Center: ' + str(topic_words[doc_idx].keys()))
        cluster_words = str(topic_words[doc_idx].keys())

        topic_sentences = [
            sentence
            for j, sentence in enumerate(sentences_list_no_preprocessing)
            if doc_labels[j] == doc_idx
        ]
        topic_frames.append(pd.DataFrame({
            'ClusterNo': doc_idx,
            'ClusterTopWords': cluster_words,
            'Sentence': topic_sentences,
        }))

    if not topic_frames:
        # pd.concat() does not accept an empty list
        return pd.DataFrame()
    return pd.concat(topic_frames)
def get_freq_feature_vector(
        self,
        # A word array. e.g. ['this','is','a','sentence','or','just','any','word','array','.']
        text_list,
        feature_as_presence_only = False,
        # Log base has no effect on LogFreqNormalized & LogFreqProbability as it is just a constant factor
        log_base = DEFAULT_LOG_BASE,
):
    """
    Count the words of a text and turn the counts into a feature vector data frame.

    The words are counted and ordered by frequency (most common first), put in a
    data frame with the columns self.COL_SYMBOL and self.COL_FREQUENCY, and
    passed on to self.get_freq_feature_vector_df().

    :param text_list:                a word array
    :param feature_as_presence_only: if True, use 1 for every word present
                                     instead of its frequency
    :param log_base:                 passed on to get_freq_feature_vector_df()
    :return:                         the result of self.get_freq_feature_vector_df()
    """
    # (symbol, count) pairs, most common first
    ordered_counts = col.Counter(text_list).most_common()
    symbols = [symbol for symbol, _ in ordered_counts]
    freqs = np.array([count for _, count in ordered_counts])

    # If <feature_as_presence_only> flag set, we don't count frequency, but presence
    if feature_as_presence_only:
        freqs = (freqs >= 1) * 1

    df_counter = pd.DataFrame({
        self.COL_SYMBOL: symbols,
        self.COL_FREQUENCY: freqs
    })
    Log.debugdebug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Converted text "' + str(text_list) + '" to ' + str(df_counter.values)
    )

    return self.get_freq_feature_vector_df(
        df_text_counter = df_counter,
        log_base = log_base,
    )
def calculate(self):
    """
    Calculate the categorical cross entropy between real labels and given probabilities.

    For every pair taken from self.p_real_prob_labels and self.q_given_probs:
      - the given probabilities are normalized to sum to 1 along the last axis
      - the information is -log of the normalized probabilities (natural log),
        with probabilities floored at CategoricalCrossEntropy.SMALL_NUMBER
      - the loss is the sum of (real probability * information) along the last axis

    :return: the sum of all losses multiplied by (1 / self.N)
    :raises AssertionError: if the normalized probabilities of a pair do not sum
                            to 1 within SMALL_NUMBER
    """
    floor = CategoricalCrossEntropy.SMALL_NUMBER
    losses = []

    # The losses of each class has already been conveniently broken up by the categorical format
    pairs = zip(self.p_real_prob_labels, self.q_given_probs)
    for real_prob, given_probs in pairs:
        # Just to be sure in case numbers don't sum up to 1 for probabilities
        probs_total = given_probs.sum(axis=-1, keepdims=True)
        given_probs_normalized = given_probs / probs_total
        assert abs(np.sum(given_probs_normalized) - 1.0) < floor
        Log.debugdebug('Given Probs: ' + str(given_probs_normalized))

        # Calculate the information required to represent this outcome
        floored_probs = np.maximum(floor, given_probs_normalized)
        info_bits = -np.log(floored_probs)
        Log.debugdebug('Information Bits: ' + str(info_bits))

        # If the label is categorical, the loss is only the loss of the single non-zero category usually
        losses.append(np.sum(real_prob * info_bits, axis=-1, keepdims=False))

    Log.debugdebug('Losses: ' + str(losses))
    # We can actually ignore the constant N term if we wish
    scale = 1 / self.N
    return np.sum(losses) * scale
def search_close_words(
        self,
        word,
        # Cost can be any measure of edit distance, e.g. Levenshtein, Damerau-Levenshtein, etc.
        max_cost=2,
        edit_distance_algo=EditDistance.EDIT_DIST_ALGO_DAMLEV):
    """
    Find words close to the given word and return them ordered by closeness.

    The candidates come from TrieNode.search_close_words() on self.trie, as
    tuples of (word, edit-distance),
    e.g. from word bg to [('be',1), ('big',1), ('bag',1), ('brag',2)].
    If self.use_word_weighting is set, the EIDF value of every candidate is
    looked up in self.eidf_value / self.eidf_words (rounded to 2 decimals) and
    candidates without exactly one EIDF value are skipped; otherwise the EIDF
    value is None.

    :param word:               the word to search for
    :param max_cost:           maximum edit distance of a candidate
    :param edit_distance_algo: which edit distance algorithm to use
    :return:                   None if nothing was found, otherwise a DataFrame with
                               the columns SpellCheckWord.COL_CORRECTED_WORD,
                               COL_EDIT_DISTANCE and COL_EIDF_VALUE, sorted ascending
                               by edit distance then EIDF value, with a fresh index
    """
    # Returns tuples of (word, edit-distance)
    results = TrieNode.search_close_words(
        trie=self.trie,
        word=word,
        max_cost=max_cost,
        edit_distance_algo=edit_distance_algo)
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': For word "' + str(word) + '", found trie node matches ' + str(results))
    if (results is None) or (len(results) == 0):
        return None

    #
    # Any word weighting system can be used
    #
    # Rows of (corrected word, edit distance, eidf value)
    rows = []
    for obj in results:
        # The corrected word returned in tuple
        cor_word = obj[0]
        # The edit distance returned in tuple
        edit_dist = obj[1]
        eidf_rounded = None
        if self.use_word_weighting:
            eidf_val = self.eidf_value[self.eidf_words == cor_word]
            if len(eidf_val) != 1:
                Log.debugdebug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': No EIDF value found for corrected word "' + str(cor_word) + '"')
                continue
            eidf_rounded = round(eidf_val[0], 2)
        rows.append((cor_word, edit_dist, eidf_rounded))

    df = pd.DataFrame({
        SpellCheckWord.COL_CORRECTED_WORD: [row[0] for row in rows],
        SpellCheckWord.COL_EDIT_DISTANCE: [row[1] for row in rows],
        SpellCheckWord.COL_EIDF_VALUE: [row[2] for row in rows]
    })
    sort_columns = [
        SpellCheckWord.COL_EDIT_DISTANCE, SpellCheckWord.COL_EIDF_VALUE
    ]
    df = df.sort_values(by=sort_columns, ascending=True).reset_index(drop=True)
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Corrected words and eidf values: ' + str(df))
    return df
def process_text_training_data(self):
    """
    Detect the language of, and segment/stem, every row of the training data.

    For every row of self.df_training_data:
      1. The language of the text is detected. When nothing is detected, or
         the detected language is not one of self.language_main /
         self.languages_additional, self.language_main is used. The result is
         written to the text language column.
      2. The text is re-processed when self.reprocess_all_text is set, or when
         the stored processed text looks stale (shorter than the raw text, or
         at most 8 characters long for languages with verb conjugation).
      3. When the newly processed text differs from the stored one, the
         segmented text column is updated and a row built by
         __get_row_to_append_to_training_data() is appended to
         self.list_of_rows_with_changed_processed_text.

    :return: None
    """
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': START SEGMENT & STEM DB TRAINING DATA, FORCE RESEGMENT ALL = '
        + str(self.reprocess_all_text))

    td_total_rows = self.df_training_data.shape[0]
    for count, idx_row in enumerate(self.df_training_data.index, start=1):
        raw_text_from_db = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_TEXT].loc[idx_row]
        text_processed_from_db = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED].loc[idx_row]
        intent_td_id = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_TRAINING_DATA_ID].loc[idx_row]
        intent_id = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[idx_row]
        intent_name = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_INTENT_NAME].loc[idx_row]
        # Internal Counter
        internal_counter = self.df_training_data[
            TrDataPreprocessor.TD_INTERNAL_COUNTER].loc[idx_row]

        Log.debugdebug('Processing index row "' + str(idx_row) + '" '
                       + str(self.df_training_data.loc[idx_row]) + '"')

        # Check the type BEFORE converting to string, otherwise the check
        # can never fire and non-string texts pass through unnoticed.
        if not isinstance(raw_text_from_db, str):
            Log.warning(
                str(self.__class__) + ' '
                + str(getframeinfo(currentframe()).lineno)
                + ': Text from DB "' + str(raw_text_from_db)
                + '" not string type.')
        text_from_db = str(raw_text_from_db)

        # When a text is updated in DB/storage, this field should be cleared
        # in DB to NULL. A NULL may surface as None or as a float NaN, so any
        # non-string value is treated as "no processed text" to keep the
        # len() calls below from failing.
        if not isinstance(text_processed_from_db, str):
            text_processed_from_db = ''

        possible_langs = self.lang_detect.detect(text=text_from_db)
        # Empty list means nothing detected, fall back to the main language
        if not possible_langs:
            lang_detected = self.language_main
        else:
            lang_detected = possible_langs[0]

        # If detected language not supported
        if lang_detected not in [self.language_main] + self.languages_additional:
            Log.warning(
                str(self.__class__) + ' '
                + str(getframeinfo(currentframe()).lineno)
                + ': For "' + str(self.model_identifier)
                + '", detected lang "' + str(lang_detected)
                + '" not in languages supported')
            lang_detected = self.language_main

        # Update data frame with language detected. A single .at[row, col]
        # call is used instead of df[col].at[row], because assignment through
        # chained indexing is not guaranteed to write to the original frame.
        self.df_training_data.at[
            idx_row, DaehuaTrainDataModel.COL_TDATA_TEXT_LANG] = lang_detected
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(lang_detected) + '" main lang "'
            + str(self.language_main) + '" for text "'
            + str(text_from_db) + '".')

        #
        # Sanity check only. Should not happen since after every training data
        # update, NULL would be written back to the TextSegmented column.
        # Reprocessing all text takes time, so we guess first.
        #
        is_likely_processed_text_changed = \
            len(text_processed_from_db) < len(text_from_db)
        # If a language has verb conjugation, we cannot just compare length as
        # the original text could be longer
        if self.lang_have_verb_conj[lang_detected]:
            # So we just hardcode
            is_likely_processed_text_changed = len(text_processed_from_db) <= 8

        if is_likely_processed_text_changed:
            if (intent_td_id is not None) and (intent_td_id > 0):
                # Warn only if it is not our own inserted data
                Log.warning(
                    str(self.__class__) + ' '
                    + str(getframeinfo(currentframe()).lineno)
                    + ': Text "' + str(text_from_db)
                    + '" likely has incorrect segmentation "'
                    + str(text_processed_from_db) + '".')

        #
        # We only reprocess the text if there is some likelihood of change
        #
        if self.reprocess_all_text or is_likely_processed_text_changed:
            processed_text_str = self.txt_preprocessor[
                lang_detected].process_text(inputtext=text_from_db,
                                            return_as_string=True)
            Log.debug(
                str(self.__class__) + ' '
                + str(getframeinfo(currentframe()).lineno)
                + ': Text "' + str(text_from_db) + '" processed text "'
                + str(processed_text_str) + '".')

            is_text_processed_changed = \
                text_processed_from_db != processed_text_str
            Log.info(
                str(self.__class__) + ' '
                + str(getframeinfo(currentframe()).lineno)
                + ': No ' + str(count) + ' of ' + str(td_total_rows)
                + ': Tr Data ID "' + str(intent_td_id)
                + '". Force segment = ' + str(self.reprocess_all_text)
                + '\n\r Text "' + str(text_from_db)
                + '". Processed to "' + str(processed_text_str) + '"'
                + ', changed = ' + str(is_text_processed_changed))

            # Training ID 0 are those we inserted ourselves so no need to
            # update anything
            if is_text_processed_changed:
                # Update the column (single-step indexing, see note above)
                self.df_training_data.at[
                    idx_row, DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED
                ] = processed_text_str
                # For intent name we inserted, no need to warn
                if (intent_td_id is not None) and (intent_td_id > 0):
                    Log.warning(
                        str(self.__class__) + ' '
                        + str(getframeinfo(currentframe()).lineno)
                        + ': Processed text different. Text "'
                        + str(text_from_db)
                        + '\n\r new processed text "'
                        + str(processed_text_str) + '"'
                        + '\n\r old processed text "'
                        + str(text_processed_from_db) + '"')

                row_changed = self.__get_row_to_append_to_training_data(
                    intent_id=intent_id,
                    intent_name=intent_name,
                    text=text_from_db,
                    text_id=intent_td_id,
                    processed_text=processed_text_str,
                    lang_detected=lang_detected,
                    internal_counter=internal_counter)
                self.list_of_rows_with_changed_processed_text.append(
                    row_changed)
                Log.important(
                    str(self.__class__) + ' '
                    + str(getframeinfo(currentframe()).lineno)
                    + ': Appended changed row: ' + str(row_changed))
            else:
                Log.important(
                    str(self.__class__) + ' '
                    + str(getframeinfo(currentframe()).lineno)
                    + ': Processed text ' + str(count) + ' ok "'
                    + str(processed_text_str) + '" from "'
                    + str(text_from_db) + '"')
        else:
            Log.info(
                str(self.__class__) + ' '
                + str(getframeinfo(currentframe()).lineno)
                + ': Training data ID ' + str(intent_td_id)
                + ': No ' + str(count) + ' of ' + str(td_total_rows)
                + ': Nothing to do, OK segmented/processed from DB "'
                + str(text_processed_from_db) + '"')
    return
def preprocess_training_data_text(self):
    """
    Run the full text preprocessing pipeline on the training data and try to
    build the embedding parameters from the result.

    The three preparation steps (intent names, text processing, latin form)
    always run. Conversion to padded/encoded documents is best effort: any
    exception there is logged as a warning and not raised.

    :return: tuple of (self.df_training_data, self.embedding_params)
    """
    # Intent names are attached first, this step does no text processing
    self.add_intent_name_to_training_data()
    self.process_text_training_data()
    self.add_latin_form_to_training_data()

    try:
        from nwae.ml.text.TxtTransform import TxtTransform

        frame = self.df_training_data
        segmented_docs = list(
            frame[DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED])
        intent_labels = list(frame[DaehuaTrainDataModel.COL_TDATA_INTENT_ID])
        doc_langs = list(frame[DaehuaTrainDataModel.COL_TDATA_TEXT_LANG])

        # Conversion to padded docs
        transformed = TxtTransform(
            docs=segmented_docs,
            labels=intent_labels,
            langs=doc_langs,
        ).create_padded_docs()

        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Padded Docs: ' + str(transformed.padded_encoded_docs)
            + ', Labels: ' + str(transformed.encoded_labels))
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Labels Categorical: '
            + str(transformed.encoded_labels_categorical))

        self.embedding_params = EmbeddingParams(
            x=transformed.padded_encoded_docs,
            x_original=transformed.original_docs,
            y=np.array(transformed.encoded_labels),
            y_original=transformed.y_original,
            x_one_hot_dict=transformed.x_one_hot_dict,
            y_one_hot_dict=transformed.y_one_hot_dict,
            max_sent_len=transformed.max_x_length,
            max_label_val=max(transformed.encoded_labels),
            vocab_size=transformed.vocabulary_dimension)
        params = self.embedding_params

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converted ' + str(len(params.x))
            + ' rows padded docs. Max sentence length = '
            + str(params.max_sent_len)
            + ', max label value = ' + str(params.max_label_val)
            + ', vocabulary size = ' + str(params.vocab_size)
            + ', x one hot dict: ' + str(params.x_one_hot_dict))
        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Original docs:\n\r' + str(params.x_original)
            + '\n\rEncoded padded docs\n\r:' + str(params.x)
            + '\n\rOriginal labels\n\r' + str(params.y_original)
            + '\n\rEncoded labels\n\r' + str(params.y))
    except Exception as ex_embed:
        errmsg = (
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error converting to training text to embed params: '
            + str(ex_embed))
        # Deliberately best effort: warn only, do not raise
        Log.warning(errmsg)

    return self.df_training_data, self.embedding_params
def unit_test_predict_classes(
        self,
        word_freq_model,
        include_match_details=False,
        top=5,
):
    """
    Unit test of class prediction, first on numeric feature vectors through
    the model object directly, then on raw text through PredictClass.

    Part 1 loads the model named self.model_name, maps the test matrix to the
    requested word frequency measure, reorders its columns to the model's
    feature order, predicts every row one by one and counts top-1 hits and
    misses in self.res_final.

    Part 2 builds a PredictClass for the hypersphere metric space model and
    asserts that the top predicted class of every text in self.DATA_TEXTS
    equals the matching label in self.DATA_Y.

    :param word_freq_model: word frequency measure, mapped through
        WordFreqDocMatrix.map_to_feature_vect_word_freq_measure()
    :param include_match_details: passed to the model's predict calls when the
        model is the hypersphere metric space model
    :param top: number of top classes requested, and the largest "top t"
        window used when comparing observed with expected classes
    :return: None, results are accumulated in self.res_final
    """
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Test predict classes using model "' + str(self.model_name) + '".')

    # Unit test using direct text (PredictClass.py) is in PredictClass.py itself
    model_obj = ModelHelper.get_model(
        model_name=self.model_name,
        model_params=None,
        identifier_string=self.identifier_string,
        dir_path_model=self.ut_params.dirpath_model,
        training_data=None)
    # Start the model object and block until it reports the model is ready
    model_obj.start()
    model_obj.wait_for_model()
    #model_obj.load_model_parameters()

    test_x = UnitTestMetricSpaceModel.DATA_TEST_X
    test_x_name = UnitTestMetricSpaceModel.DATA_TEST_X_NAME
    model_x_name = model_obj.get_model_features()
    # Fall back to the unit test's own feature names if the model has none
    if model_x_name is None:
        model_x_name = UnitTestMetricSpaceModel.DATA_X_NAME

    # Apply the same frequency transform to the test data as the model uses:
    # a sigmoid rescaled to 2 * (sigmoid(x) - 0.5), or log(1 + x)
    word_freq_model_mapped = WordFreqDocMatrix.map_to_feature_vect_word_freq_measure(
        freq_measure=word_freq_model)
    if word_freq_model_mapped in [
            WordFreqDocMatrix.BY_SIGMOID_FREQ,
            WordFreqDocMatrix.BY_SIGMOID_FREQ_NORM
    ]:
        test_x = 2 * ((1 / (1 + np.exp(-test_x))) - 0.5)
    elif word_freq_model_mapped in [
            WordFreqDocMatrix.BY_LOG_FREQ, WordFreqDocMatrix.BY_LOG_FREQ_NORM
    ]:
        test_x = np.log(1 + test_x)
    else:
        pass
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Mapped to word freq model "' + str(word_freq_model_mapped)
        + '" to ' + str(test_x))

    # A 2-dimensional feature name array is reduced to its first row
    if model_x_name.ndim == 2:
        model_x_name = model_x_name[0]
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Model x_name: ' + str(model_x_name))

    # Reorder by model x_name
    df_x_name = pd.DataFrame(data={
        'word': model_x_name,
        'target_order': range(0, len(model_x_name), 1)
    })
    df_test_x_name = pd.DataFrame(
        data={
            'word': test_x_name,
            'original_order': range(0, len(test_x_name), 1)
        })
    # Log.debug('**** Target Order: ' + str(model_x_name))
    # Log.debug('**** Original order: ' + str(test_x_name))
    # Left join to ensure the order follows target order and target symbols
    df_x_name = df_x_name.merge(df_test_x_name, how='left')
    # Log.debug('**** Merged Order: ' + str(df_x_name))
    # Then sort by the target (model) order
    df_x_name = df_x_name.sort_values(by=['target_order'], ascending=True)
    # The reorder index is the original_order column, read in target order
    reorder = np.array(df_x_name['original_order'])
    self.res_final.update_bool(res_bool=UnitTest.assert_true(
        observed=reorder.tolist(),
        expected=self.REORDER_FEATURE_NAMES_WITH_UNK.tolist(),
        test_comment='Test reorder of feature names ' + str(reorder)))

    # Work on the transpose so that each row is one feature column of test_x,
    # move the rows into target order, then transpose back
    test_x_transpose = test_x.transpose()
    Log.debugdebug(test_x_transpose)
    reordered_test_x = np.zeros(shape=test_x_transpose.shape)
    Log.debugdebug(reordered_test_x)
    for i in range(0, reordered_test_x.shape[0], 1):
        reordered_test_x[i] = test_x_transpose[reorder[i]]
    reordered_test_x = reordered_test_x.transpose()
    Log.debugdebug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Reordered test x = ' + str(reordered_test_x))

    x_classes_expected = self.y
    # Just the top predicted ones
    all_y_observed_top = []
    all_y_observed = []
    # Accumulates the squared top class distances (a sum, it is never divided)
    mse = 0
    count_all = reordered_test_x.shape[0]

    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Predict classes for x:\n\r' + str(reordered_test_x))
    prf_start = prf.Profiling.start()

    # Predict one test point at a time so the loop can be profiled
    for i in range(reordered_test_x.shape[0]):
        v = npUtil.NumpyUtil.convert_dimension(arr=reordered_test_x[i], to_dim=2)
        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Testing x: ' + str(v))
        # Only the hypersphere metric space model is called with the
        # include_match_details and top arguments
        if self.model_name == ModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE:
            predict_result = model_obj.predict_class(
                x=v,
                include_match_details=include_match_details,
                top=top)
        else:
            predict_result = model_obj.predict_class(x=v)
        y_observed = predict_result.predicted_classes
        all_y_observed_top.append(y_observed[0])
        all_y_observed.append(y_observed)
        top_class_distance = predict_result.top_class_distance
        match_details = predict_result.match_details

        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Point v ' + str(v) + ', predicted ' + str(y_observed)
            + ', Top Class Distance: ' + str(top_class_distance)
            + ', Match Details:\n\r' + str(match_details))

        if self.model_name == ModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE:
            metric = top_class_distance
            mse += metric**2

    prf_dur = prf.Profiling.get_time_dif(prf_start, prf.Profiling.stop())
    # NOTE(review): divides by count_all, which is 0 for an empty test matrix
    Log.important(
        str(self.__class__) + str(getframeinfo(currentframe()).lineno)
        + ' PROFILING ' + str(count_all) + ' calculations: '
        + str(round(1000 * prf_dur, 0)) + ', or '
        + str(round(1000 * prf_dur / count_all, 2))
        + ' milliseconds per calculation')

    # Compare with expected, for every window of the top t predictions
    compare_top_x = {}

    for t in range(1, top + 1, 1):
        # True or '1' means not correct or error
        compare_top_x[t] = np.array([True] * len(all_y_observed))
        for i in range(len(all_y_observed)):
            matches_i = all_y_observed[i]
            if x_classes_expected[i] in matches_i[0:t]:
                # False or '0' means no error
                compare_top_x[t][i] = False
                # Multiplying by (t == 1) counts only the top-1 comparison
                self.res_final.count_ok += 1 * (t == 1)
            else:
                self.res_final.count_fail += 1 * (t == 1)
        Log.info(compare_top_x[t])
        Log.info('Total Errors (compare top #' + str(t) + ') = '
                 + str(np.sum(compare_top_x[t] * 1)))

    Log.info('mse = ' + str(mse))

    # For the hypersphere model, also predict all points in one batch call
    if self.model_name == ModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE:
        predict_result = model_obj.predict_classes(
            x=reordered_test_x,
            include_match_details=include_match_details,
            top=top)
        Log.info('Predicted Classes:\n\r' + str(predict_result.predicted_classes))
        Log.info('Top class distance:\n\r' + str(predict_result.top_class_distance))
        Log.info('Match Details:\n\r' + str(predict_result.match_details))
        Log.info('MSE = ' + str(predict_result.mse))

    model_obj.join()

    #
    # Test using PredictClass
    #
    from nwae.lang.LangFeatures import LangFeatures
    from nwae.ml.PredictClass import PredictClass
    predict = PredictClass(
        model_name=ModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE,
        identifier_string=UnitTestMetricSpaceModel.IDENTIFIER_STRING,
        dir_path_model=self.ut_params.dirpath_model,
        lang=LangFeatures.LANG_KO,
        dir_wordlist=self.ut_params.dirpath_wordlist,
        postfix_wordlist=self.ut_params.postfix_wordlist,
        dir_wordlist_app=self.ut_params.dirpath_app_wordlist,
        postfix_wordlist_app=self.ut_params.postfix_app_wordlist,
        dirpath_synonymlist=self.ut_params.dirpath_synonymlist,
        postfix_synonymlist=self.ut_params.postfix_synonymlist,
        word_freq_model=word_freq_model_mapped,
        do_spelling_correction=False,
        do_profiling=True)

    for i in range(len(self.DATA_TEXTS)):
        label = self.DATA_Y[i]
        text_arr = self.DATA_TEXTS[i]
        text = ' '.join(text_arr)
        # Return all results in the top 5
        res = predict.predict_class_text_features(
            inputtext=text,
            match_pct_within_top_score=0,
            include_match_details=True,
            top=5,
        )
        self.res_final.update_bool(res_bool=UnitTest.assert_true(
            observed=res.predict_result.predicted_classes[0],
            expected=label,
            test_comment='Test "' + str(text) + '" label ' + str(label)))
        Log.debug(
            str(self.__class__) + str(getframeinfo(currentframe()).lineno)
            + ': ' + str(i) + '. Match Details word freq model "'
            + str(predict.word_freq_model) + '" '
            + str(res.predict_result.match_details))

        # Predict again with the sigmoid frequency measure, logged only.
        # NOTE(review): the measure is not restored afterwards, so from the
        # second text onwards the asserted prediction above also runs with
        # the sigmoid measure - confirm this is intended.
        predict.word_freq_model = WordFreqDocMatrix.map_to_feature_vect_word_freq_measure(
            freq_measure=WordFreqDocMatrix.BY_SIGMOID_FREQ)
        res = predict.predict_class_text_features(
            inputtext=text,
            match_pct_within_top_score=0,
            include_match_details=True,
            top=5,
        )
        Log.debug(
            str(self.__class__) + str(getframeinfo(currentframe()).lineno)
            + ': ' + str(i) + '. Match Details word freq model "'
            + str(predict.word_freq_model) + '" '
            + str(res.predict_result.match_details))

    # Kill any background jobs
    predict.stop_model_thread()
    return