def xor_string(self, s1, s2):
    Log.debug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': XOR between "' + str(s1) + '" and "' + str(s2) + '".')
    len_s1 = len(s1)
    len_s2 = len(s2)
    len_max = max(len(s1), len(s2))
    # Append to the shorter one, in a repeat manner
    for i in range(len(s1), len_max, 1):
        s1 += s1[(i - len_s1)]
    for i in range(len(s2), len_max, 1):
        s2 += s2[(i - len_s2)]
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': After appending, XOR between "' + str(s1) + '" and "' + str(s2) + '".')
    Log.debugdebug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': s1 "' + str(s1) + '", s2 "' + str(s2) + '"')
    b1 = bytes(s1, encoding=Obfuscate.STRING_ENCODING)
    b2 = bytes(s2, encoding=Obfuscate.STRING_ENCODING)
    bytes_xor = self.xor_bytes(b1=b1, b2=b2)
    return bytes_xor
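# Illustrative standalone sketch (not part of the original class): the same
# repeat-padding idea as xor_string above, with the byte-wise XOR done inline
# instead of delegating to a separate xor_bytes helper. Assumes utf-8 encoding.
def _xor_string_sketch(s1, s2, encoding='utf-8'):
    len_s1, len_s2 = len(s1), len(s2)
    len_max = max(len_s1, len_s2)
    # Pad the shorter string by repeating its own characters
    for i in range(len_s1, len_max):
        s1 += s1[i - len_s1]
    for i in range(len_s2, len_max):
        s2 += s2[i - len_s2]
    b1 = bytes(s1, encoding=encoding)
    b2 = bytes(s2, encoding=encoding)
    return bytes(a ^ b for a, b in zip(b1, b2))

# E.g. 'abc' is padded to 'abcab' before being XOR-ed with 'hello'
# print(_xor_string_sketch('abc', 'hello'))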
def scrape_url(
        self,
        url,
        parser='html.parser',
        tag_to_find='p',
):
    try:
        sents = []
        resp = requests.get(url=url)
        soup = BeautifulSoup(resp.content, parser)
        contents_tag = soup.find_all(tag_to_find)
        for cont in contents_tag:
            txt = StringUtils.trim(cont.get_text())
            sent_list = txt.split('。')
            sent_list = [StringUtils.trim(s) for s in sent_list if s]
            if len(sent_list):
                sents += sent_list
                Log.debug('Split "' + str(txt) + '" into:' + str(sent_list))
                # [Log.debug('\t"' + str(s) + '"') for s in sent_list]
        return sents
    except Exception as ex:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error scraping url "' + str(url) + '", exception: ' + str(ex))
def run_unit_test(self):
    res_final = ut.ResultObj(count_ok=0, count_fail=0)
    lang = LangFeatures.LANG_TH
    test_sent = [
        # Case words segmented correctly to ['มี', 'เงน', 'ที่', 'ไหน'] and 'เงน' corrected to 'เงิน'
        ['มีเงนที่ไหน', ['มี', 'เงิน', 'ที่', 'ไหน']],
        # ['การแพร่ระบาดของเชื้อไวรัสโควิด-19',
        #  ['การ', 'แพร่', 'ระบาด', 'ของ', 'เชื้อ', 'ไวรัส', 'โค', 'วิด', '-19']],
        # ['ในทั่วโลกยังเพิ่มขึ้นไม่หยุด',
        #  ['ใน', 'ทั่ว', 'โลก', 'ยัง', 'เพิ่ม', 'ขึ้น', 'ไม่', 'หยุด']]
    ]
    for obj in test_sent:
        s = obj[0]
        arr_expected = obj[1]
        seg = self.word_segmenter[lang].segment_words(
            text=s, return_array_of_split_words=True)
        Log.debug('"' + s + '" segmented to ' + str(seg))
        arr_cor = self.spell_corr[lang].check(text_segmented_arr=seg)
        Log.debug('Corrections array: ' + str(arr_cor))
        res_final.update_bool(
            res_bool=ut.UnitTest.assert_true(
                observed=arr_cor,
                expected=arr_expected,
                test_comment='Test "' + str(s) + '" to ' + str(arr_cor)))
    return res_final
def filter_sentence_by_pos_tag_japanese(
        self,
        # string or word list
        word_list,
        keep_tags=DEFAULT_KEEP_TAGS_JAP,
):
    try:
        import nagisa
    except Exception as ex:
        raise Exception(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unable to load nagisa: ' + str(ex))

    if type(word_list) in [list, tuple]:
        text = ' '.join(word_list)
    else:
        text = word_list

    words_postags_obj = nagisa.tagging(text)
    txt_sym_tok = words_postags_obj.words
    txt_sym_postags = words_postags_obj.postags
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Japanese segmentation ' + str(txt_sym_tok)
        + ', word & POS tags: ' + str(txt_sym_postags))

    words_postags = list(zip(txt_sym_tok, txt_sym_postags))
    sent_filtered = [w for w, t in words_postags if (t in keep_tags)]
    Log.debugdebug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': POS TAGs: ' + str(words_postags))
    Log.debugdebug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Filtered sentence: ' + str(sent_filtered))
    return sent_filtered
def __convert_preprocessed_text_to_training_data_model_for_nn_dense(
        tr_data_preprocessor
):
    x = tr_data_preprocessor.embedding_x
    y = tr_data_preprocessor.embedding_y
    x_one_hot_dict = tr_data_preprocessor.embedding_x_one_hot_dict
    n_rows = len(x)
    max_sentence_len = tr_data_preprocessor.embedding_max_sentence_len
    max_label_value = tr_data_preprocessor.embedding_max_label_val
    vocabulary_size = tr_data_preprocessor.embedding_vocab_size
    Log.debug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Padded Docs: ' + str(x)
    )
    # print('Padded docs: ' + str(res.padded_docs))
    # print('List labels: ' + str(res.list_labels))
    return tdm.TrainingDataModel(
        x=x,
        y=y,
        x_one_hot_dict=x_one_hot_dict,
        is_map_points_to_hypersphere=False
    )
def convert_format(self, filepath, to_format='wav'):
    file_extension = self.get_audio_filepath_extension(filepath=filepath)
    # Replace the existing extension with the target format's extension
    filepath_converted = re.sub(
        pattern='[.][a-zA-Z0-9]+$', repl='.' + to_format, string=filepath)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Convert "' + str(filepath) + '" with extension "' + str(file_extension)
        + '" New filepath "' + str(filepath_converted) + '"')
    try:
        track = AudioSegment.from_file(file=filepath, format=file_extension)
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converting "' + str(filepath) + '" to "' + str(filepath_converted) + '"..')
        file_handle = track.export(filepath_converted, format=to_format)
        file_handle.close()
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Successful Conversion from "' + str(filepath)
            + '" to "' + str(filepath_converted) + '"..')
        return filepath_converted
    except Exception as ex:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Exception converting "' + str(filepath) + '" to "' + str(filepath_converted)
            + '": ' + str(ex))
def run_unit_test(self):
    dt = LangDetect()
    res_final = ut.ResultObj(count_ok=0, count_fail=0)
    start_all_time = Profiling.start()
    for text, expected in LangDetectUnitTest.TEST_TEXT_LANG:
        start_time = Profiling.start()
        observed = dt.detect(text=text)
        ms = round(
            1000 * Profiling.get_time_dif_secs(start=start_time, stop=Profiling.stop()), 2)
        Log.debug('Took ' + str(ms) + ' ms')
        res_final.update_bool(
            res_bool=ut.UnitTest.assert_true(
                observed=observed,
                expected=expected,
                test_comment='test lang "' + str(expected) + '", text "' + str(text) + '"'))
    end_all_time = Profiling.stop()
    avg_per_text_ms = 1000 * Profiling.get_time_dif_secs(
        start=start_all_time, stop=end_all_time) / len(LangDetectUnitTest.TEST_TEXT_LANG)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Average ' + str(round(avg_per_text_ms, 2)) + 'ms per text (total '
        + str(len(LangDetectUnitTest.TEST_TEXT_LANG)) + ' sentences)')
    return res_final
def encode(
        self,
        # E.g. {'china': 1, 'russia': 2, ..}
        word_list,
        # E.g. [('china', 'dimsum'), ('russia', 'xleb'), ..]
        word_tuples_list,
):
    oh_enc = OneHotEncoder()
    self.words_onehot = oh_enc.encode(
        feature_list=word_list
    )
    self.word_index_dict = oh_enc.get_feature_index_dict()
    self.index_word_dict = {v: k for k, v in self.word_index_dict.items()}
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Unique word dictionary, length ' + str(len(self.word_index_dict))
        + ': ' + str(self.word_index_dict)
    )
    X = []
    Y = []
    for t in word_tuples_list:
        root_word = t[0]
        root_word_index = self.word_index_dict[root_word]
        close_word = t[1]
        close_word_index = self.word_index_dict[close_word]
        X.append(self.words_onehot[root_word_index])
        Y.append(self.words_onehot[close_word_index])
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': X: ' + str(X) + '\nY: ' + str(Y)
    )
    return np.array(X), np.array(Y)
def __init__(
        self,
        # 16 or 32 byte key
        key,
        nonce=None,
        mode=AES_MODE_EAX,
        text_encoding='utf-8'):
    self.key = key
    Log.debug('Using key ' + str(self.key) + '. Size = ' + str(len(self.key)) + '.')

    self.cipher_mode_str = mode
    if self.cipher_mode_str == AES_Encrypt.AES_MODE_EAX:
        self.cipher_mode = AES.MODE_EAX
    elif self.cipher_mode_str == AES_Encrypt.AES_MODE_CBC:
        self.cipher_mode = AES.MODE_CBC
    else:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unsupported AES mode "' + str(self.cipher_mode_str) + '"')

    if nonce is None:
        # Must be 16 bytes
        # nonce = key[0:16]
        nonce = AES_Encrypt.generate_random_bytes(
            size=AES_Encrypt.SIZE_NONCE, printable=True)
    self.nonce = nonce
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Using nonce "' + str(self.nonce) + '". Size = ' + str(len(self.nonce)))

    self.text_encoding = text_encoding
    return
def add_parent(self, parent):
    if parent.dead_node:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Parent "' + str(parent.name)
            + '" is dead node (cant have children), not adding parent for node "'
            + str(self.name) + '"'
        )
        return

    assert type(parent) is MultiTreeNode

    if parent.name in self.parent_names:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': For node "' + str(self.name) + '" parent "' + str(parent.name)
            + '" already exists'
        )
    else:
        # Don't add if already exists as parent, anywhere higher up the tree hierarchy
        if self.is_higher_level(node=parent, supposed_child_node=self):
            return
        # Update for both parent and child
        self.parents.append(parent)
        self.update()
        parent.children.append(self)
        parent.update()
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': For node "' + str(self.name) + '" successfully added parent "'
            + str(parent.name) + '"'
        )
def check_prediction_stats(
        self,
        X,
        Y,
        y_predicted,
):
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Checking prediction stats..')
    # print(y_predicted)
    # print(type(y_predicted))
    # print(y_predicted.shape)
    # print(np.sum(y_predicted, axis=1).tolist())

    # Compare some data
    count_correct = 0
    for i in range(X.shape[0]):
        data_i = X[i]
        label_i = Y[i]
        prob_distribution = y_predicted[i]
        top_x = NumpyUtil.get_top_indexes(data=prob_distribution, ascending=False, top_x=5)
        if top_x[0] == label_i:
            count_correct += 1
        Log.debug(
            str(i) + '. ' + str(data_i) + ': Label=' + str(label_i)
            + ', predicted=' + str(top_x))
    Log.important('Boosting Accuracy = ' + str(100 * count_correct / X.shape[0]) + '%.')
    return
def __init__(self, format, n_channels, frame_rate, n_frames, sample_width, data_bytes):
    self.format = format
    self.n_channels = n_channels
    self.frame_rate = frame_rate
    self.n_frames = n_frames
    self.sample_width = sample_width
    # total_bytes_per_frame = sample_width * n_channels
    self.bytes_per_frame = int(self.n_channels * self.sample_width)

    # Anything above 8 bits is signed, only 8-bit is unsigned
    self.data_type = np.uint8
    if self.sample_width == 2:
        self.data_type = np.int16
    elif self.sample_width > 2:
        raise Exception('Wrong sample width ' + str(self.sample_width) + ' > 2')

    # total_bytes = total_bytes_per_frame * total_frames
    self.data_bytes_len = int(self.bytes_per_frame * self.n_frames)
    self.data_bytes = data_bytes

    #
    # Extract channel raw values
    #
    audio_as_np = np.frombuffer(buffer=self.data_bytes, dtype=self.data_type)
    self.np_data = audio_as_np.astype(np.float32)
    # Normalise float32 array so that values are between -1.0 and +1.0
    n_bits = 8 * self.sample_width - 1
    self.np_data_normalized = self.np_data / (2 ** n_bits)

    # Now add additional dimension for channel
    self.np_data_by_channel = np.zeros(
        shape=(self.n_channels, self.n_frames), dtype=self.data_type, order='C')
    # Just an array 0,1,2,3,... to symbolize indexes
    n_sample = np.array(list(range(len(self.np_data_normalized))))
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Normalized data by channel shape: ' + str(self.np_data_by_channel.shape))
    if self.n_channels > 1:
        for chnl in range(self.n_channels):
            # Pick the correct indexes for this channel
            # (samples are interleaved by channel, so step by n_channels)
            indexes = n_sample % self.n_channels == chnl
            channel_n_frames = np.sum(indexes * 1)
            assert channel_n_frames == self.n_frames, \
                'Channel ' + str(chnl) + ' with ' + str(channel_n_frames) \
                + ' frames not ' + str(self.n_frames)
            # Assign channel data
            self.np_data_by_channel[chnl] = self.np_data[indexes]
    else:
        self.np_data_by_channel[0] = self.np_data.copy()
    return
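# Quick illustrative check (assumption: samples are interleaved by channel,
# L,R,L,R,..., as in standard PCM): selecting every n_channels-th sample recovers
# one channel, which is why the channel mask above uses the channel count.
def _deinterleave_demo():
    import numpy as np
    interleaved = np.array([10, 20, 11, 21, 12, 22], dtype=np.int16)  # 3 stereo frames
    n_channels = 2
    idx = np.arange(len(interleaved))
    left = interleaved[idx % n_channels == 0]   # -> [10, 11, 12]
    right = interleaved[idx % n_channels == 1]  # -> [20, 21, 22]
    return left, right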
def calculate_metric(
        self,
        x,
        prd_attrs,
        # For a large matrix, computing the normalization is very slow
        force_normalization,
        metric,
):
    if force_normalization:
        # For a large matrix, this computation is very slow
        x_new = self.normalize_euclidean(x=x)
        prd_attrs_new = self.normalize_euclidean(x=prd_attrs)
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': x normalized: ' + str(x_new)
            + '\n\rp normalized: ' + str(prd_attrs_new))
    else:
        x_new = x
        prd_attrs_new = prd_attrs

    """ Sum over the last axis """
    # sum_axis = 1 + 1 * (ref_dna.shape[0] > 1)
    sum_axis = len(x_new.shape) - 1

    if metric == self.METRIC_COSINE:
        # Fast method, just like an NN layer
        distances = np.matmul(x_new, prd_attrs_new.transpose())
        # nan can occur for a nan product with a 0-vector
        condition_nan = np.isnan(distances)
        distances[condition_nan] = -1
        if sum_axis == 1:
            distances = np.reshape(distances, newshape=(prd_attrs_new.shape[0]))
            indxs_dist_sort = np.flip(np.argsort(distances), axis=0)
        else:
            distances = np.reshape(
                distances, newshape=(x_new.shape[0], prd_attrs_new.shape[0]))
            indxs_dist_sort = np.flip(np.argsort(distances), axis=1)
    elif metric == self.METRIC_EUCLIDEAN:
        # Slow, but more accurate for certain situations
        diff = x_new - prd_attrs_new
        distances = np.sqrt(np.sum((diff) ** 2, axis=sum_axis))
        # nan can occur for a nan product with a 0-vector
        condition_nan = np.isnan(distances)
        distances[condition_nan] = np.inf
        indxs_dist_sort = np.argsort(distances)
    else:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': No such metric "' + str(metric) + '" supported')

    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Distances: ' + str(distances) + ' indexes sorted: ' + str(indxs_dist_sort))
    # Return the indexes sorted by distance
    return indxs_dist_sort
def predict_class_features(
        self,
        # This is the point given in feature format, instead of standard array format
        x_transformed,
        top=MATCH_TOP,
        match_pct_within_top_score=CONSTANT_PERCENT_WITHIN_TOP_SCORE,
        include_match_details=False,
        # Any relevant ID for logging purpose only
        id=None):
    self.wait_for_model_to_be_ready()
    self.wait_for_all_initializations_to_be_done()

    self.count_predict_calls = self.count_predict_calls + 1

    starttime_predict_class = prf.Profiling.start()

    predict_result = self.model.predict_class(
        x=x_transformed,
        top=top,
        include_match_details=include_match_details)

    #
    # Choose which scores to keep, we only have scores if we included the match details
    #
    if include_match_details:
        df_match = predict_result.match_details
        if df_match is not None:
            top_score = float(
                df_match[ModelInterface.TERM_SCORE].loc[df_match.index[0]])
            df_match_keep = df_match[
                df_match[ModelInterface.TERM_SCORE] >= top_score * match_pct_within_top_score]
            df_match_keep = df_match_keep.reset_index(drop=True)
            # Overwrite data frame
            predict_result.match_details = df_match_keep

    y_observed = predict_result.predicted_classes
    top_class_distance = predict_result.top_class_distance

    Log.debug(
        str(self.__class__) + str(getframeinfo(currentframe()).lineno)
        + ': Input x: ' + str(x_transformed)
        + ', observed class: ' + str(y_observed)
        + ', top distance: ' + str(top_class_distance))

    if self.do_profiling:
        Log.debug(
            str(self.__class__) + str(getframeinfo(currentframe()).lineno)
            + ': ID="' + str(id) + '", x="' + str(x_transformed) + '"'
            + ' PROFILING predict class: '
            + prf.Profiling.get_time_dif_str(starttime_predict_class, prf.Profiling.stop()))

    return predict_result
def preprocess_text(self):
    self.sentences_cleaned = [
        self.txt_pp.process_text(inputtext=s, return_as_string=False)
        for s in self.training_text_list
    ]
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Processed sentences: ' + str(self.sentences_cleaned)
    )
    return self.sentences_cleaned
def __init__(
        self,
        # A list of text sentences in list type, already in lowercase and cleaned of None or ''.
        # Preprocessing assumed to be done and no text processing will be done here.
        sentences_list,
):
    self.sentences_list = sentences_list
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Sentences list (before filter):\n\r' + str(self.sentences_list))
    return
def __init__(
        self,
        form,
        text_list_confirm_words=DEFAULT_OK,
        text_confirm_question='Please confirm answer ' + str(DEFAULT_OK),
        text_ask_field_value_prefix='Please provide',
        text_newline_char='<br/>',
        text_space_char=' ',
        text_html_font_start_tag='<font color="blue">',
        text_html_font_end_tag='</font>',
        # For deserializing old objects so the old state is maintained
        error_count_quit_threshold=2,
        form_state=None,
        fill_form_continuous_err_count=0,
        conv_current_field_index=None,
        conv_current_field_name=None):
    if type(form) is not daehua_form.Form:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Wrong form type "' + str(type(form))
            + '". Expected type "' + str(daehua_form.Form) + '"')

    # Keep the original form, and extended params
    self.form = form
    self.text_list_confirm_words = [str(s) for s in text_list_confirm_words]
    self.text_confirm_question = str(text_confirm_question)
    self.text_ask_field_value_prefix = str(text_ask_field_value_prefix)
    self.text_newline_char = str(text_newline_char)
    self.text_space_char = str(text_space_char)
    self.text_html_font_start_tag = str(text_html_font_start_tag)
    self.text_html_font_end_tag = str(text_html_font_end_tag)
    self.error_count_quit_threshold = error_count_quit_threshold

    self.text_form_title = self.form.get_title_text()
    self.mex_expressions = self.form.mex_form_model
    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Mex Expressions: ' + str(self.mex_expressions) + '.')

    self.form_state = form_state
    self.fill_form_continuous_err_count = fill_form_continuous_err_count
    self.conv_current_field_index = conv_current_field_index
    self.conv_current_field_name = conv_current_field_name
    if self.form_state is None:
        self.reset()
    return
def word_tokenize(self, sentences_list):
    sentences_segmt = [s.split(' ') for s in sentences_list]
    # Remove basic punctuations stuck to word
    sentences_cleanpunc = [
        BasicPreprocessor.clean_punctuations(sentence=s)
        for s in sentences_segmt
    ]
    for i in range(len(sentences_cleanpunc)):
        Log.debug(
            # str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            # + ': Text "' + str(sentences_segmt[i])
            # + '" clean punctuations to: ' + str(sentences_cleanpunc[i])
            sentences_cleanpunc[i])
    # Return the punctuation-cleaned token lists
    return sentences_cleanpunc
def get_pct_intersection_with_common_words(
        self,
        word_list,
        # In the case of Vietnamese, we might have to form words from the syllables
        max_word_n_tuple=1
):
    if max_word_n_tuple == 1:
        lang_intersection = set(word_list).intersection(self.get_common_words())
        pct_intersection = len(lang_intersection) / len(set(word_list))
    else:
        # Means we are looking not just at the current token, but form a word from
        # continuous tokens up to max_word_n_tuple (usually not more than 2)
        len_word_list = len(word_list)
        count_int = 0
        cur_index = 0
        actual_word_count = 0
        # Loop by each token in the word list (or rather token list)
        while cur_index < len_word_list:
            max_n_tuple_lookforward = min(max_word_n_tuple, len_word_list - cur_index)
            for j in range(max_n_tuple_lookforward, 0, -1):
                # Look from j tokens ahead
                end_index = cur_index + j
                # For the j-tuple word
                w = ' '.join(word_list[cur_index:end_index])
                Log.debugdebug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Test word "' + str(w) + '", cur_index=' + str(cur_index)
                    + ', j=' + str(j))
                if w in self.get_common_words():
                    count_int += 1
                    # Move forward to the end of the token from the word found
                    cur_index += j - 1
                    Log.debugdebug(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Found word "' + str(w) + '"')
                    break
            cur_index += 1
            actual_word_count += 1
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Count Intersection = ' + str(count_int)
            + ', actual word count = ' + str(actual_word_count)
        )
        pct_intersection = count_int / actual_word_count

    Log.debug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': "' + str(self.lang) + '" intersection = ' + str(pct_intersection)
    )
    return pct_intersection
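# Illustrative standalone sketch (not from the original class) of the same greedy
# n-tuple matching idea used above: try the longest multi-token word first, then
# fall back to single tokens. The names here are hypothetical.
def _count_ntuple_matches(word_list, common_words, max_word_n_tuple=2):
    count_int = 0
    actual_word_count = 0
    cur_index = 0
    while cur_index < len(word_list):
        for j in range(min(max_word_n_tuple, len(word_list) - cur_index), 0, -1):
            w = ' '.join(word_list[cur_index:cur_index + j])
            if w in common_words:
                count_int += 1
                cur_index += j - 1
                break
        cur_index += 1
        actual_word_count += 1
    return count_int, actual_word_count

# E.g. tokens ['xin', 'chào', 'abc'] with common_words={'xin chào'} give (1, 2):
# 'xin chào' counts as one matched word, 'abc' as one unmatched word.
# print(_count_ntuple_matches(['xin', 'chào', 'abc'], {'xin chào'}, max_word_n_tuple=2))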
def train_from_partial_models(
        self,
        write_model_to_storage=True,
        write_training_data_to_storage=False,
        # Log training events
        logs=None):
    #
    # Load EIDF first
    # TODO How to ensure there are no missing words?
    #
    x_name = self.training_data.get_x_name()
    try:
        if type(logs) is list:
            self.logs_training = logs
        else:
            self.logs_training = []
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Initializing IDF object.. try to read from file first',
            log_list=self.logs_training)
        # Try to read from file
        df_eidf_file = eidf.Eidf.read_eidf_from_storage(
            dir_path_model=self.dir_path_model,
            identifier_string=self.identifier_string,
            x_name=x_name,
            log_training=self.logs_training)
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Successfully Read EIDF from file',
            log_list=self.logs_training)
        self.model_data.idf = np.array(df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF])
    except Exception as ex_eidf:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': No EIDF from file available. Exception ' + str(ex_eidf)
        Log.critical(errmsg, log_list=self.logs_training)
        raise Exception(errmsg)

    # Standardize to at least 2-dimensional, easier when weighting x
    self.model_data.idf = npUtil.NumpyUtil.convert_dimension(
        arr=self.model_data.idf, to_dim=2)

    #
    # Combines
    #
    self.model_data.load_model_from_partial_trainings_data(
        td_latest=self.training_data,
        log_training=self.logs_training)

    return self.logs_training
def train(
        self,
        X,
        Y
):
    # Defining the size of the embedding
    embed_size = 2

    # Defining the neural network
    inp = Input(shape=(X.shape[1],))
    Log.debug('Input shape: ' + str(X.shape))
    # Middle layer is the embedding vector we seek to extract
    # "linear" because this will serve as the word definition, to be input to other neural networks
    x = Dense(units=embed_size, activation='linear')(inp)
    # Standard softmax final layer
    x = Dense(units=Y.shape[1], activation='softmax')(x)
    model = Model(inputs=inp, outputs=x)
    Log.debug('Output shape: ' + str(Y.shape))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.summary()

    # Optimizing the network weights
    model.fit(
        x=X,
        y=Y,
        batch_size=256,
        epochs=100
    )

    # Obtaining the weights from the neural network.
    # These are the so called word embeddings
    # The input layer (embedding weights)
    weights = model.get_weights()[0]
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Weights extracted as embedding layer: ' + str(weights)
    )
    print(len(weights))

    # Creating a dictionary to store the embeddings in. The key is a unique word and
    # the value is the numeric vector
    embedding_dict = {}
    for word in self.word_index_dict.keys():
        embedding_dict.update({
            word: weights[self.word_index_dict.get(word)]
        })
    return embedding_dict
def convert_ascii_string_to_other_alphabet(
        ascii_char_string,
        # Default to CJK Unicode Block
        unicode_range=BLOCK_CHINESE,
        # If the characters come from a hexdigest from a hash, we can compress 4 times,
        # otherwise for a random ascii string, we can only compress 2 characters to 1 chinese.
        group_n_char=2):
    uni_len = unicode_range[1] - unicode_range[0] + 1
    r = len(ascii_char_string) % 4
    if r != 0:
        # Append 0's
        ascii_char_string = ascii_char_string + '0' * (4 - r)
        # raise Exception('Hash length ' + str(len(hash_hex_string))
        #                 + ' for "' + str(hash_hex_string) + '" not 0 modulo-4')

    hash_zh = ''
    len_block = int(len(ascii_char_string) / group_n_char)
    for i in range(0, len_block, 1):
        idx_start = group_n_char * i
        idx_end = idx_start + group_n_char
        s = ascii_char_string[idx_start:idx_end]
        # Convert to Chinese, Korean, etc
        if group_n_char == 2:
            ord_arr = np.array([ord(x) for x in s])
            val = ord_arr * np.array(
                [2 ** (8 * (x - 1)) for x in range(len(ord_arr), 0, -1)])
            val = np.sum(val)
            Log.debug('Index start=' + str(idx_start) + ', end=' + str(idx_end)
                      + ', s=' + str(s) + ', ordinal=' + str(ord_arr)
                      + ', val=' + str(hex(val)))
            cjk_unicode = (val % uni_len) + unicode_range[0]
            hash_zh += chr(cjk_unicode)
        elif group_n_char == 4:
            Log.debug('Index start=' + str(idx_start) + ', end=' + str(idx_end)
                      + ', s=' + str(s))
            n = int('0x' + str(s), 16)
            cjk_unicode = (n % uni_len) + unicode_range[0]
            hash_zh += chr(cjk_unicode)
            Log.debugdebug('From ' + str(idx_start) + ': ' + str(s) + ', n=' + str(n)
                           + ', char=' + str(chr(cjk_unicode)))
    return hash_zh
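# Worked example (illustrative only, values computed by hand): with group_n_char=2,
# the pair 'ab' maps to one code point. ord('a')=97 and ord('b')=98, so
# val = 97*256 + 98 = 24930 (0x6162), and the output character is
# chr((24930 % uni_len) + unicode_range[0]). With group_n_char=4, a hex group such
# as '1f2a' is read directly as int('0x1f2a', 16) = 7978 before the same modulo mapping.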
def __remove_stopwords(
        self,
        word_list
):
    if self.stopwords_list:
        word_list_remove = []
        for w in word_list:
            if w not in self.stopwords_list:
                word_list_remove.append(w)
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Word list "' + str(word_list)
            + '", removed stopwords to "' + str(word_list_remove) + '".'
        )
        return word_list_remove
    else:
        return word_list
def run_unit_test(self):
    res_final = ut.ResultObj(count_ok=0, count_fail=0)

    test_data = [
        # 0: words to compare, 1: expected dist using Damerau-Levenshtein, 2: expected distance using Levenshtein
        (('เงน', 'เงิน'), 1, 1),
        (('ถนอ', 'ถอน'), 1, 2),
        (('ธรรมะ', 'ธรา'), 3, 3),
        # For Lev, 3 edit distance by
        # 1. Deleting 'ธ' to get 'รรมะ'
        # 2. Inserting 'ธ' to get 'รธมะ'
        # 3. Replacing 'ะ' with 'ร' to get 'รธมร'
        (('ธรรมะ', 'รธมร'), 3, 3),
    ]
    test_algos = [EditDistance.EDIT_DIST_ALGO_DAMLEV, EditDistance.EDIT_DIST_ALGO_LEV]

    for use_numpy in [True, False]:
        for i in range(len(test_data)):
            word1, word2 = test_data[i][0]
            expected_dist = [test_data[i][1], test_data[i][2]]
            for j_algo in range(len(test_algos)):
                algo = test_algos[j_algo]
                start = time.time()
                retc = EditDistance(algo=algo).calculate(
                    word_1=word1,
                    word_2=word2,
                    use_np=use_numpy,
                )
                dist = retc.optimal_cost
                end = time.time()
                Log.debug('Calculated distance: ' + str(dist))
                Log.debug("Search took " + str(round(1000 * (end - start), 2)) + 'ms.')
                res_final.update_bool(res_bool=ut.UnitTest.assert_true(
                    observed=dist,
                    expected=expected_dist[j_algo],
                    test_comment='numpy= ' + str(use_numpy) + ', test word ' + str(i)
                                 + ' "' + str(word1) + '" and "' + str(word2) + '"'))
    return res_final
def run_unit_test(self):
    res_final = ut.ResultObj(count_ok=0, count_fail=0)

    s = '니는 먹고 싶어'
    tests_set_1 = [
        [Hash.ALGO_SHA1, '蔮膫圈嫩慁覕邜蹋妡狿'],
        [Hash.ALGO_SHA256, '葶杊閹翔綐僤徼戻髯鼚胦嘭藃诠灑浽'],
        [Hash.ALGO_SHA512, '詐鏙仟墍例嵝烐檦蝡溲薑珇鸦東燢爻纷欜陲囚劚攠菜槑茹輀濯偑袁蓣质簨'],
        [Hash.ALGO_SHA3_256, '厥驹踸鸨揱澯鑢擠鳰僸覑儽悃徵絨控'],
        [Hash.ALGO_SHA3_512, '醜怅僒础衺菼惓隔鮚腋釔晞鏙屜咖龩檵因伖蘦惌灱騾凊纅弪鮾蕏解铦欪臓'],
    ]
    for x in tests_set_1:
        algo = x[0]
        expected = x[1]
        # In Linux command line, echo -n "$s" | shasum -a 1 (or 256,512)
        Log.debug('Using algo "' + str(algo) + '":')
        hstr = Hash.hash(string=s, algo=algo)
        Log.debug('Hash: ' + str(hstr))
        observed = Hash.convert_ascii_string_to_other_alphabet(
            ascii_char_string=hstr,
            # unicode_range = Hash.BLOCK_KOREAN_SYL,
            group_n_char=4)
        res_final.update_bool(
            res_bool=ut.UnitTest.assert_true(
                observed=observed,
                expected=expected,
                test_comment='test string "' + str(hstr) + '" got "' + str(observed) + '"'))

    tests_set_2 = [
        ['abc/ii{}.!&%[][\\+=', '嵢弯敩睽簡琥坝坜礽縰'],
        ['8829amsf)(*&^%^*./', '蘸耹嵭潦眨砦娥娪簯縰'],
    ]
    for x in tests_set_2:
        ascii_string = x[0]
        expected = x[1]
        observed = Hash.convert_ascii_string_to_other_alphabet(
            ascii_char_string=ascii_string)
        res_final.update_bool(
            res_bool=ut.UnitTest.assert_true(
                observed=observed,
                expected=expected,
                test_comment='test string "' + str(ascii_string) + '" got "' + str(observed) + '"'))
    return res_final
def __group_case_endings_by_len(self, endings_list, part_of_speech):
    endings_by_len = {}
    maxlen = 0
    for s in endings_list:
        maxlen = max(maxlen, len(s))
    # Longest to shortest
    for i in range(maxlen, 0, -1):
        endings_by_len[i] = []
    # Put them in the groups
    for s in endings_list:
        endings_by_len[len(s)].append(s)
    Log.debug(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': ' + str(part_of_speech) + ' case endings by length: ' + str(endings_by_len))
    return endings_by_len
def build_tree_roots(self):
    # Find root tree nodes
    self.tree_roots = {}
    for name in self.tree_nodes.keys():
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Checking if ' + str(name) + ' is a tree root...'
        )
        node = self.tree_nodes[name]
        if not node.is_dead_node():
            if node.is_tree_root():
                self.tree_roots[name] = node
                self.tree_roots_depth[name] = self.calculate_tree_depth(node=node)
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Found ' + str(len(self.tree_roots)) + ' tree roots'
    )
    return
def segment_ko_ja(
        self,
        text,
        return_array_of_split_words=False
):
    try:
        if self.lang in [lf.LangFeatures.LANG_JA]:
            words_postags = nagisa.tagging(text)
            txt_sym_tok = words_postags.words
            txt_sym_postags = words_postags.postags
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Japanese segmentation "' + str(txt_sym_tok)
                + '", word & POS tags: ' + str(words_postags)
            )
            if return_array_of_split_words:
                return txt_sym_tok
            else:
                return BasicPreprocessor.get_word_separator(lang=self.lang).join(txt_sym_tok)
        elif self.lang in [lf.LangFeatures.LANG_KO]:
            self.warn_korean()
            words_postags = self.kkma.pos(phrase=text)
            txt_sym_tok = [wp[0] for wp in words_postags]
            txt_sym_postags = [wp[1] for wp in words_postags]
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Korean segmentation "' + str(txt_sym_tok)
                + '", word & POS tags: ' + str(words_postags)
            )
            if return_array_of_split_words:
                return txt_sym_tok
            else:
                return BasicPreprocessor.get_word_separator(lang=self.lang).join(txt_sym_tok)
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': No external library supported for language "' + str(self.lang) + '"'
            )
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error segmenting lang "' + str(self.lang) + '", text "' + str(text) \
                 + '", exception: ' + str(ex)
        Log.error(errmsg)
        raise Exception(errmsg)
def is_higher_level(self, node, supposed_child_node):
    Log.debug(
        '***** check if "' + str(supposed_child_node.name)
        + '" is higher level than "' + str(node.name)
        + '", parents: ' + str(node.parent_names)
    )
    if supposed_child_node.name in node.parent_names:
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Node "' + str(self.name) + '" cannot add "' + str(supposed_child_node.name)
            + '" as child. Node "' + str(supposed_child_node.name)
            + '" is already a higher level parent node to "' + str(self.name) + '"'
        )
        return True
    for par in node.parents:
        if self.is_higher_level(node=par, supposed_child_node=supposed_child_node):
            return True
        else:
            continue
    return False
def get_stats_lang_detect(
        self,
        sentences_list,
        langs_real,
        langs_detected,
):
    correct_count = 0
    total_count = len(langs_real)
    for i in range(total_count):
        lang_det = langs_detected[i]
        lang_real = langs_real[i]
        correct_result = lang_real == lang_det
        if not correct_result:
            Log.debug('Detected "' + str(lang_det) + '" for supposed "' + str(lang_real)
                      + '" sent "' + str(sentences_list[i]) + '"')
        correct_count += 1 * (correct_result)
    correct_pct = round(100 * correct_count / total_count, 2)
    return correct_pct, correct_count, total_count