def _improve_match(self, query_info, operation): query_word = query_info.split(' _ ') if operation == 'R': #Estimate editD between the words return (TMUtilsMatching._edit_distance(query_word[0], query_word[1]) / 2) else: return (len(query_word[0]) / 2) # EditD is equal ao total de characters add or delete from the string
def _match_rank(self, best_segments, threshold):
    """Rank candidate TM segments against the current query.

    Scores every candidate with the edit-distance metric, sorts them from
    best to worst, and keeps those whose score is at least ``threshold - 10``.

    :param best_segments: candidate tuples whose first element exposes
                          ``source_text`` (segment object)
    :param threshold: minimum acceptable score (a 10-point slack is applied;
                      NOTE(review): the original comment said "ter > threshold"
                      but the code has always used ``>= threshold - 10``)
    :return: list of accepted segment objects, best first
    """
    matched = []
    self.timer.start("ter")
    scores = [TMUtilsMatching._edit_distance(self.query, candidate[0].source_text)
              for candidate in best_segments]
    self.timer.stop("ter")
    ranked = sorted(zip(best_segments, scores), key=operator.itemgetter(1), reverse=True)
    for segment, ter in ranked:
        if ter < threshold - 10:
            break  # scores are descending, so no later candidate can qualify
        matched.append(segment[0])
    return matched
def _tm_edit_distance(self, q_text, s_text, q_simplified, s_simplified):
    """Compute a fuzzy-match score between a query and a TM source segment.

    The score blends four normalized difference components:
      1. character edit distance over words only (70% weight);
      2. edit distance over the symbol/punctuation "simplified" sequences;
      3. fraction of query words absent from the source;
      4. stop-word sequence edit distance (only when stop words exist --
         otherwise its weight is redistributed to components 2 and 3).

    :param q_text: original query text
    :param s_text: TM source segment text
    :param q_simplified: simplified (tag/number-reduced) query text
    :param s_simplified: simplified (tag/number-reduced) source text
    :return: integer score; higher means more similar (roughly 0-100)
    """
    # Corner case - matching an artificial empty segment -> give minimal score
    if q_text and not s_text.strip():
        return 1
    # 1) ********** Obtain word and stop-word sequences
    q_onlyW, q_st_word = TMMatching._only_word_sequence(q_text, self.src_lang)
    s_onlyW, s_st_word = TMMatching._only_word_sequence(s_text, self.src_lang)
    if not q_onlyW and not q_st_word:
        # Query contains no words at all (e.g. symbols only, or a language
        # such as zh where words are not extracted): fall back to a plain
        # edit distance on the raw texts.
        editD = 100 - (TMUtilsMatching._edit_distance(q_text, s_text))
    else:
        # Normal edit distance on words only: no punctuation, no stop words
        nchar_diff = TMUtilsMatching._edit_distance(' '.join(q_onlyW), ' '.join(s_onlyW))
        nchar_len = len(' '.join(q_onlyW)) + len(' '.join(s_onlyW))
        if nchar_len == 0:
            nchar_len = 1  # avoid division by zero
        char_diff = (2 * nchar_diff) / nchar_len  # normalized by total characters

        # 2) ********* Simplified --> words collapsed to a letter, punctuation kept
        # Ex. '- 3.67 housing units constructed under the $ # home % ownership
        #      saving scheme in the Hanano/ and (Hamdaniya districts;'
        #     --> '- N N N N N N $ # N % N N N N N N/ N (N N;'
        q_replaceW, q_onlyS = TMMatching._symbol_sequence(q_simplified)  # original query
        s_replaceW, s_onlyS = TMMatching._symbol_sequence(s_simplified)  # original tm_src
        if len(s_onlyS) == 0 and len(q_onlyS) == 0:
            n_symbol_diff = 0  # neither side carries any symbol
        else:
            n_symbol_diff = TMUtilsMatching._edit_distance(q_replaceW, s_replaceW)
        # BUG FIX: the original summed len(q_replaceW.split(' ')) twice,
        # normalizing by the query alone; use BOTH sequences' token counts,
        # matching the q+s intent of the other normalizers above and below.
        len_symbols = len(q_replaceW.split(' ')) + len(s_replaceW.split(' '))
        if len_symbols == 0:
            len_symbols = 1
        symbol_diff = (2 * n_symbol_diff) / len_symbols

        # 3) ********* Fraction of query words that do not appear in the source
        nword_diff = set(q_onlyW).difference(s_onlyW)
        onlyW_len = len(q_onlyW)
        if onlyW_len == 0:
            onlyW_len = 1
        word_diff = (len(nword_diff)) / onlyW_len  # only query words

        # 4) ********* Stop words (skipped when neither side has any, e.g.
        # this language has no stop-word list)
        stop_words = not (len(q_st_word) == 0 and len(s_st_word) == 0)
        if stop_words:
            n_st_diff = TMUtilsMatching._edit_distance(' '.join(q_st_word), ' '.join(s_st_word))
            len_stop_word = len(' '.join(q_st_word)) + len(' '.join(s_st_word))
            stop_word_diff = (2 * n_st_diff) / len_stop_word
            editD = (1 - ((0.70 * char_diff) + (0.10 * word_diff) + (0.10 * symbol_diff) + (0.10 * stop_word_diff))) * 100
        else:
            editD = (1 - ((0.70 * char_diff) + (0.15 * word_diff) + (0.15 * symbol_diff))) * 100
    if editD < 0:
        editD = 10  # clamp degenerate scores to a small positive floor
    return int(math.floor(editD))