def _align_source_target(self, un_match, un_pos, position, tgt_text, tgt_pos, align_features):
    # NOTE: this definition is shadowed by the redefinition of the same method below;
    # Python keeps only the last one.
    tgt_dic = {}  # similarity score for each (target word, position) pair
    tgt_word_pos = TMUtilsMatching.segment_2_universal(tgt_text.lower(), tgt_pos, self.tgt_lang)  # list of [word, pos] pairs
    for i in range(0, len(tgt_word_pos)):
        value_similarity = 0
        for f in align_features:
            if f == 'word_ter':  # TER between words
                value_similarity = value_similarity + TMUtilsMatching.ter_distance(un_match, tgt_word_pos[i][0])
            if f == 'posTag':  # Boolean PosTag match
                value_similarity = value_similarity + TMUtilsMatching.pos_bool(un_pos, tgt_word_pos[i][1])
            if f == 'position':  # Word position
                value_similarity = value_similarity + TMUtilsMatching.position_distance(position, i)
            # if f == 'frequency':  # frequency of pairs of words
            #     value_similarity = value_similarity + self.target_importance(un_word, tgt_word_pos[i][0], segment, best_segments)
        # Dictionary keyed by the target word and its position in the target sentence
        tgt_dic[(tgt_word_pos[i][0], i)] = value_similarity
    tgt_align = sorted(tgt_dic.items(), key=lambda item: item[1], reverse=True)[0]  # Select the highest score
    return tgt_align[0][0], tgt_align[0][1]  # Return the target word with the highest score and its position
def _align_source_target(self, un_match, un_pos, position, tgt_word_pos, align_features):  # tgt_text, tgt_pos
    related_words = []
    tgt_dic = {}  # similarity score for each (target word, position) pair
    # Keep only target words whose PoS tag matches the unmatched word or is an open class (VERB, NOUN, ADJ)
    equal_posTag = [[position_tgt, word, pos] for position_tgt, [word, pos] in list(enumerate(tgt_word_pos))
                    if pos == un_pos.strip(' ') or pos == 'VERB' or pos == 'NOUN' or pos == 'ADJ']
    # print('*************')
    # print(equal_posTag)
    if not equal_posTag:
        return None, None
    else:
        if 'glossary' in align_features:
            related_words = self.search_exact_value(un_match, 10)
        for i in range(0, len(equal_posTag)):
            value_similarity = 0
            for f in align_features:
                if f == 'word_ter':  # TER between words
                    value_similarity = value_similarity + (0.25 * TMUtilsMatching.un_match_distance(un_match, equal_posTag[i][1]))
                if f == 'posTag':  # Boolean PosTag match
                    value_similarity = value_similarity + (0.25 * TMUtilsMatching.pos_bool(un_pos, equal_posTag[i][2]))
                if f == 'position':  # Word position
                    value_similarity = value_similarity + (0.25 * TMUtilsMatching.position_distance(position, equal_posTag[i][0]))
                if f == 'glossary':  # search the word on elasticTM
                    is_related = 1 if equal_posTag[i][1] in related_words else 0
                    value_similarity = value_similarity + (0.25 * is_related)
                    # target_importance(un_word, tgt_word_pos[i][0], segment, best_segments)
            # Dictionary keyed by the target word and its position in the target sentence
            tgt_dic[(equal_posTag[i][1], equal_posTag[i][0])] = value_similarity
        tgt_align = sorted(tgt_dic.items(), key=lambda item: item[1], reverse=True)[0]  # Select the highest score
        # print(sorted(tgt_dic.items(), key=lambda item: item[1], reverse=True))  # debug output
        if tgt_align[1] > G_CONFIG.get_src_tgt_threshold():
            return tgt_align[0][0], tgt_align[0][1]
        else:
            return None, None
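
# --- Illustrative sketch (hypothetical, not part of the class) ---------------
# The method above scores every PoS-compatible target word with an equally
# weighted (0.25 each) sum of feature similarities and keeps the best-scoring
# candidate only if it clears a threshold. The helpers below are simplified
# stand-ins for TMUtilsMatching and the glossary lookup; they are assumptions
# made only to show the scoring scheme end to end.
def _demo_align_source_target(un_match, un_pos, position, tgt_word_pos, threshold=0.5):
    def word_similarity(a, b):      # stand-in for TMUtilsMatching.un_match_distance
        return 1.0 if a == b else 0.0

    def pos_match(a, b):            # stand-in for TMUtilsMatching.pos_bool
        return 1.0 if a == b else 0.0

    def position_closeness(i, j):   # stand-in for TMUtilsMatching.position_distance
        return 1.0 / (1.0 + abs(i - j))

    scores = {}
    for i, (word, pos) in enumerate(tgt_word_pos):
        if pos not in (un_pos, 'VERB', 'NOUN', 'ADJ'):  # same PoS filter as above
            continue
        scores[(word, i)] = (0.25 * word_similarity(un_match, word)
                             + 0.25 * pos_match(un_pos, pos)
                             + 0.25 * position_closeness(position, i))
    if not scores:
        return None, None
    (word, i), best = max(scores.items(), key=lambda item: item[1])
    return (word, i) if best > threshold else (None, None)

# Example: align the unmatched source word 'house' (NOUN, source position 2)
# against a PoS-tagged target segment; expected result: ('house', 2).
# print(_demo_align_source_target('house', 'NOUN', 2,
#                                 [('the', 'DET'), ('big', 'ADJ'), ('house', 'NOUN')]))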