def process(self, query_dic, src_text, tgt_text):  # , src_pos, tgt_pos
    # Pre-process and apply regex to tm_src
    src_re = self.re_pp[self.src_lang].process(src_text)
    if src_text != src_re:  # Regex was applied on the source
        tgt_re = self.re_pp[self.tgt_lang].process(tgt_text)
        # if query_dic['query_re'] == query_dic['tokenizer']:  # Regex was not applied on the query
        #     if src_re is not None and src_pos is not None:
        #         src_text, src_pos = TMRegexMatch._delete_elements(src_re.split(' '), src_pos.split(' '))
        #     if tgt_re is not None and tgt_pos is not None:
        #         tgt_text, tgt_pos = TMRegexMatch._delete_elements(tgt_re.split(' '), tgt_pos.split(' '))
        #     ter = TMUtilsMatching._ter_score(query_dic['tokenizer'], src_text)
        # else:
        # Transform target into query
        ter = TMUtilsMatching._ter_score(query_dic['query_re'], src_re)
        # Extract pattern (find and replace) values
        src_query_f, src_query_r = TMRegexMatch._extract_find_replace(query_dic['tokenizer'].split(' '), query_dic['query_re'].split(' '))
        tgt_query_f = src_query_f.copy()
        tgt_query_r = src_query_r.copy()
        src_f, src_r = TMRegexMatch._extract_find_replace(src_text.split(' '), src_re.split(' '))
        ter = ter - len(src_f)
        src_text = TMRegexMatch._replace_values(src_query_f, src_query_r, src_re.split(' '), src_f, src_r)
        tgt_f, tgt_r = TMRegexMatch._extract_find_replace(tgt_text.split(' '), tgt_re.split(' '))
        tgt_text = TMRegexMatch._replace_values(tgt_query_f, tgt_query_r, tgt_re.split(' '), tgt_f, tgt_r)
    else:
        ter = TMUtilsMatching._ter_score(query_dic['tokenizer'], src_text)  # Regex was not applied
    return tgt_text, src_text, ter  # , src_pos, tgt_pos
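# Illustrative sketch (not part of the original module): the helpers
# TMRegexMatch._extract_find_replace and TMRegexMatch._replace_values are not shown
# in this section. The functions below are an assumed, simplified version of the idea
# they appear to implement: collect the concrete tokens that the regex pass rewrote
# into placeholders, then inject the query's concrete values back into the
# regex-normalized TM segment. Names, signatures, and behavior are hypothetical.

def extract_find_replace(tokens, tokens_re):
    """Collect (placeholder, concrete value) pairs where the regex pass changed a token.
    Assumes the tokenized text and its regex-processed form are aligned one-to-one."""
    find, replace = [], []
    for tok, tok_re in zip(tokens, tokens_re):
        if tok != tok_re:          # the regex rewrote this token into a placeholder
            find.append(tok_re)    # placeholder, e.g. '<NUM>'
            replace.append(tok)    # concrete value, e.g. '42'
    return find, replace

def substitute_query_values(tm_tokens_re, query_find, query_replace):
    """Rebuild the TM segment, replacing placeholders with the query's concrete values."""
    values = dict(zip(query_find, query_replace))
    return ' '.join(values.get(tok, tok) for tok in tm_tokens_re)

# Toy example: the query's number '42' replaces the TM segment's '<NUM>' placeholder.
q_f, q_r = extract_find_replace('page 42 missing'.split(), 'page <NUM> missing'.split())
print(substitute_query_values('page <NUM> missing'.split(), q_f, q_r))  # -> 'page 42 missing'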
def _match_rank_concordance(self, best_segments):  # , output
    self.timer.start("ter")
    l_ter_score = [TMUtilsMatching._ter_score(self.query, segment[0].source_text) for segment in best_segments]
    self.timer.stop("ter")
    l_best_sort = sorted(zip(best_segments, l_ter_score), key=operator.itemgetter(1), reverse=True)
    return [(segment[0][0], segment[1]) for segment in l_best_sort]
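# Illustrative sketch (not part of the original module): concordance ranking scores
# every candidate against the query and sorts by that score, highest first. The toy
# scorer below stands in for TMUtilsMatching._ter_score, which is assumed to return
# a percentage where higher means more similar.
import operator

def rank_by_score(query, candidates, score_fn):
    """Return (candidate, score) pairs sorted by descending score."""
    scored = [(cand, score_fn(query, cand)) for cand in candidates]
    return sorted(scored, key=operator.itemgetter(1), reverse=True)

def toy_score(query, candidate):
    # Word-overlap percentage as a stand-in similarity measure.
    q, c = set(query.split()), set(candidate.split())
    return 100 * len(q & c) / max(len(q | c), 1)

print(rank_by_score('open the file', ['open a file', 'close the window'], toy_score))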
def execute(self, threshold, l_best_segments, match_process, align_features, concordance):  # , output
    self.timer.start("preprocess")
    query_dic = self._preprocess(self.query, self.src_lang)  # Tokenize, posTag and universal query string
    self.timer.stop("preprocess")
    if concordance:
        return self._match_rank_concordance(l_best_segments)
    else:
        rank_segments = self._match_rank(l_best_segments, threshold)
        trans_segments = []
        # Check if the retrieved segments are a 100% match or apply transformations
        for segment in rank_segments:
            # segment = segment[0]
            if segment.source_text == self.query:  # 100% match --> return match considering domain
                ter = 100
                if self.query.isupper():
                    segment.source_text = segment.source_text.upper()
                if self.query.islower():
                    segment.source_text = segment.source_text.lower()
                # trans_segments.append((segment, ter))
            else:
                # Pre-process source and target
                tgt_text = TMUtilsMatching.pre_process(segment.target_text, self.tgt_lang, 'tokenizer', {})  # Pre-process tm_tgt
                src_text = TMUtilsMatching.pre_process(segment.source_text, self.src_lang, 'tokenizer', {})  # Tokenize tm_src
                if 'regex' in match_process:
                    if query_dic['tokenizer'] == query_dic['query_re']:
                        ter = TMUtilsMatching._ter_score(query_dic['tokenizer'], src_text)  # Regex was not applied on the query
                    else:
                        self.timer.start("_regx_match")
                        tgt_text, src_text, ter = self._regx_match(query_dic, src_text, tgt_text)  # , segment.source_pos, segment.target_pos
                        self.timer.stop("_regx_match")
                        logging.info("Applied Regex Segment: {} {} {}".format(tgt_text, src_text, str(ter)))
                else:
                    ter = TMUtilsMatching._ter_score(query_dic['tokenizer'], src_text)  # Regex was not requested as a match process
                if ter < threshold:
                    logging.info("TER below threshold: {} ".format(str(ter)))
                    continue
                if 'posTag' in match_process and ter != 100:  # Check segments with only one difference
                    if segment.source_pos is not None and segment.target_pos is not None:  # This part needs the pos tagger annotation
                        self.timer.start("fuzzy_match")
                        # target_word (to D, R, or I), target_position, operation (R, I or D), src_un_match (sometimes holds source or query information)
                        tgt_word, tgt_position, operation, src_un_match, src_position = self._combine_feature_match(query_dic, tgt_text, src_text, segment.source_pos, segment.target_pos, align_features)
                        logging.info("Un_match: {} {} ".format(tgt_word, operation))
                        if src_un_match is not None:
                            src_text = self._create_target_expression(src_text, src_position, operation, src_un_match, 'source')  # src_un_match,
                            # src_text = src_text.split(' ')
                            # if operation == 'R':
                            #     src_text[int(src_position.split(' _ ')[1])] = tgt_word
                            # if operation == 'I':
                            #     new_src_text = src_text[:int(src_position)] + [src_un_match] + src_text[int(src_position):]
                            #     # new_src_text.append(src_un_match)
                            #     # new_src_text = new_src_text + src_text[int(src_position):]
                            #     src_text = new_src_text
                            # if operation == 'D':
                            #     src_text.pop(int(src_position))
                            # src_text = ' '.join(src_text)
                        if tgt_word is not None:
                            tgt_text = self._create_target_expression(tgt_text, tgt_position, operation, src_un_match, 'target')  # tgt_word,
                        self.timer.stop("fuzzy_match")
                segment.source_text = TMUtilsMatching.pre_process(src_text.split(' '), self.src_lang, 'untokenizer', {})
                segment.target_text = TMUtilsMatching.pre_process(tgt_text.split(' '), self.tgt_lang, 'untokenizer', {})
                logging.info("Target segment: {}".format(segment.target_text))
                if self.query.isupper():
                    segment.source_text = segment.source_text.upper()
                    segment.target_text = segment.target_text.upper()
                if self.query.islower():
                    segment.source_text = segment.source_text.lower()
                    segment.target_text = segment.target_text.lower()
            trans_segments.append((segment, ter))
        return trans_segments
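# Illustrative sketch (not part of the original module): execute() preserves the
# query's casing by upper- or lower-casing the returned source and target when the
# query is all upper case or all lower case. A hypothetical standalone helper:

def match_query_case(query, text):
    """Return text upper- or lower-cased to mirror an all-caps or all-lowercase query."""
    if query.isupper():
        return text.upper()
    if query.islower():
        return text.lower()
    return text  # mixed-case queries leave the segment untouched

print(match_query_case('HELLO WORLD', 'Bonjour le monde'))  # -> 'BONJOUR LE MONDE'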